{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 565, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 71.78125, "completions/mean_terminated_length": 71.78125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.43730998039245605, "epoch": 0.0017699115044247787, "frac_reward_zero_std": 0.75, "grad_norm": 14.733362336222088, "kl": 0.0, "learning_rate": 0.0, "loss": 0.2291, "num_tokens": 14258.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.748260736465454, "sampling/importance_sampling_ratio/mean": 1.0003552436828613, "sampling/importance_sampling_ratio/min": 0.3887093663215637, "sampling/sampling_logp_difference/max": 0.9449234008789062, "sampling/sampling_logp_difference/mean": 0.020397081971168518, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 61.546875, "completions/mean_terminated_length": 61.546875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2651497721672058, "epoch": 0.0035398230088495575, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.4247787610619464e-09, "loss": 0.0, "num_tokens": 28357.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2627105712890625, "sampling/importance_sampling_ratio/mean": 0.9985069036483765, "sampling/importance_sampling_ratio/min": 0.6752722263336182, "sampling/sampling_logp_difference/max": 0.3926393985748291, "sampling/sampling_logp_difference/mean": 0.01767190359532833, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2806015908718109, "epoch": 0.005309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 23.26227259495331, "kl": 0.0005717705935239792, "learning_rate": 8.849557522123893e-09, "loss": 0.1264, "num_tokens": 44085.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994414448738098, "sampling/importance_sampling_ratio/min": 0.6796828508377075, "sampling/sampling_logp_difference/max": 1.3504528999328613, "sampling/sampling_logp_difference/mean": 0.020524393767118454, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 35.875, "completions/mean_terminated_length": 35.875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.23058325052261353, "epoch": 0.007079646017699115, "frac_reward_zero_std": 0.5, "grad_norm": 10.797167113972659, "kl": 0.0005771875730715692, "learning_rate": 1.327433628318584e-08, "loss": -0.0717, "num_tokens": 58621.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3210742473602295, "sampling/importance_sampling_ratio/mean": 1.002717137336731, "sampling/importance_sampling_ratio/min": 0.5016065239906311, "sampling/sampling_logp_difference/max": 0.6899392604827881, "sampling/sampling_logp_difference/mean": 0.01839931309223175, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 89.03125, "completions/mean_terminated_length": 89.03125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4322381019592285, "epoch": 0.008849557522123894, "frac_reward_zero_std": 0.5, "grad_norm": 6.9613059543793785, "kl": 0.0008884783601388335, "learning_rate": 1.7699115044247786e-08, "loss": 0.0106, "num_tokens": 75759.0, "reward": 0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5849732160568237, "sampling/importance_sampling_ratio/mean": 0.9992538690567017, "sampling/importance_sampling_ratio/min": 0.6169490814208984, "sampling/sampling_logp_difference/max": 0.482968807220459, "sampling/sampling_logp_difference/mean": 0.01715037226676941, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 91.265625, "completions/mean_terminated_length": 91.265625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.45085543394088745, "epoch": 0.010619469026548672, "frac_reward_zero_std": 1.0, "grad_norm": 0.013659497090174135, "kl": 0.00046727299923077226, "learning_rate": 2.2123893805309735e-08, "loss": 0.0, "num_tokens": 92496.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3823795318603516, "sampling/importance_sampling_ratio/mean": 0.9996471405029297, "sampling/importance_sampling_ratio/min": 0.6376644372940063, "sampling/sampling_logp_difference/max": 0.44994306564331055, "sampling/sampling_logp_difference/mean": 0.01948455721139908, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 91.75, "completions/mean_terminated_length": 91.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5193246006965637, "epoch": 0.012389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 7.340318994776948, "kl": 0.0008482749690301716, "learning_rate": 2.654867256637168e-08, "loss": 0.1265, "num_tokens": 112880.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.9475336074829102, "sampling/importance_sampling_ratio/mean": 1.0007236003875732, "sampling/importance_sampling_ratio/min": 0.4580051898956299, "sampling/sampling_logp_difference/max": 0.7808747291564941, "sampling/sampling_logp_difference/mean": 0.023239202797412872, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 36.015625, "completions/mean_terminated_length": 36.015625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.20200984179973602, "epoch": 0.01415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 4.4503832105678285, "kl": 0.0009719593799673021, "learning_rate": 3.0973451327433626e-08, "loss": 0.0025, "num_tokens": 126161.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3440406322479248, "sampling/importance_sampling_ratio/mean": 1.001471757888794, "sampling/importance_sampling_ratio/min": 0.7208306193351746, "sampling/sampling_logp_difference/max": 0.32735109329223633, "sampling/sampling_logp_difference/mean": 0.012839527800679207, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 117.671875, "completions/mean_terminated_length": 117.671875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.5152678489685059, "epoch": 0.01592920353982301, "frac_reward_zero_std": 1.0, "grad_norm": 0.00915880506969509, "kl": 0.0004841999616473913, "learning_rate": 3.539823008849557e-08, "loss": 0.0, "num_tokens": 143500.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999586820602417, "sampling/importance_sampling_ratio/min": 0.6153971552848816, "sampling/sampling_logp_difference/max": 0.7832646369934082, "sampling/sampling_logp_difference/mean": 0.018428195267915726, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 67.609375, "completions/mean_terminated_length": 67.609375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2896856963634491, "epoch": 0.017699115044247787, "frac_reward_zero_std": 1.0, "grad_norm": 0.022535488141927375, "kl": 0.0009633470908738673, "learning_rate": 3.982300884955752e-08, "loss": 0.0, "num_tokens": 159507.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4800368547439575, "sampling/importance_sampling_ratio/mean": 0.9995597004890442, "sampling/importance_sampling_ratio/min": 0.7342386245727539, "sampling/sampling_logp_difference/max": 0.39206695556640625, "sampling/sampling_logp_difference/mean": 0.012184200808405876, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 65.53125, "completions/mean_terminated_length": 65.53125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.19404293596744537, "epoch": 0.019469026548672566, "frac_reward_zero_std": 0.25, "grad_norm": 6.824992842595387, "kl": 0.0015200059860944748, "learning_rate": 4.424778761061947e-08, "loss": -0.0771, "num_tokens": 173685.0, "reward": 0.90625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4212366342544556, "sampling/importance_sampling_ratio/mean": 1.0023908615112305, "sampling/importance_sampling_ratio/min": 0.6726571917533875, "sampling/sampling_logp_difference/max": 0.3965195417404175, "sampling/sampling_logp_difference/mean": 0.014005091972649097, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 100.984375, "completions/mean_terminated_length": 100.984375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.365769624710083, "epoch": 0.021238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.014960286603652508, "kl": 0.0005460747634060681, "learning_rate": 4.8672566371681415e-08, "loss": 0.0, "num_tokens": 190452.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997556209564209, "sampling/importance_sampling_ratio/min": 0.4474831819534302, "sampling/sampling_logp_difference/max": 0.8911995887756348, "sampling/sampling_logp_difference/mean": 0.013871857896447182, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 106.375, "completions/mean_terminated_length": 106.375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.303292840719223, "epoch": 0.023008849557522124, "frac_reward_zero_std": 0.75, "grad_norm": 3.5792693910632183, "kl": 0.0007246644818224013, "learning_rate": 5.309734513274336e-08, "loss": -0.1241, "num_tokens": 208620.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6080149412155151, "sampling/importance_sampling_ratio/mean": 1.0023037195205688, "sampling/importance_sampling_ratio/min": 0.5335606932640076, "sampling/sampling_logp_difference/max": 0.6281824111938477, "sampling/sampling_logp_difference/mean": 0.017998535186052322, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 34.828125, "completions/mean_terminated_length": 34.828125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.17239561676979065, "epoch": 0.024778761061946902, "frac_reward_zero_std": 1.0, "grad_norm": 0.014602460582552898, "kl": 0.000353145704139024, "learning_rate": 5.7522123893805306e-08, "loss": 0.0, "num_tokens": 224305.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5867887735366821, "sampling/importance_sampling_ratio/mean": 1.0006211996078491, "sampling/importance_sampling_ratio/min": 0.39414942264556885, "sampling/sampling_logp_difference/max": 0.9310252666473389, "sampling/sampling_logp_difference/mean": 0.013991497457027435, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 86.9375, "completions/mean_terminated_length": 86.9375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3107295036315918, "epoch": 0.02654867256637168, "frac_reward_zero_std": 1.0, "grad_norm": 0.007229437086416403, "kl": 0.0005225496715866029, "learning_rate": 6.194690265486725e-08, "loss": 0.0, "num_tokens": 243069.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4709138870239258, "sampling/importance_sampling_ratio/mean": 0.9995893239974976, "sampling/importance_sampling_ratio/min": 0.65098637342453, "sampling/sampling_logp_difference/max": 0.4292665719985962, "sampling/sampling_logp_difference/mean": 0.01991843990981579, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 46.296875, "completions/mean_terminated_length": 46.296875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.32953619956970215, "epoch": 0.02831858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.02767166983547206, "kl": 0.00113023747690022, "learning_rate": 6.63716814159292e-08, "loss": 0.0, "num_tokens": 257248.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6294353008270264, "sampling/importance_sampling_ratio/mean": 0.9990890026092529, "sampling/importance_sampling_ratio/min": 0.6623630523681641, "sampling/sampling_logp_difference/max": 0.4882335662841797, "sampling/sampling_logp_difference/mean": 0.0199204720556736, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 44.796875, "completions/mean_terminated_length": 44.796875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.17392346262931824, "epoch": 0.03008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.01600255624211106, "kl": 0.0004199473187327385, "learning_rate": 7.079646017699114e-08, "loss": 0.0, "num_tokens": 270659.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.328696370124817, "sampling/importance_sampling_ratio/mean": 0.9982379674911499, "sampling/importance_sampling_ratio/min": 0.6624902486801147, "sampling/sampling_logp_difference/max": 0.4117494821548462, "sampling/sampling_logp_difference/mean": 0.01237030141055584, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 28.84375, "completions/mean_terminated_length": 28.84375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.159395232796669, "epoch": 0.03185840707964602, "frac_reward_zero_std": 0.75, "grad_norm": 4.513435214167038, "kl": 0.0023553031496703625, "learning_rate": 7.52212389380531e-08, "loss": -0.0075, "num_tokens": 282889.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9696966409683228, "sampling/importance_sampling_ratio/mean": 0.9974421262741089, "sampling/importance_sampling_ratio/min": 0.5129315257072449, "sampling/sampling_logp_difference/max": 0.6778795719146729, "sampling/sampling_logp_difference/mean": 0.022663142532110214, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 73.859375, "completions/mean_terminated_length": 73.859375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2566125690937042, "epoch": 0.033628318584070796, "frac_reward_zero_std": 1.0, "grad_norm": 0.01530068562910834, "kl": 0.0003858095151372254, "learning_rate": 7.964601769911503e-08, "loss": 0.0, "num_tokens": 299264.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.795694351196289, "sampling/importance_sampling_ratio/mean": 1.0008893013000488, "sampling/importance_sampling_ratio/min": 0.5062554478645325, "sampling/sampling_logp_difference/max": 0.6807138919830322, "sampling/sampling_logp_difference/mean": 0.014813372865319252, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 89.984375, "completions/mean_terminated_length": 89.984375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4346693754196167, "epoch": 0.035398230088495575, "frac_reward_zero_std": 0.5, "grad_norm": 5.7359841983795175, "kl": 0.0004782738978974521, "learning_rate": 8.4070796460177e-08, "loss": 0.0061, "num_tokens": 314959.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.620740532875061, "sampling/importance_sampling_ratio/mean": 0.9991461038589478, "sampling/importance_sampling_ratio/min": 0.5147078633308411, "sampling/sampling_logp_difference/max": 0.6641558408737183, "sampling/sampling_logp_difference/mean": 0.01632934808731079, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 69.453125, "completions/mean_terminated_length": 69.453125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3627976179122925, "epoch": 0.03716814159292035, "frac_reward_zero_std": 0.75, "grad_norm": 4.59221061699448, "kl": 0.0004590965108945966, "learning_rate": 8.849557522123894e-08, "loss": 0.0508, "num_tokens": 329756.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.8821483850479126, "sampling/importance_sampling_ratio/mean": 1.0001276731491089, "sampling/importance_sampling_ratio/min": 0.4833468794822693, "sampling/sampling_logp_difference/max": 0.7270207405090332, "sampling/sampling_logp_difference/mean": 0.02273380383849144, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 39.90625, "completions/mean_terminated_length": 39.90625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.24342747032642365, "epoch": 0.03893805309734513, "frac_reward_zero_std": 0.75, "grad_norm": 3.7038806022357496, "kl": 0.0023908826988190413, "learning_rate": 9.292035398230089e-08, "loss": 0.0698, "num_tokens": 344998.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.872116208076477, "sampling/importance_sampling_ratio/mean": 0.9997944235801697, "sampling/importance_sampling_ratio/min": 0.19888915121555328, "sampling/sampling_logp_difference/max": 1.6150076389312744, "sampling/sampling_logp_difference/mean": 0.019733227789402008, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.16345232725143433, "epoch": 0.04070796460176991, "frac_reward_zero_std": 1.0, "grad_norm": 0.04997415895244329, "kl": 0.0014484870480373502, "learning_rate": 9.734513274336283e-08, "loss": 0.0, "num_tokens": 357894.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5752054452896118, "sampling/importance_sampling_ratio/mean": 0.999841570854187, "sampling/importance_sampling_ratio/min": 0.6862316131591797, "sampling/sampling_logp_difference/max": 0.45438575744628906, "sampling/sampling_logp_difference/mean": 0.019694484770298004, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 52.890625, "completions/mean_terminated_length": 52.890625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.37000778317451477, "epoch": 0.04247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 8.338317286703484, "kl": 0.000826447329018265, "learning_rate": 1.0176991150442478e-07, "loss": -0.1221, "num_tokens": 372159.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006983280181885, "sampling/importance_sampling_ratio/min": 0.5474603176116943, "sampling/sampling_logp_difference/max": 0.7353544235229492, "sampling/sampling_logp_difference/mean": 0.018441716209053993, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 65.078125, "completions/mean_terminated_length": 65.078125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.33777666091918945, "epoch": 0.04424778761061947, "frac_reward_zero_std": 0.75, "grad_norm": 11.659760033795731, "kl": 0.0006184541853144765, "learning_rate": 1.0619469026548672e-07, "loss": -0.1307, "num_tokens": 388756.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999577522277832, "sampling/importance_sampling_ratio/min": 0.46445992588996887, "sampling/sampling_logp_difference/max": 1.3790621757507324, "sampling/sampling_logp_difference/mean": 0.019229542464017868, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 69.046875, "completions/mean_terminated_length": 69.046875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.25738662481307983, "epoch": 0.04601769911504425, "frac_reward_zero_std": 0.75, "grad_norm": 9.487071516705344, "kl": 0.0003331995103508234, "learning_rate": 1.1061946902654867e-07, "loss": -0.2253, "num_tokens": 404391.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.2855082750320435, "sampling/importance_sampling_ratio/mean": 0.9986451864242554, "sampling/importance_sampling_ratio/min": 0.6865718364715576, "sampling/sampling_logp_difference/max": 0.3760443925857544, "sampling/sampling_logp_difference/mean": 0.018185608088970184, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 78.109375, "completions/mean_terminated_length": 78.109375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.44814449548721313, "epoch": 0.047787610619469026, "frac_reward_zero_std": 0.75, "grad_norm": 7.756923111271524, "kl": 0.00046124885557219386, "learning_rate": 1.1504424778761061e-07, "loss": -0.1409, "num_tokens": 419470.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4353742599487305, "sampling/importance_sampling_ratio/mean": 0.9993364214897156, "sampling/importance_sampling_ratio/min": 0.5488451719284058, "sampling/sampling_logp_difference/max": 0.5999388694763184, "sampling/sampling_logp_difference/mean": 0.018295522779226303, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 76.953125, "completions/mean_terminated_length": 76.953125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.31486934423446655, "epoch": 0.049557522123893805, "frac_reward_zero_std": 1.0, "grad_norm": 0.009386442378514733, "kl": 0.00041278538992628455, "learning_rate": 1.1946902654867256e-07, "loss": 0.0, "num_tokens": 435595.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5819377899169922, "sampling/importance_sampling_ratio/mean": 1.0006744861602783, "sampling/importance_sampling_ratio/min": 0.4626096785068512, "sampling/sampling_logp_difference/max": 0.770871639251709, "sampling/sampling_logp_difference/mean": 0.01879153959453106, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 98.671875, "completions/mean_terminated_length": 98.671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4051646590232849, "epoch": 0.05132743362831858, "frac_reward_zero_std": 1.0, "grad_norm": 0.014069863796196987, "kl": 0.0004532830498646945, "learning_rate": 1.238938053097345e-07, "loss": 0.0, "num_tokens": 452342.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7859727144241333, "sampling/importance_sampling_ratio/mean": 0.9995170831680298, "sampling/importance_sampling_ratio/min": 0.5645666718482971, "sampling/sampling_logp_difference/max": 0.579963207244873, "sampling/sampling_logp_difference/mean": 0.017181504517793655, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 41.71875, "completions/mean_terminated_length": 41.71875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.22401276230812073, "epoch": 0.05309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 6.61094425527448, "kl": 0.00048795485054142773, "learning_rate": 1.2831858407079647e-07, "loss": 0.0822, "num_tokens": 467092.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5831127166748047, "sampling/importance_sampling_ratio/mean": 0.9997624158859253, "sampling/importance_sampling_ratio/min": 0.5481081604957581, "sampling/sampling_logp_difference/max": 0.6012825965881348, "sampling/sampling_logp_difference/mean": 0.013244973495602608, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.71875, "completions/mean_terminated_length": 15.71875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.07598844170570374, "epoch": 0.05486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 9.874575477437448, "kl": 0.00039827151340432465, "learning_rate": 1.327433628318584e-07, "loss": -0.0032, "num_tokens": 480098.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4121800661087036, "sampling/importance_sampling_ratio/mean": 1.0020051002502441, "sampling/importance_sampling_ratio/min": 0.5527469515800476, "sampling/sampling_logp_difference/max": 0.5928549766540527, "sampling/sampling_logp_difference/mean": 0.013948867097496986, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 39.421875, "completions/mean_terminated_length": 39.421875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1963411271572113, "epoch": 0.05663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 4.856088448360933, "kl": 0.0015413790242746472, "learning_rate": 1.3716814159292035e-07, "loss": -0.0009, "num_tokens": 494525.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.797372579574585, "sampling/importance_sampling_ratio/mean": 1.001387596130371, "sampling/importance_sampling_ratio/min": 0.6285156011581421, "sampling/sampling_logp_difference/max": 0.5863258838653564, "sampling/sampling_logp_difference/mean": 0.01898869127035141, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 89.921875, "completions/mean_terminated_length": 89.921875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.40926527976989746, "epoch": 0.0584070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 8.86902251988677, "kl": 0.0004499046190176159, "learning_rate": 1.4159292035398229e-07, "loss": -0.3134, "num_tokens": 511432.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.9430650472640991, "sampling/importance_sampling_ratio/mean": 1.0010493993759155, "sampling/importance_sampling_ratio/min": 0.62600177526474, "sampling/sampling_logp_difference/max": 0.6642667055130005, "sampling/sampling_logp_difference/mean": 0.016596786677837372, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 56.71875, "completions/mean_terminated_length": 56.71875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2863810658454895, "epoch": 0.06017699115044248, "frac_reward_zero_std": 1.0, "grad_norm": 0.021468556262928137, "kl": 0.0007147843716666102, "learning_rate": 1.4601769911504425e-07, "loss": 0.0, "num_tokens": 526534.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8784087896347046, "sampling/importance_sampling_ratio/mean": 0.9997020363807678, "sampling/importance_sampling_ratio/min": 0.6882390379905701, "sampling/sampling_logp_difference/max": 0.630424976348877, "sampling/sampling_logp_difference/mean": 0.017147017642855644, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 85.953125, "completions/mean_terminated_length": 85.953125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4597724378108978, "epoch": 0.061946902654867256, "frac_reward_zero_std": 0.5, "grad_norm": 3.7681090870036456, "kl": 0.0005873649497516453, "learning_rate": 1.504424778761062e-07, "loss": 0.0506, "num_tokens": 545475.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5615980625152588, "sampling/importance_sampling_ratio/mean": 0.9998810291290283, "sampling/importance_sampling_ratio/min": 0.6170110106468201, "sampling/sampling_logp_difference/max": 0.4828684329986572, "sampling/sampling_logp_difference/mean": 0.016421593725681305, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 99.140625, "completions/mean_terminated_length": 99.140625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.5353041887283325, "epoch": 0.06371681415929203, "frac_reward_zero_std": 0.75, "grad_norm": 1.6580441755876607, "kl": 0.0004714885726571083, "learning_rate": 1.5486725663716813e-07, "loss": 0.004, "num_tokens": 561196.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4393292665481567, "sampling/importance_sampling_ratio/mean": 0.9998955726623535, "sampling/importance_sampling_ratio/min": 0.6063184142112732, "sampling/sampling_logp_difference/max": 0.5003499984741211, "sampling/sampling_logp_difference/mean": 0.01866915076971054, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 52.53125, "completions/mean_terminated_length": 52.53125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3300973176956177, "epoch": 0.06548672566371681, "frac_reward_zero_std": 0.5, "grad_norm": 20.36985643572742, "kl": 0.0006205074023455381, "learning_rate": 1.5929203539823007e-07, "loss": 0.4553, "num_tokens": 579022.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995156526565552, "sampling/importance_sampling_ratio/min": 0.47040438652038574, "sampling/sampling_logp_difference/max": 1.002622365951538, "sampling/sampling_logp_difference/mean": 0.021546784788370132, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 81.078125, "completions/mean_terminated_length": 81.078125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3800387382507324, "epoch": 0.06725663716814159, "frac_reward_zero_std": 0.75, "grad_norm": 10.339012444363068, "kl": 0.0006738771917298436, "learning_rate": 1.6371681415929203e-07, "loss": 0.1046, "num_tokens": 594019.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005218982696533, "sampling/importance_sampling_ratio/min": 0.5071792602539062, "sampling/sampling_logp_difference/max": 0.8020846843719482, "sampling/sampling_logp_difference/mean": 0.01670372113585472, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 93.484375, "completions/mean_terminated_length": 93.484375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4449535310268402, "epoch": 0.06902654867256637, "frac_reward_zero_std": 0.75, "grad_norm": 4.317212480269803, "kl": 0.0006473226239904761, "learning_rate": 1.68141592920354e-07, "loss": -0.1538, "num_tokens": 609394.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8720818758010864, "sampling/importance_sampling_ratio/mean": 0.9996934533119202, "sampling/importance_sampling_ratio/min": 0.6724444627761841, "sampling/sampling_logp_difference/max": 0.6270511150360107, "sampling/sampling_logp_difference/mean": 0.017083389684557915, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 81.90625, "completions/mean_terminated_length": 81.90625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.33130159974098206, "epoch": 0.07079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 4.038053707633894, "kl": 0.0009376220987178385, "learning_rate": 1.725663716814159e-07, "loss": 0.0787, "num_tokens": 625884.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4165065288543701, "sampling/importance_sampling_ratio/mean": 1.0004032850265503, "sampling/importance_sampling_ratio/min": 0.5999572277069092, "sampling/sampling_logp_difference/max": 0.5108969211578369, "sampling/sampling_logp_difference/mean": 0.019599106162786484, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 47.71875, "completions/mean_terminated_length": 47.71875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1945371925830841, "epoch": 0.07256637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 3.3978063035136974, "kl": 0.0002761489013209939, "learning_rate": 1.7699115044247788e-07, "loss": 0.0003, "num_tokens": 642314.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3843897581100464, "sampling/importance_sampling_ratio/mean": 1.0020694732666016, "sampling/importance_sampling_ratio/min": 0.5751755833625793, "sampling/sampling_logp_difference/max": 0.5530799627304077, "sampling/sampling_logp_difference/mean": 0.01604965329170227, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 75.296875, "completions/mean_terminated_length": 75.296875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.35504159331321716, "epoch": 0.0743362831858407, "frac_reward_zero_std": 0.5, "grad_norm": 10.922965398733, "kl": 0.0010760982986539602, "learning_rate": 1.8141592920353982e-07, "loss": 0.1741, "num_tokens": 658733.0, "reward": 0.25, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5612423419952393, "sampling/importance_sampling_ratio/mean": 0.9998565912246704, "sampling/importance_sampling_ratio/min": 0.6919164061546326, "sampling/sampling_logp_difference/max": 0.44548189640045166, "sampling/sampling_logp_difference/mean": 0.017055584117770195, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 76.390625, "completions/mean_terminated_length": 76.390625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3077981173992157, "epoch": 0.07610619469026549, "frac_reward_zero_std": 0.75, "grad_norm": 8.929794139932175, "kl": 0.000669544271659106, "learning_rate": 1.8584070796460178e-07, "loss": 0.1094, "num_tokens": 674374.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.8034026622772217, "sampling/importance_sampling_ratio/mean": 1.0019267797470093, "sampling/importance_sampling_ratio/min": 0.48356494307518005, "sampling/sampling_logp_difference/max": 0.726569652557373, "sampling/sampling_logp_difference/mean": 0.028325039893388748, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 122.9375, "completions/mean_terminated_length": 122.9375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.399412602186203, "epoch": 0.07787610619469026, "frac_reward_zero_std": 1.0, "grad_norm": 0.011838867228157263, "kl": 0.0004714487586170435, "learning_rate": 1.902654867256637e-07, "loss": 0.0, "num_tokens": 694210.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4246737957000732, "sampling/importance_sampling_ratio/mean": 1.0002517700195312, "sampling/importance_sampling_ratio/min": 0.6519432663917542, "sampling/sampling_logp_difference/max": 0.427797794342041, "sampling/sampling_logp_difference/mean": 0.014120910316705704, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 144.4375, "completions/mean_terminated_length": 144.4375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.5294415354728699, "epoch": 0.07964601769911504, "frac_reward_zero_std": 1.0, "grad_norm": 0.00685115726939914, "kl": 0.0005526579916477203, "learning_rate": 1.9469026548672566e-07, "loss": 0.0, "num_tokens": 712510.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6863336563110352, "sampling/importance_sampling_ratio/mean": 0.9996277093887329, "sampling/importance_sampling_ratio/min": 0.5524927973747253, "sampling/sampling_logp_difference/max": 0.5933148860931396, "sampling/sampling_logp_difference/mean": 0.019174396991729736, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 56.796875, "completions/mean_terminated_length": 56.796875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.33027297258377075, "epoch": 0.08141592920353982, "frac_reward_zero_std": 1.0, "grad_norm": 0.03531126435999156, "kl": 0.001525461906567216, "learning_rate": 1.991150442477876e-07, "loss": 0.0, "num_tokens": 726929.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4253754615783691, "sampling/importance_sampling_ratio/mean": 1.002415418624878, "sampling/importance_sampling_ratio/min": 0.6252660751342773, "sampling/sampling_logp_difference/max": 0.46957799792289734, "sampling/sampling_logp_difference/mean": 0.018032699823379517, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 55.4375, "completions/mean_terminated_length": 55.4375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3531503677368164, "epoch": 0.0831858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.03232207870409841, "kl": 0.0017060365062206984, "learning_rate": 2.0353982300884956e-07, "loss": 0.0, "num_tokens": 741069.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9697322845458984, "sampling/importance_sampling_ratio/mean": 1.0025861263275146, "sampling/importance_sampling_ratio/min": 0.6927888989448547, "sampling/sampling_logp_difference/max": 0.6778976917266846, "sampling/sampling_logp_difference/mean": 0.024530623108148575, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 136.765625, "completions/mean_terminated_length": 136.765625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3937437832355499, "epoch": 0.08495575221238938, "frac_reward_zero_std": 0.25, "grad_norm": 5.142458555957129, "kl": 0.0008791740983724594, "learning_rate": 2.0796460176991148e-07, "loss": 0.3402, "num_tokens": 759774.0, "reward": -0.09375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.628279685974121, "sampling/importance_sampling_ratio/mean": 1.0001052618026733, "sampling/importance_sampling_ratio/min": 0.49396607279777527, "sampling/sampling_logp_difference/max": 0.7052884101867676, "sampling/sampling_logp_difference/mean": 0.015893086791038513, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3387606143951416, "epoch": 0.08672566371681416, "frac_reward_zero_std": 1.0, "grad_norm": 0.03927816701742412, "kl": 0.001997170504182577, "learning_rate": 2.1238938053097344e-07, "loss": 0.0, "num_tokens": 773310.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4214093685150146, "sampling/importance_sampling_ratio/mean": 0.9992483854293823, "sampling/importance_sampling_ratio/min": 0.44950494170188904, "sampling/sampling_logp_difference/max": 0.7996084690093994, "sampling/sampling_logp_difference/mean": 0.022078003734350204, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 103.703125, "completions/mean_terminated_length": 103.703125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5467600226402283, "epoch": 0.08849557522123894, "frac_reward_zero_std": 0.5, "grad_norm": 19.329756091300126, "kl": 0.00122804322745651, "learning_rate": 2.1681415929203538e-07, "loss": 0.3504, "num_tokens": 790427.0, "reward": -0.4375, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999853253364563, "sampling/importance_sampling_ratio/min": 0.6109908223152161, "sampling/sampling_logp_difference/max": 1.131711483001709, "sampling/sampling_logp_difference/mean": 0.02128685638308525, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 92.84375, "completions/mean_terminated_length": 92.84375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3790997266769409, "epoch": 0.09026548672566372, "frac_reward_zero_std": 0.5, "grad_norm": 3.3632322784690682, "kl": 0.00037776207318529487, "learning_rate": 2.2123893805309735e-07, "loss": 0.0385, "num_tokens": 806721.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6228275299072266, "sampling/importance_sampling_ratio/mean": 0.9994162321090698, "sampling/importance_sampling_ratio/min": 0.5379316806793213, "sampling/sampling_logp_difference/max": 0.6200237274169922, "sampling/sampling_logp_difference/mean": 0.0147955771535635, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 66.234375, "completions/mean_terminated_length": 66.234375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2744632959365845, "epoch": 0.0920353982300885, "frac_reward_zero_std": 1.0, "grad_norm": 0.014767897890408175, "kl": 0.0005663610063493252, "learning_rate": 2.2566371681415928e-07, "loss": 0.0, "num_tokens": 825280.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3668099641799927, "sampling/importance_sampling_ratio/mean": 0.9997496604919434, "sampling/importance_sampling_ratio/min": 0.628419041633606, "sampling/sampling_logp_difference/max": 0.46454811096191406, "sampling/sampling_logp_difference/mean": 0.012168833054602146, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 51.671875, "completions/mean_terminated_length": 51.671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.22427326440811157, "epoch": 0.09380530973451327, "frac_reward_zero_std": 0.75, "grad_norm": 4.227366331101068, "kl": 0.002840688219293952, "learning_rate": 2.3008849557522122e-07, "loss": -0.1688, "num_tokens": 840923.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992762804031372, "sampling/importance_sampling_ratio/min": 0.5337795615196228, "sampling/sampling_logp_difference/max": 0.7441730499267578, "sampling/sampling_logp_difference/mean": 0.021255411207675934, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 89.21875, "completions/mean_terminated_length": 89.21875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2630472183227539, "epoch": 0.09557522123893805, "frac_reward_zero_std": 1.0, "grad_norm": 0.008579094624632907, "kl": 0.00046598073095083237, "learning_rate": 2.345132743362832e-07, "loss": 0.0, "num_tokens": 858249.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.662152647972107, "sampling/importance_sampling_ratio/mean": 1.0007331371307373, "sampling/importance_sampling_ratio/min": 0.5581602454185486, "sampling/sampling_logp_difference/max": 0.5831091403961182, "sampling/sampling_logp_difference/mean": 0.016348931938409805, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 96.078125, "completions/mean_terminated_length": 96.078125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.47166889905929565, "epoch": 0.09734513274336283, "frac_reward_zero_std": 1.0, "grad_norm": 0.010793334962958628, "kl": 0.0005595291731879115, "learning_rate": 2.3893805309734513e-07, "loss": 0.0, "num_tokens": 875118.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7272074222564697, "sampling/importance_sampling_ratio/mean": 0.9997779130935669, "sampling/importance_sampling_ratio/min": 0.21025174856185913, "sampling/sampling_logp_difference/max": 1.5594496726989746, "sampling/sampling_logp_difference/mean": 0.018348487094044685, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 54.390625, "completions/mean_terminated_length": 54.390625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.32978034019470215, "epoch": 0.09911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.03360661011059917, "kl": 0.0016041224589571357, "learning_rate": 2.4336283185840704e-07, "loss": 0.0, "num_tokens": 891415.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.544557809829712, "sampling/importance_sampling_ratio/mean": 0.9991042613983154, "sampling/importance_sampling_ratio/min": 0.6957619786262512, "sampling/sampling_logp_difference/max": 0.4347376823425293, "sampling/sampling_logp_difference/mean": 0.01648845709860325, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 98.546875, "completions/mean_terminated_length": 98.546875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.40308648347854614, "epoch": 0.10088495575221239, "frac_reward_zero_std": 1.0, "grad_norm": 0.015656436625466928, "kl": 0.00042458786629140377, "learning_rate": 2.47787610619469e-07, "loss": 0.0, "num_tokens": 907450.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5252330303192139, "sampling/importance_sampling_ratio/mean": 0.9999468922615051, "sampling/importance_sampling_ratio/min": 0.6792412996292114, "sampling/sampling_logp_difference/max": 0.4221472144126892, "sampling/sampling_logp_difference/mean": 0.017018482089042664, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 64.71875, "completions/mean_terminated_length": 64.71875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3652716875076294, "epoch": 0.10265486725663717, "frac_reward_zero_std": 0.75, "grad_norm": 2.7572585149403777, "kl": 0.000706555787473917, "learning_rate": 2.5221238938053097e-07, "loss": 0.0038, "num_tokens": 922536.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5869511365890503, "sampling/importance_sampling_ratio/mean": 1.0007580518722534, "sampling/importance_sampling_ratio/min": 0.4306277632713318, "sampling/sampling_logp_difference/max": 0.8425111770629883, "sampling/sampling_logp_difference/mean": 0.013503305613994598, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.765625, "completions/mean_terminated_length": 15.765625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.08323395252227783, "epoch": 0.10442477876106195, "frac_reward_zero_std": 1.0, "grad_norm": 0.10431399096983622, "kl": 0.0004693373921327293, "learning_rate": 2.5663716814159294e-07, "loss": 0.0, "num_tokens": 937513.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3673667907714844, "sampling/importance_sampling_ratio/mean": 0.9999486207962036, "sampling/importance_sampling_ratio/min": 0.8102474212646484, "sampling/sampling_logp_difference/max": 0.3128868341445923, "sampling/sampling_logp_difference/mean": 0.012339731678366661, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 28.484375, "completions/mean_terminated_length": 28.484375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.15878352522850037, "epoch": 0.10619469026548672, "frac_reward_zero_std": 1.0, "grad_norm": 0.08825490514685248, "kl": 0.0016191593604162335, "learning_rate": 2.6106194690265485e-07, "loss": 0.0, "num_tokens": 950072.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3832296133041382, "sampling/importance_sampling_ratio/mean": 0.9970600605010986, "sampling/importance_sampling_ratio/min": 0.6773175597190857, "sampling/sampling_logp_difference/max": 0.3896150588989258, "sampling/sampling_logp_difference/mean": 0.021082065999507904, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 87.203125, "completions/mean_terminated_length": 87.203125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2835385203361511, "epoch": 0.1079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 7.039394391202979, "kl": 0.000507982331328094, "learning_rate": 2.654867256637168e-07, "loss": 0.3259, "num_tokens": 969429.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4433850049972534, "sampling/importance_sampling_ratio/mean": 0.9994461536407471, "sampling/importance_sampling_ratio/min": 0.6672996878623962, "sampling/sampling_logp_difference/max": 0.4045161008834839, "sampling/sampling_logp_difference/mean": 0.014681078493595123, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 80.875, "completions/mean_terminated_length": 80.875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.42828965187072754, "epoch": 0.10973451327433628, "frac_reward_zero_std": 0.75, "grad_norm": 2.219259006737941, "kl": 0.0006035867263562977, "learning_rate": 2.6991150442477873e-07, "loss": 0.0008, "num_tokens": 984573.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999299168586731, "sampling/importance_sampling_ratio/min": 0.5546641945838928, "sampling/sampling_logp_difference/max": 1.1937751770019531, "sampling/sampling_logp_difference/mean": 0.019111908972263336, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 38.890625, "completions/mean_terminated_length": 38.890625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2435263991355896, "epoch": 0.11150442477876106, "frac_reward_zero_std": 0.75, "grad_norm": 20.082345347768168, "kl": 0.002106260508298874, "learning_rate": 2.743362831858407e-07, "loss": 0.1822, "num_tokens": 998118.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9094908237457275, "sampling/importance_sampling_ratio/mean": 1.0004956722259521, "sampling/importance_sampling_ratio/min": 0.7219632267951965, "sampling/sampling_logp_difference/max": 0.6468366384506226, "sampling/sampling_logp_difference/mean": 0.019784800708293915, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 34.28125, "completions/mean_terminated_length": 34.28125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.17724953591823578, "epoch": 0.11327433628318584, "frac_reward_zero_std": 1.0, "grad_norm": 0.055329392007934075, "kl": 0.0014039300149306655, "learning_rate": 2.787610619469026e-07, "loss": 0.0, "num_tokens": 1012360.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3268667459487915, "sampling/importance_sampling_ratio/mean": 0.9999594688415527, "sampling/importance_sampling_ratio/min": 0.5984827280044556, "sampling/sampling_logp_difference/max": 0.5133576393127441, "sampling/sampling_logp_difference/mean": 0.019908219575881958, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 74.0, "completions/mean_terminated_length": 74.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.25674378871917725, "epoch": 0.11504424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 3.358973041717826, "kl": 0.0008139506680890918, "learning_rate": 2.8318584070796457e-07, "loss": -0.0235, "num_tokens": 1027176.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5614769458770752, "sampling/importance_sampling_ratio/mean": 0.9981610774993896, "sampling/importance_sampling_ratio/min": 0.6101521253585815, "sampling/sampling_logp_difference/max": 0.4940469264984131, "sampling/sampling_logp_difference/mean": 0.021541573107242584, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 114.609375, "completions/mean_terminated_length": 114.609375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.5083310008049011, "epoch": 0.1168141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 2.6382262277093083, "kl": 0.0006657812045887113, "learning_rate": 2.8761061946902654e-07, "loss": 0.0159, "num_tokens": 1044191.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4574837684631348, "sampling/importance_sampling_ratio/mean": 1.0002875328063965, "sampling/importance_sampling_ratio/min": 0.6315723657608032, "sampling/sampling_logp_difference/max": 0.45954275131225586, "sampling/sampling_logp_difference/mean": 0.017609603703022003, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 68.65625, "completions/mean_terminated_length": 68.65625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4105982780456543, "epoch": 0.11858407079646018, "frac_reward_zero_std": 0.5, "grad_norm": 12.053850779982422, "kl": 0.0023665111511945724, "learning_rate": 2.920353982300885e-07, "loss": 0.3613, "num_tokens": 1058313.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.3561493158340454, "sampling/importance_sampling_ratio/mean": 0.9979859590530396, "sampling/importance_sampling_ratio/min": 0.4442462921142578, "sampling/sampling_logp_difference/max": 0.8113762140274048, "sampling/sampling_logp_difference/mean": 0.022402148693799973, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 66.46875, "completions/mean_terminated_length": 66.46875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1998130977153778, "epoch": 0.12035398230088495, "frac_reward_zero_std": 1.0, "grad_norm": 0.022656157015304077, "kl": 0.0007582076359540224, "learning_rate": 2.9646017699115047e-07, "loss": 0.0, "num_tokens": 1072263.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.412645936012268, "sampling/importance_sampling_ratio/mean": 0.9985370635986328, "sampling/importance_sampling_ratio/min": 0.593745231628418, "sampling/sampling_logp_difference/max": 0.5213049650192261, "sampling/sampling_logp_difference/mean": 0.018213676288723946, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 54.5625, "completions/mean_terminated_length": 54.5625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.271050363779068, "epoch": 0.12212389380530973, "frac_reward_zero_std": 1.0, "grad_norm": 0.013820374585321411, "kl": 0.00034594995668157935, "learning_rate": 3.008849557522124e-07, "loss": 0.0, "num_tokens": 1086347.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2971035242080688, "sampling/importance_sampling_ratio/mean": 0.999480664730072, "sampling/importance_sampling_ratio/min": 0.37043890357017517, "sampling/sampling_logp_difference/max": 0.9930667877197266, "sampling/sampling_logp_difference/mean": 0.011262361891567707, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 57.328125, "completions/mean_terminated_length": 57.328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.30324071645736694, "epoch": 0.12389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 3.125386142425997, "kl": 0.0008926556329242885, "learning_rate": 3.053097345132743e-07, "loss": -0.0308, "num_tokens": 1101872.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3413037061691284, "sampling/importance_sampling_ratio/mean": 0.9995997548103333, "sampling/importance_sampling_ratio/min": 0.6632469892501831, "sampling/sampling_logp_difference/max": 0.41060781478881836, "sampling/sampling_logp_difference/mean": 0.015544610098004341, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 97.671875, "completions/mean_terminated_length": 97.671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.6352293491363525, "epoch": 0.1256637168141593, "frac_reward_zero_std": 0.5, "grad_norm": 7.979445476665689, "kl": 0.001399570144712925, "learning_rate": 3.0973451327433626e-07, "loss": 0.1946, "num_tokens": 1118347.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.7627366781234741, "sampling/importance_sampling_ratio/mean": 1.0006338357925415, "sampling/importance_sampling_ratio/min": 0.5739402174949646, "sampling/sampling_logp_difference/max": 0.5668675899505615, "sampling/sampling_logp_difference/mean": 0.02080698311328888, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 80.890625, "completions/mean_terminated_length": 80.890625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3430714011192322, "epoch": 0.12743362831858407, "frac_reward_zero_std": 0.5, "grad_norm": 12.02956264887036, "kl": 0.006966053508222103, "learning_rate": 3.141592920353982e-07, "loss": 0.1236, "num_tokens": 1133908.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9982129335403442, "sampling/importance_sampling_ratio/min": 0.31648826599121094, "sampling/sampling_logp_difference/max": 1.150469183921814, "sampling/sampling_logp_difference/mean": 0.02067970484495163, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 92.75, "completions/mean_terminated_length": 92.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5193896293640137, "epoch": 0.12920353982300886, "frac_reward_zero_std": 0.75, "grad_norm": 3.848363550830051, "kl": 0.010346418246626854, "learning_rate": 3.1858407079646014e-07, "loss": -0.0719, "num_tokens": 1149572.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.555552363395691, "sampling/importance_sampling_ratio/mean": 0.9990873336791992, "sampling/importance_sampling_ratio/min": 0.48628026247024536, "sampling/sampling_logp_difference/max": 0.7209701538085938, "sampling/sampling_logp_difference/mean": 0.02255423739552498, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 64.125, "completions/mean_terminated_length": 64.125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.30180686712265015, "epoch": 0.13097345132743363, "frac_reward_zero_std": 0.75, "grad_norm": 10.872794222298856, "kl": 0.001606640056706965, "learning_rate": 3.230088495575221e-07, "loss": -0.3004, "num_tokens": 1163660.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7250492572784424, "sampling/importance_sampling_ratio/mean": 1.0003955364227295, "sampling/importance_sampling_ratio/min": 0.5589487552642822, "sampling/sampling_logp_difference/max": 0.5816974639892578, "sampling/sampling_logp_difference/mean": 0.02236917056143284, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 44.59375, "completions/mean_terminated_length": 44.59375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2116633653640747, "epoch": 0.13274336283185842, "frac_reward_zero_std": 0.75, "grad_norm": 2.8699103275799596, "kl": 0.006977086421102285, "learning_rate": 3.2743362831858407e-07, "loss": 0.0026, "num_tokens": 1176994.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4212322235107422, "sampling/importance_sampling_ratio/mean": 0.9926056265830994, "sampling/importance_sampling_ratio/min": 0.6300075054168701, "sampling/sampling_logp_difference/max": 0.4620234966278076, "sampling/sampling_logp_difference/mean": 0.02601846680045128, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 99.203125, "completions/mean_terminated_length": 99.203125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.41840457916259766, "epoch": 0.13451327433628318, "frac_reward_zero_std": 1.0, "grad_norm": 0.016256057216556645, "kl": 0.0008459609234705567, "learning_rate": 3.3185840707964603e-07, "loss": 0.0, "num_tokens": 1194431.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000793218612671, "sampling/importance_sampling_ratio/min": 0.5363112092018127, "sampling/sampling_logp_difference/max": 0.8022520542144775, "sampling/sampling_logp_difference/mean": 0.02195250429213047, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 101.984375, "completions/mean_terminated_length": 101.984375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.37862005829811096, "epoch": 0.13628318584070798, "frac_reward_zero_std": 0.5, "grad_norm": 10.132955638345939, "kl": 0.0018726624548435211, "learning_rate": 3.36283185840708e-07, "loss": -0.3206, "num_tokens": 1210270.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5600541830062866, "sampling/importance_sampling_ratio/mean": 1.0004552602767944, "sampling/importance_sampling_ratio/min": 0.6160954833030701, "sampling/sampling_logp_difference/max": 0.48435330390930176, "sampling/sampling_logp_difference/mean": 0.014363247901201248, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 64.0, "completions/mean_terminated_length": 64.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.3538378179073334, "epoch": 0.13805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 9.268417053893387, "kl": 0.005304547026753426, "learning_rate": 3.4070796460176986e-07, "loss": -0.1266, "num_tokens": 1228782.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5338397026062012, "sampling/importance_sampling_ratio/mean": 1.000978708267212, "sampling/importance_sampling_ratio/min": 0.598846971988678, "sampling/sampling_logp_difference/max": 0.512749195098877, "sampling/sampling_logp_difference/mean": 0.01858421415090561, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 96.515625, "completions/mean_terminated_length": 96.515625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4295077621936798, "epoch": 0.13982300884955753, "frac_reward_zero_std": 1.0, "grad_norm": 0.009828059132190585, "kl": 0.0005789292044937611, "learning_rate": 3.451327433628318e-07, "loss": 0.0, "num_tokens": 1245087.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9877007007598877, "sampling/importance_sampling_ratio/mean": 1.0007420778274536, "sampling/importance_sampling_ratio/min": 0.6938974857330322, "sampling/sampling_logp_difference/max": 0.6869785785675049, "sampling/sampling_logp_difference/mean": 0.01571083441376686, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 77.5, "completions/mean_terminated_length": 77.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2790239453315735, "epoch": 0.1415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 7.747476493391317, "kl": 0.003009934676811099, "learning_rate": 3.495575221238938e-07, "loss": 0.268, "num_tokens": 1262559.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996301531791687, "sampling/importance_sampling_ratio/min": 0.4342047870159149, "sampling/sampling_logp_difference/max": 0.8342390060424805, "sampling/sampling_logp_difference/mean": 0.019856680184602737, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.20376178622245789, "epoch": 0.1433628318584071, "frac_reward_zero_std": 0.75, "grad_norm": 12.49039325869809, "kl": 0.0034272735938429832, "learning_rate": 3.5398230088495575e-07, "loss": -0.4271, "num_tokens": 1277551.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3819938898086548, "sampling/importance_sampling_ratio/mean": 0.9996453523635864, "sampling/importance_sampling_ratio/min": 0.5875664353370667, "sampling/sampling_logp_difference/max": 0.5317659378051758, "sampling/sampling_logp_difference/mean": 0.0125980693846941, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 63.875, "completions/mean_terminated_length": 63.875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.23782622814178467, "epoch": 0.14513274336283186, "frac_reward_zero_std": 0.75, "grad_norm": 13.890965293596501, "kl": 0.0006324282148852944, "learning_rate": 3.5840707964601767e-07, "loss": 0.1274, "num_tokens": 1294071.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.7391748428344727, "sampling/importance_sampling_ratio/mean": 1.0006330013275146, "sampling/importance_sampling_ratio/min": 0.5648089051246643, "sampling/sampling_logp_difference/max": 0.57126784324646, "sampling/sampling_logp_difference/mean": 0.015169214457273483, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 48.328125, "completions/mean_terminated_length": 48.328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.295986533164978, "epoch": 0.14690265486725665, "frac_reward_zero_std": 1.0, "grad_norm": 0.02545414431293979, "kl": 0.00065580167574808, "learning_rate": 3.6283185840707963e-07, "loss": 0.0, "num_tokens": 1309340.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3446919918060303, "sampling/importance_sampling_ratio/mean": 0.9991348385810852, "sampling/importance_sampling_ratio/min": 0.6952046751976013, "sampling/sampling_logp_difference/max": 0.36354899406433105, "sampling/sampling_logp_difference/mean": 0.016460547223687172, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 60.15625, "completions/mean_terminated_length": 60.15625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2950069308280945, "epoch": 0.1486725663716814, "frac_reward_zero_std": 1.0, "grad_norm": 0.03425744965710435, "kl": 0.0017368867993354797, "learning_rate": 3.672566371681416e-07, "loss": 0.0, "num_tokens": 1324326.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3615399599075317, "sampling/importance_sampling_ratio/mean": 0.9983700513839722, "sampling/importance_sampling_ratio/min": 0.556420087814331, "sampling/sampling_logp_difference/max": 0.5862317085266113, "sampling/sampling_logp_difference/mean": 0.018328040838241577, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 106.921875, "completions/mean_terminated_length": 106.921875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5123524069786072, "epoch": 0.1504424778761062, "frac_reward_zero_std": 0.5, "grad_norm": 17.031908186631007, "kl": 0.002637058962136507, "learning_rate": 3.7168141592920356e-07, "loss": 0.3122, "num_tokens": 1341793.0, "reward": 0.21875, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0009326934814453, "sampling/importance_sampling_ratio/min": 0.5631783604621887, "sampling/sampling_logp_difference/max": 1.1850533485412598, "sampling/sampling_logp_difference/mean": 0.020340263843536377, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 91.390625, "completions/mean_terminated_length": 91.390625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4329564571380615, "epoch": 0.15221238938053097, "frac_reward_zero_std": 1.0, "grad_norm": 0.023204412462173665, "kl": 0.001921823131851852, "learning_rate": 3.761061946902654e-07, "loss": 0.0, "num_tokens": 1357770.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6110613346099854, "sampling/importance_sampling_ratio/mean": 1.0003230571746826, "sampling/importance_sampling_ratio/min": 0.46672725677490234, "sampling/sampling_logp_difference/max": 0.7620102167129517, "sampling/sampling_logp_difference/mean": 0.018431926146149635, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 63.953125, "completions/mean_terminated_length": 63.953125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.42276352643966675, "epoch": 0.15398230088495576, "frac_reward_zero_std": 0.25, "grad_norm": 16.80517251352282, "kl": 0.00828557275235653, "learning_rate": 3.805309734513274e-07, "loss": 0.3641, "num_tokens": 1371687.0, "reward": 0.78125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.7898368835449219, "sampling/importance_sampling_ratio/mean": 1.0000966787338257, "sampling/importance_sampling_ratio/min": 0.5008212327957153, "sampling/sampling_logp_difference/max": 0.6915061473846436, "sampling/sampling_logp_difference/mean": 0.019807137548923492, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 96.0625, "completions/mean_terminated_length": 96.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3446410298347473, "epoch": 0.15575221238938053, "frac_reward_zero_std": 0.5, "grad_norm": 9.858522314114232, "kl": 0.0018287475686520338, "learning_rate": 3.8495575221238935e-07, "loss": 0.3426, "num_tokens": 1388731.0, "reward": 0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008213520050049, "sampling/importance_sampling_ratio/min": 0.33204153180122375, "sampling/sampling_logp_difference/max": 1.1024951934814453, "sampling/sampling_logp_difference/mean": 0.01567666232585907, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 85.3125, "completions/mean_terminated_length": 85.3125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.3201582133769989, "epoch": 0.15752212389380532, "frac_reward_zero_std": 1.0, "grad_norm": 0.014084128619798917, "kl": 0.000516217143740505, "learning_rate": 3.893805309734513e-07, "loss": 0.0, "num_tokens": 1404319.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4318376779556274, "sampling/importance_sampling_ratio/mean": 1.0020582675933838, "sampling/importance_sampling_ratio/min": 0.6632694602012634, "sampling/sampling_logp_difference/max": 0.41057395935058594, "sampling/sampling_logp_difference/mean": 0.015921611338853836, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 63.234375, "completions/mean_terminated_length": 63.234375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.27583983540534973, "epoch": 0.1592920353982301, "frac_reward_zero_std": 0.75, "grad_norm": 15.744491824905102, "kl": 0.0023850714787840843, "learning_rate": 3.938053097345133e-07, "loss": 0.0948, "num_tokens": 1422062.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9983652830123901, "sampling/importance_sampling_ratio/min": 0.6548058390617371, "sampling/sampling_logp_difference/max": 0.9749901294708252, "sampling/sampling_logp_difference/mean": 0.014708740636706352, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.34375, "completions/mean_terminated_length": 17.34375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.06893637031316757, "epoch": 0.16106194690265488, "frac_reward_zero_std": 1.0, "grad_norm": 0.19984842013006643, "kl": 0.002403866034001112, "learning_rate": 3.982300884955752e-07, "loss": 0.0, "num_tokens": 1439604.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.133671522140503, "sampling/importance_sampling_ratio/mean": 0.9994039535522461, "sampling/importance_sampling_ratio/min": 0.6547919511795044, "sampling/sampling_logp_difference/max": 0.4234377145767212, "sampling/sampling_logp_difference/mean": 0.007659217808395624, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 36.953125, "completions/mean_terminated_length": 36.953125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.26978591084480286, "epoch": 0.16283185840707964, "frac_reward_zero_std": 0.75, "grad_norm": 16.587903339877094, "kl": 0.003500830614939332, "learning_rate": 4.0265486725663716e-07, "loss": 0.1325, "num_tokens": 1455329.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6246740818023682, "sampling/importance_sampling_ratio/mean": 1.0004960298538208, "sampling/importance_sampling_ratio/min": 0.5870731472969055, "sampling/sampling_logp_difference/max": 0.532605767250061, "sampling/sampling_logp_difference/mean": 0.014703018590807915, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 84.109375, "completions/mean_terminated_length": 84.109375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.47278204560279846, "epoch": 0.16460176991150444, "frac_reward_zero_std": 0.25, "grad_norm": 11.06688751319423, "kl": 0.003939971327781677, "learning_rate": 4.0707964601769913e-07, "loss": 0.1314, "num_tokens": 1471768.0, "reward": 0.03125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0012634992599487, "sampling/importance_sampling_ratio/min": 0.46622326970100403, "sampling/sampling_logp_difference/max": 0.8460280895233154, "sampling/sampling_logp_difference/mean": 0.022663792595267296, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 59.6875, "completions/mean_terminated_length": 59.6875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2976003885269165, "epoch": 0.1663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 4.901967299367, "kl": 0.002563281450420618, "learning_rate": 4.1150442477876104e-07, "loss": -0.3815, "num_tokens": 1485444.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3954286575317383, "sampling/importance_sampling_ratio/mean": 1.0000526905059814, "sampling/importance_sampling_ratio/min": 0.6176431775093079, "sampling/sampling_logp_difference/max": 0.481844425201416, "sampling/sampling_logp_difference/mean": 0.015158753842115402, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 51.46875, "completions/mean_terminated_length": 51.46875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2984507083892822, "epoch": 0.168141592920354, "frac_reward_zero_std": 0.5, "grad_norm": 13.83890318477302, "kl": 0.05415243282914162, "learning_rate": 4.1592920353982295e-07, "loss": 0.2832, "num_tokens": 1500914.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7520718574523926, "sampling/importance_sampling_ratio/mean": 1.0006399154663086, "sampling/importance_sampling_ratio/min": 0.4704074263572693, "sampling/sampling_logp_difference/max": 0.7541561126708984, "sampling/sampling_logp_difference/mean": 0.017951540648937225, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 32.59375, "completions/mean_terminated_length": 32.59375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.13821932673454285, "epoch": 0.16991150442477876, "frac_reward_zero_std": 1.0, "grad_norm": 0.19751930399724366, "kl": 0.022612882778048515, "learning_rate": 4.203539823008849e-07, "loss": 0.0001, "num_tokens": 1512440.0, "reward": -1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": -1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7523033618927002, "sampling/importance_sampling_ratio/mean": 0.9995698928833008, "sampling/importance_sampling_ratio/min": 0.4824843406677246, "sampling/sampling_logp_difference/max": 0.7288068532943726, "sampling/sampling_logp_difference/mean": 0.02123548462986946, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 21.15625, "completions/mean_terminated_length": 21.15625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2681628465652466, "epoch": 0.17168141592920355, "frac_reward_zero_std": 0.25, "grad_norm": 84.58976760954279, "kl": 1.1065090894699097, "learning_rate": 4.247787610619469e-07, "loss": -0.0586, "num_tokens": 1522786.0, "reward": -0.03125, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.9697527885437012, "sampling/importance_sampling_ratio/mean": 0.9997271299362183, "sampling/importance_sampling_ratio/min": 0.0831858441233635, "sampling/sampling_logp_difference/max": 2.486678123474121, "sampling/sampling_logp_difference/mean": 0.025643471628427505, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.06743040680885315, "epoch": 0.17345132743362832, "frac_reward_zero_std": 1.0, "grad_norm": 0.39896373700566834, "kl": 0.010008303448557854, "learning_rate": 4.2920353982300885e-07, "loss": 0.0001, "num_tokens": 1536762.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5342676639556885, "sampling/importance_sampling_ratio/mean": 0.9996311068534851, "sampling/importance_sampling_ratio/min": 0.6176677346229553, "sampling/sampling_logp_difference/max": 0.48180460929870605, "sampling/sampling_logp_difference/mean": 0.012232774868607521, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 39.625, "completions/mean_terminated_length": 39.625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2937965393066406, "epoch": 0.1752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 17.05345220718161, "kl": 0.047442853450775146, "learning_rate": 4.3362831858407076e-07, "loss": 0.1838, "num_tokens": 1551058.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.50980806350708, "sampling/importance_sampling_ratio/mean": 0.9996635913848877, "sampling/importance_sampling_ratio/min": 0.5629895925521851, "sampling/sampling_logp_difference/max": 0.5744941234588623, "sampling/sampling_logp_difference/mean": 0.012546870857477188, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 43.921875, "completions/mean_terminated_length": 43.921875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.25557002425193787, "epoch": 0.17699115044247787, "frac_reward_zero_std": 0.5, "grad_norm": 23.867050724829383, "kl": 0.012761028483510017, "learning_rate": 4.380530973451327e-07, "loss": -0.2337, "num_tokens": 1569261.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0012588500976562, "sampling/importance_sampling_ratio/min": 0.5261755585670471, "sampling/sampling_logp_difference/max": 0.8514184951782227, "sampling/sampling_logp_difference/mean": 0.017819223925471306, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.06638449430465698, "epoch": 0.17876106194690267, "frac_reward_zero_std": 1.0, "grad_norm": 0.44630179415515814, "kl": 0.024098360911011696, "learning_rate": 4.424778761061947e-07, "loss": 0.0002, "num_tokens": 1581645.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2675453424453735, "sampling/importance_sampling_ratio/mean": 0.999424397945404, "sampling/importance_sampling_ratio/min": 0.6548995971679688, "sampling/sampling_logp_difference/max": 0.42327332496643066, "sampling/sampling_logp_difference/mean": 0.011951332911849022, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 36.296875, "completions/mean_terminated_length": 36.296875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2074262946844101, "epoch": 0.18053097345132743, "frac_reward_zero_std": 0.75, "grad_norm": 17.8257040036157, "kl": 0.054125286638736725, "learning_rate": 4.469026548672566e-07, "loss": -0.144, "num_tokens": 1593456.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6750662326812744, "sampling/importance_sampling_ratio/mean": 0.9999299049377441, "sampling/importance_sampling_ratio/min": 0.6054568886756897, "sampling/sampling_logp_difference/max": 0.515852689743042, "sampling/sampling_logp_difference/mean": 0.012604659423232079, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 44.046875, "completions/mean_terminated_length": 44.046875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.32559850811958313, "epoch": 0.18230088495575222, "frac_reward_zero_std": 0.5, "grad_norm": 32.22001959212673, "kl": 0.13167370855808258, "learning_rate": 4.5132743362831857e-07, "loss": -0.3729, "num_tokens": 1606947.0, "reward": 0.25, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995090961456299, "sampling/importance_sampling_ratio/min": 0.4825321435928345, "sampling/sampling_logp_difference/max": 0.830096960067749, "sampling/sampling_logp_difference/mean": 0.02871047519147396, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 59.21875, "completions/mean_terminated_length": 59.21875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4233275055885315, "epoch": 0.184070796460177, "frac_reward_zero_std": 0.25, "grad_norm": 32.05855820384328, "kl": 0.05426112934947014, "learning_rate": 4.557522123893805e-07, "loss": -0.4524, "num_tokens": 1620529.0, "reward": 0.46875, "reward_std": 0.5959457159042358, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9988217353820801, "sampling/importance_sampling_ratio/min": 0.47307318449020386, "sampling/sampling_logp_difference/max": 1.6695234775543213, "sampling/sampling_logp_difference/mean": 0.026059508323669434, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 16.578125, "completions/mean_terminated_length": 16.578125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.14209511876106262, "epoch": 0.18584070796460178, "frac_reward_zero_std": 1.0, "grad_norm": 0.0777847234694372, "kl": 0.08551844954490662, "learning_rate": 4.6017699115044245e-07, "loss": 0.0008, "num_tokens": 1636598.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2430133819580078, "sampling/importance_sampling_ratio/mean": 1.001842737197876, "sampling/importance_sampling_ratio/min": 0.42892172932624817, "sampling/sampling_logp_difference/max": 0.8464808464050293, "sampling/sampling_logp_difference/mean": 0.009883169084787369, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 35.625, "completions/mean_terminated_length": 35.625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1706276834011078, "epoch": 0.18761061946902655, "frac_reward_zero_std": 0.75, "grad_norm": 16.242571617604863, "kl": 0.008568068966269493, "learning_rate": 4.646017699115044e-07, "loss": -0.3094, "num_tokens": 1652014.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7361836433410645, "sampling/importance_sampling_ratio/mean": 0.9986557960510254, "sampling/importance_sampling_ratio/min": 0.49548566341400146, "sampling/sampling_logp_difference/max": 0.7022168040275574, "sampling/sampling_logp_difference/mean": 0.017092259600758553, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 53.09375, "completions/mean_terminated_length": 53.09375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2413337230682373, "epoch": 0.18938053097345134, "frac_reward_zero_std": 0.5, "grad_norm": 22.783578598982626, "kl": 0.0069497618824243546, "learning_rate": 4.690265486725664e-07, "loss": -0.2593, "num_tokens": 1667156.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5598926544189453, "sampling/importance_sampling_ratio/mean": 0.9992170333862305, "sampling/importance_sampling_ratio/min": 0.733043909072876, "sampling/sampling_logp_difference/max": 0.44461703300476074, "sampling/sampling_logp_difference/mean": 0.01676351949572563, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 30.765625, "completions/mean_terminated_length": 30.765625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.17704860866069794, "epoch": 0.1911504424778761, "frac_reward_zero_std": 0.75, "grad_norm": 22.062156584055394, "kl": 0.015616398304700851, "learning_rate": 4.734513274336283e-07, "loss": 0.584, "num_tokens": 1680021.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6436699628829956, "sampling/importance_sampling_ratio/mean": 1.0013000965118408, "sampling/importance_sampling_ratio/min": 0.6871146559715271, "sampling/sampling_logp_difference/max": 0.4969315528869629, "sampling/sampling_logp_difference/mean": 0.013623833656311035, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 83.0625, "completions/mean_terminated_length": 83.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2795030176639557, "epoch": 0.1929203539823009, "frac_reward_zero_std": 0.75, "grad_norm": 3.5345931936458213, "kl": 0.0018449004273861647, "learning_rate": 4.778761061946903e-07, "loss": 0.0868, "num_tokens": 1695865.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4435425996780396, "sampling/importance_sampling_ratio/mean": 0.9981499314308167, "sampling/importance_sampling_ratio/min": 0.6196099519729614, "sampling/sampling_logp_difference/max": 0.4786651134490967, "sampling/sampling_logp_difference/mean": 0.01591462641954422, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 58.171875, "completions/mean_terminated_length": 58.171875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2562200129032135, "epoch": 0.19469026548672566, "frac_reward_zero_std": 1.0, "grad_norm": 0.02541889548481687, "kl": 0.0014256872236728668, "learning_rate": 4.823008849557521e-07, "loss": 0.0, "num_tokens": 1709716.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6915197372436523, "sampling/importance_sampling_ratio/mean": 0.9971394538879395, "sampling/importance_sampling_ratio/min": 0.535825788974762, "sampling/sampling_logp_difference/max": 0.6239461898803711, "sampling/sampling_logp_difference/mean": 0.017007606104016304, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 23.390625, "completions/mean_terminated_length": 23.390625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.21091923117637634, "epoch": 0.19646017699115045, "frac_reward_zero_std": 0.75, "grad_norm": 29.116072245725366, "kl": 0.023805273696780205, "learning_rate": 4.867256637168141e-07, "loss": 0.4792, "num_tokens": 1725309.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.825963020324707, "sampling/importance_sampling_ratio/mean": 0.9973196387290955, "sampling/importance_sampling_ratio/min": 0.6877333521842957, "sampling/sampling_logp_difference/max": 0.6021075248718262, "sampling/sampling_logp_difference/mean": 0.02469644322991371, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 41.171875, "completions/mean_terminated_length": 41.171875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.337478369474411, "epoch": 0.19823008849557522, "frac_reward_zero_std": 0.5, "grad_norm": 28.478295365787474, "kl": 0.019370798021554947, "learning_rate": 4.91150442477876e-07, "loss": -0.373, "num_tokens": 1746520.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4072273969650269, "sampling/importance_sampling_ratio/mean": 1.0000637769699097, "sampling/importance_sampling_ratio/min": 0.5572885274887085, "sampling/sampling_logp_difference/max": 0.5846721529960632, "sampling/sampling_logp_difference/mean": 0.015861041843891144, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 67.5, "completions/mean_terminated_length": 67.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.43532299995422363, "epoch": 0.2, "frac_reward_zero_std": 0.25, "grad_norm": 11.864682675131935, "kl": 0.018026825040578842, "learning_rate": 4.95575221238938e-07, "loss": -0.2313, "num_tokens": 1765192.0, "reward": -0.1875, "reward_std": 0.6991121172904968, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.565968632698059, "sampling/importance_sampling_ratio/mean": 0.9986177682876587, "sampling/importance_sampling_ratio/min": 0.6226667761802673, "sampling/sampling_logp_difference/max": 0.4737436771392822, "sampling/sampling_logp_difference/mean": 0.01891300082206726, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 72.5625, "completions/mean_terminated_length": 72.5625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3272761106491089, "epoch": 0.20176991150442478, "frac_reward_zero_std": 0.5, "grad_norm": 5.146958292258678, "kl": 0.05377377197146416, "learning_rate": 5e-07, "loss": -0.005, "num_tokens": 1782668.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0012102127075195, "sampling/importance_sampling_ratio/min": 0.3370976150035858, "sampling/sampling_logp_difference/max": 1.0873827934265137, "sampling/sampling_logp_difference/mean": 0.024121500551700592, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 82.59375, "completions/mean_terminated_length": 82.59375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.46006712317466736, "epoch": 0.20353982300884957, "frac_reward_zero_std": 0.25, "grad_norm": 6.751666668869363, "kl": 0.07440268248319626, "learning_rate": 5.044247787610619e-07, "loss": 0.0712, "num_tokens": 1799058.0, "reward": 0.90625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9985596537590027, "sampling/importance_sampling_ratio/min": 0.4345816969871521, "sampling/sampling_logp_difference/max": 0.972254753112793, "sampling/sampling_logp_difference/mean": 0.019559338688850403, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 37.625, "completions/mean_terminated_length": 37.625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2306581437587738, "epoch": 0.20530973451327433, "frac_reward_zero_std": 1.0, "grad_norm": 0.09028985253936475, "kl": 0.003620689269155264, "learning_rate": 5.088495575221239e-07, "loss": 0.0, "num_tokens": 1815882.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6114362478256226, "sampling/importance_sampling_ratio/mean": 0.9963452816009521, "sampling/importance_sampling_ratio/min": 0.41382789611816406, "sampling/sampling_logp_difference/max": 0.8823051452636719, "sampling/sampling_logp_difference/mean": 0.019629651680588722, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2207641899585724, "epoch": 0.20707964601769913, "frac_reward_zero_std": 1.0, "grad_norm": 0.030791093669244915, "kl": 0.001684409799054265, "learning_rate": 5.132743362831859e-07, "loss": 0.0, "num_tokens": 1830506.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4853850603103638, "sampling/importance_sampling_ratio/mean": 0.9980629086494446, "sampling/importance_sampling_ratio/min": 0.444266676902771, "sampling/sampling_logp_difference/max": 0.8113303184509277, "sampling/sampling_logp_difference/mean": 0.028768619522452354, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 64.484375, "completions/mean_terminated_length": 64.484375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3205970227718353, "epoch": 0.2088495575221239, "frac_reward_zero_std": 1.0, "grad_norm": 0.03203958114580459, "kl": 0.0018948880024254322, "learning_rate": 5.176991150442478e-07, "loss": 0.0, "num_tokens": 1845561.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5698038339614868, "sampling/importance_sampling_ratio/mean": 1.0007352828979492, "sampling/importance_sampling_ratio/min": 0.6012983322143555, "sampling/sampling_logp_difference/max": 0.5086641311645508, "sampling/sampling_logp_difference/mean": 0.021763615310192108, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 140.8125, "completions/mean_terminated_length": 140.8125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.5416373014450073, "epoch": 0.21061946902654868, "frac_reward_zero_std": 1.0, "grad_norm": 0.007404796935460493, "kl": 0.0005785170360468328, "learning_rate": 5.221238938053097e-07, "loss": 0.0, "num_tokens": 1863789.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4254950284957886, "sampling/importance_sampling_ratio/mean": 1.000152349472046, "sampling/importance_sampling_ratio/min": 0.6323530077934265, "sampling/sampling_logp_difference/max": 0.45830750465393066, "sampling/sampling_logp_difference/mean": 0.01834021881222725, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 49.9375, "completions/mean_terminated_length": 49.9375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1841767281293869, "epoch": 0.21238938053097345, "frac_reward_zero_std": 0.75, "grad_norm": 14.962615729352883, "kl": 0.005641619674861431, "learning_rate": 5.265486725663717e-07, "loss": 0.2669, "num_tokens": 1878841.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.765735149383545, "sampling/importance_sampling_ratio/mean": 0.9993394613265991, "sampling/importance_sampling_ratio/min": 0.7040172815322876, "sampling/sampling_logp_difference/max": 0.568567156791687, "sampling/sampling_logp_difference/mean": 0.009488564915955067, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 44.046875, "completions/mean_terminated_length": 44.046875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2508974075317383, "epoch": 0.21415929203539824, "frac_reward_zero_std": 0.75, "grad_norm": 15.138540143090314, "kl": 0.003447249997407198, "learning_rate": 5.309734513274336e-07, "loss": -0.2376, "num_tokens": 1895772.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6151796579360962, "sampling/importance_sampling_ratio/mean": 0.9982805252075195, "sampling/importance_sampling_ratio/min": 0.6492539644241333, "sampling/sampling_logp_difference/max": 0.4794461727142334, "sampling/sampling_logp_difference/mean": 0.017452435567975044, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 38.140625, "completions/mean_terminated_length": 38.140625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3044247627258301, "epoch": 0.215929203539823, "frac_reward_zero_std": 0.25, "grad_norm": 19.479083260753757, "kl": 0.01046132855117321, "learning_rate": 5.353982300884956e-07, "loss": 0.0352, "num_tokens": 1911141.0, "reward": 0.34375, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5219072103500366, "sampling/importance_sampling_ratio/mean": 0.9994726181030273, "sampling/importance_sampling_ratio/min": 0.5666710734367371, "sampling/sampling_logp_difference/max": 0.5679762363433838, "sampling/sampling_logp_difference/mean": 0.016208771616220474, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 105.75, "completions/mean_terminated_length": 105.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.519478440284729, "epoch": 0.2176991150442478, "frac_reward_zero_std": 1.0, "grad_norm": 0.019650278961170106, "kl": 0.0011656515998765826, "learning_rate": 5.398230088495575e-07, "loss": 0.0, "num_tokens": 1927541.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8163784742355347, "sampling/importance_sampling_ratio/mean": 1.000349998474121, "sampling/importance_sampling_ratio/min": 0.6328713893890381, "sampling/sampling_logp_difference/max": 0.5968446731567383, "sampling/sampling_logp_difference/mean": 0.019682027399539948, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 106.96875, "completions/mean_terminated_length": 106.96875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.45931580662727356, "epoch": 0.21946902654867256, "frac_reward_zero_std": 1.0, "grad_norm": 0.008681335336834708, "kl": 0.0005357326008379459, "learning_rate": 5.442477876106194e-07, "loss": 0.0, "num_tokens": 1944499.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.929969310760498, "sampling/importance_sampling_ratio/mean": 1.0011227130889893, "sampling/importance_sampling_ratio/min": 0.6561875939369202, "sampling/sampling_logp_difference/max": 0.6575040817260742, "sampling/sampling_logp_difference/mean": 0.01847759075462818, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 110.75, "completions/mean_terminated_length": 110.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.37561601400375366, "epoch": 0.22123893805309736, "frac_reward_zero_std": 0.75, "grad_norm": 3.9753687076399795, "kl": 0.0010785853955894709, "learning_rate": 5.486725663716814e-07, "loss": -0.0333, "num_tokens": 1961459.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995502829551697, "sampling/importance_sampling_ratio/min": 0.6624337434768677, "sampling/sampling_logp_difference/max": 1.584089994430542, "sampling/sampling_logp_difference/mean": 0.016993992030620575, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 70.53125, "completions/mean_terminated_length": 70.53125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.31924882531166077, "epoch": 0.22300884955752212, "frac_reward_zero_std": 1.0, "grad_norm": 0.031328937808989196, "kl": 0.0014360514469444752, "learning_rate": 5.530973451327434e-07, "loss": 0.0, "num_tokens": 1975653.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8715254068374634, "sampling/importance_sampling_ratio/mean": 1.0022432804107666, "sampling/importance_sampling_ratio/min": 0.6149583458900452, "sampling/sampling_logp_difference/max": 0.6267538070678711, "sampling/sampling_logp_difference/mean": 0.025006473064422607, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 82.21875, "completions/mean_terminated_length": 82.21875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3541611433029175, "epoch": 0.2247787610619469, "frac_reward_zero_std": 1.0, "grad_norm": 0.013457624899861007, "kl": 0.0013129812432453036, "learning_rate": 5.575221238938052e-07, "loss": 0.0, "num_tokens": 1995827.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.574439525604248, "sampling/importance_sampling_ratio/mean": 0.9993070960044861, "sampling/importance_sampling_ratio/min": 0.5381397008895874, "sampling/sampling_logp_difference/max": 0.6196370124816895, "sampling/sampling_logp_difference/mean": 0.014719847589731216, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 67.015625, "completions/mean_terminated_length": 67.015625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.44300490617752075, "epoch": 0.22654867256637168, "frac_reward_zero_std": 0.75, "grad_norm": 6.134743959337715, "kl": 0.0014869027072563767, "learning_rate": 5.619469026548672e-07, "loss": 0.2426, "num_tokens": 2011460.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8721041679382324, "sampling/importance_sampling_ratio/mean": 0.9977869987487793, "sampling/importance_sampling_ratio/min": 0.4702486991882324, "sampling/sampling_logp_difference/max": 0.7544935941696167, "sampling/sampling_logp_difference/mean": 0.023253358900547028, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 81.8125, "completions/mean_terminated_length": 81.8125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2518032491207123, "epoch": 0.22831858407079647, "frac_reward_zero_std": 0.75, "grad_norm": 8.862384196324069, "kl": 0.0007762154564261436, "learning_rate": 5.663716814159291e-07, "loss": 0.1619, "num_tokens": 2027848.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0013949871063232, "sampling/importance_sampling_ratio/min": 0.5217262506484985, "sampling/sampling_logp_difference/max": 1.1203207969665527, "sampling/sampling_logp_difference/mean": 0.01854543387889862, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 71.625, "completions/mean_terminated_length": 71.625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.299152135848999, "epoch": 0.23008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.014935968388077706, "kl": 0.0009952923282980919, "learning_rate": 5.707964601769911e-07, "loss": 0.0, "num_tokens": 2042800.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6621967554092407, "sampling/importance_sampling_ratio/mean": 1.0008690357208252, "sampling/importance_sampling_ratio/min": 0.4874541759490967, "sampling/sampling_logp_difference/max": 0.7185590267181396, "sampling/sampling_logp_difference/mean": 0.018116779625415802, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 129.546875, "completions/mean_terminated_length": 129.546875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4606068730354309, "epoch": 0.23185840707964603, "frac_reward_zero_std": 0.5, "grad_norm": 6.560787920244211, "kl": 0.001249688328243792, "learning_rate": 5.752212389380531e-07, "loss": 0.1933, "num_tokens": 2060803.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5971571207046509, "sampling/importance_sampling_ratio/mean": 1.0004818439483643, "sampling/importance_sampling_ratio/min": 0.5036928653717041, "sampling/sampling_logp_difference/max": 0.685788631439209, "sampling/sampling_logp_difference/mean": 0.01681518740952015, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 84.25, "completions/mean_terminated_length": 84.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3058205246925354, "epoch": 0.2336283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.01254687731931507, "kl": 0.0007779281586408615, "learning_rate": 5.79646017699115e-07, "loss": 0.0, "num_tokens": 2078243.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002914667129517, "sampling/importance_sampling_ratio/min": 0.4855709969997406, "sampling/sampling_logp_difference/max": 0.7224297523498535, "sampling/sampling_logp_difference/mean": 0.02297021448612213, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 69.484375, "completions/mean_terminated_length": 69.484375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.26227444410324097, "epoch": 0.23539823008849559, "frac_reward_zero_std": 1.0, "grad_norm": 0.016136415805378446, "kl": 0.0010202470002695918, "learning_rate": 5.84070796460177e-07, "loss": 0.0, "num_tokens": 2096418.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.781939148902893, "sampling/importance_sampling_ratio/mean": 0.9994803667068481, "sampling/importance_sampling_ratio/min": 0.5584296584129333, "sampling/sampling_logp_difference/max": 0.5826265811920166, "sampling/sampling_logp_difference/mean": 0.014151913113892078, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 72.875, "completions/mean_terminated_length": 72.875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.24580150842666626, "epoch": 0.23716814159292035, "frac_reward_zero_std": 1.0, "grad_norm": 0.02503235510969517, "kl": 0.001629705191589892, "learning_rate": 5.88495575221239e-07, "loss": 0.0, "num_tokens": 2112890.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5154296159744263, "sampling/importance_sampling_ratio/mean": 1.0017865896224976, "sampling/importance_sampling_ratio/min": 0.6210728287696838, "sampling/sampling_logp_difference/max": 0.4763069152832031, "sampling/sampling_logp_difference/mean": 0.01634804531931877, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 75.90625, "completions/mean_terminated_length": 75.90625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5235762000083923, "epoch": 0.23893805309734514, "frac_reward_zero_std": 1.0, "grad_norm": 0.01575589802905874, "kl": 0.001007390907034278, "learning_rate": 5.929203539823009e-07, "loss": 0.0, "num_tokens": 2129028.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001007080078125, "sampling/importance_sampling_ratio/min": 0.6624895334243774, "sampling/sampling_logp_difference/max": 0.7274326086044312, "sampling/sampling_logp_difference/mean": 0.02209286577999592, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 88.375, "completions/mean_terminated_length": 88.375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5046366453170776, "epoch": 0.2407079646017699, "frac_reward_zero_std": 0.75, "grad_norm": 3.578048939539401, "kl": 0.0036787153221666813, "learning_rate": 5.973451327433628e-07, "loss": 0.1217, "num_tokens": 2144508.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4554598331451416, "sampling/importance_sampling_ratio/mean": 0.9992860555648804, "sampling/importance_sampling_ratio/min": 0.25643423199653625, "sampling/sampling_logp_difference/max": 1.3608829975128174, "sampling/sampling_logp_difference/mean": 0.022801248356699944, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 33.5625, "completions/mean_terminated_length": 33.5625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.18427319824695587, "epoch": 0.2424778761061947, "frac_reward_zero_std": 1.0, "grad_norm": 0.03157405100169581, "kl": 0.0014091174816712737, "learning_rate": 6.017699115044248e-07, "loss": 0.0, "num_tokens": 2156688.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007842779159546, "sampling/importance_sampling_ratio/min": 0.627287745475769, "sampling/sampling_logp_difference/max": 0.8145256042480469, "sampling/sampling_logp_difference/mean": 0.017598610371351242, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 71.640625, "completions/mean_terminated_length": 71.640625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3980875611305237, "epoch": 0.24424778761061947, "frac_reward_zero_std": 1.0, "grad_norm": 0.016564111690985255, "kl": 0.0008818708593025804, "learning_rate": 6.061946902654867e-07, "loss": 0.0, "num_tokens": 2173097.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4036502838134766, "sampling/importance_sampling_ratio/mean": 1.0016671419143677, "sampling/importance_sampling_ratio/min": 0.6285651922225952, "sampling/sampling_logp_difference/max": 0.4643155336380005, "sampling/sampling_logp_difference/mean": 0.01826133392751217, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 81.828125, "completions/mean_terminated_length": 81.828125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.4850948452949524, "epoch": 0.24601769911504426, "frac_reward_zero_std": 1.0, "grad_norm": 0.01713387326277667, "kl": 0.001043393975123763, "learning_rate": 6.106194690265486e-07, "loss": 0.0, "num_tokens": 2188382.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.910315752029419, "sampling/importance_sampling_ratio/mean": 0.9982123374938965, "sampling/importance_sampling_ratio/min": 0.5476705431938171, "sampling/sampling_logp_difference/max": 0.647268533706665, "sampling/sampling_logp_difference/mean": 0.020284462720155716, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 99.71875, "completions/mean_terminated_length": 99.71875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3546068072319031, "epoch": 0.24778761061946902, "frac_reward_zero_std": 1.0, "grad_norm": 0.01725404162883253, "kl": 0.0017146074678748846, "learning_rate": 6.150442477876105e-07, "loss": 0.0, "num_tokens": 2204524.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3572512865066528, "sampling/importance_sampling_ratio/mean": 1.0001609325408936, "sampling/importance_sampling_ratio/min": 0.4417930245399475, "sampling/sampling_logp_difference/max": 0.8169138431549072, "sampling/sampling_logp_difference/mean": 0.01794680394232273, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 35.140625, "completions/mean_terminated_length": 35.140625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.18727999925613403, "epoch": 0.24955752212389382, "frac_reward_zero_std": 1.0, "grad_norm": 0.09529720865059939, "kl": 0.0010563858086243272, "learning_rate": 6.194690265486725e-07, "loss": 0.0, "num_tokens": 2216885.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9989814758300781, "sampling/importance_sampling_ratio/min": 0.7282373905181885, "sampling/sampling_logp_difference/max": 0.814521074295044, "sampling/sampling_logp_difference/mean": 0.018082180991768837, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.328125, "completions/mean_terminated_length": 16.328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.06842117011547089, "epoch": 0.2513274336283186, "frac_reward_zero_std": 0.75, "grad_norm": 9.075752721630387, "kl": 0.004978060256689787, "learning_rate": 6.238938053097345e-07, "loss": -0.0013, "num_tokens": 2230346.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.266401767730713, "sampling/importance_sampling_ratio/mean": 1.0020692348480225, "sampling/importance_sampling_ratio/min": 0.6714641451835632, "sampling/sampling_logp_difference/max": 0.39829468727111816, "sampling/sampling_logp_difference/mean": 0.010533824563026428, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 74.671875, "completions/mean_terminated_length": 74.671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3647806644439697, "epoch": 0.25309734513274335, "frac_reward_zero_std": 1.0, "grad_norm": 0.015980370154043362, "kl": 0.0007940245559439063, "learning_rate": 6.283185840707964e-07, "loss": 0.0, "num_tokens": 2245157.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.424361228942871, "sampling/importance_sampling_ratio/mean": 1.000733494758606, "sampling/importance_sampling_ratio/min": 0.7051947116851807, "sampling/sampling_logp_difference/max": 0.353723406791687, "sampling/sampling_logp_difference/mean": 0.015422477386891842, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 141.390625, "completions/mean_terminated_length": 141.390625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5192233324050903, "epoch": 0.25486725663716814, "frac_reward_zero_std": 0.5, "grad_norm": 10.03788217298924, "kl": 0.0028998591005802155, "learning_rate": 6.327433628318584e-07, "loss": -0.0586, "num_tokens": 2262686.0, "reward": -0.15625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6302465200424194, "sampling/importance_sampling_ratio/mean": 0.9998656511306763, "sampling/importance_sampling_ratio/min": 0.5490341782569885, "sampling/sampling_logp_difference/max": 0.5995945930480957, "sampling/sampling_logp_difference/mean": 0.019007448107004166, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 101.109375, "completions/mean_terminated_length": 101.109375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3655405640602112, "epoch": 0.25663716814159293, "frac_reward_zero_std": 0.75, "grad_norm": 6.607652442786013, "kl": 0.002844990696758032, "learning_rate": 6.371681415929203e-07, "loss": 0.0627, "num_tokens": 2279573.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995540380477905, "sampling/importance_sampling_ratio/min": 0.6338145136833191, "sampling/sampling_logp_difference/max": 1.1002683639526367, "sampling/sampling_logp_difference/mean": 0.021086499094963074, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 48.328125, "completions/mean_terminated_length": 48.328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2803422212600708, "epoch": 0.2584070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.02802108714282455, "kl": 0.0013906147796660662, "learning_rate": 6.415929203539822e-07, "loss": 0.0, "num_tokens": 2295178.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2905248403549194, "sampling/importance_sampling_ratio/mean": 0.9989687204360962, "sampling/importance_sampling_ratio/min": 0.4375634789466858, "sampling/sampling_logp_difference/max": 0.8265335559844971, "sampling/sampling_logp_difference/mean": 0.015749461948871613, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 62.203125, "completions/mean_terminated_length": 62.203125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2617567479610443, "epoch": 0.26017699115044246, "frac_reward_zero_std": 1.0, "grad_norm": 0.024533085491038015, "kl": 0.001395686762407422, "learning_rate": 6.460176991150442e-07, "loss": 0.0, "num_tokens": 2316247.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5753140449523926, "sampling/importance_sampling_ratio/mean": 0.9995524287223816, "sampling/importance_sampling_ratio/min": 0.7271898984909058, "sampling/sampling_logp_difference/max": 0.4544546604156494, "sampling/sampling_logp_difference/mean": 0.011754988692700863, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.84375, "completions/mean_terminated_length": 15.84375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.07275533676147461, "epoch": 0.26194690265486725, "frac_reward_zero_std": 1.0, "grad_norm": 0.09102567414694034, "kl": 0.0017056685173884034, "learning_rate": 6.504424778761062e-07, "loss": 0.0, "num_tokens": 2330141.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.197696566581726, "sampling/importance_sampling_ratio/mean": 1.000204086303711, "sampling/importance_sampling_ratio/min": 0.8154329061508179, "sampling/sampling_logp_difference/max": 0.20403611660003662, "sampling/sampling_logp_difference/mean": 0.0072796279564499855, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 38.5625, "completions/mean_terminated_length": 38.5625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.14959517121315002, "epoch": 0.26371681415929205, "frac_reward_zero_std": 0.75, "grad_norm": 12.158302193342701, "kl": 0.009750444442033768, "learning_rate": 6.548672566371681e-07, "loss": -0.1449, "num_tokens": 2343185.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7331476211547852, "sampling/importance_sampling_ratio/mean": 0.9996535778045654, "sampling/importance_sampling_ratio/min": 0.255138635635376, "sampling/sampling_logp_difference/max": 1.36594820022583, "sampling/sampling_logp_difference/mean": 0.016084929928183556, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 35.59375, "completions/mean_terminated_length": 35.59375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1963880956172943, "epoch": 0.26548672566371684, "frac_reward_zero_std": 0.75, "grad_norm": 7.1218924747110615, "kl": 0.005532016046345234, "learning_rate": 6.592920353982301e-07, "loss": -0.1457, "num_tokens": 2358039.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004479885101318, "sampling/importance_sampling_ratio/min": 0.5923722982406616, "sampling/sampling_logp_difference/max": 1.9722204208374023, "sampling/sampling_logp_difference/mean": 0.013359785079956055, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 78.109375, "completions/mean_terminated_length": 78.109375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2529459297657013, "epoch": 0.2672566371681416, "frac_reward_zero_std": 1.0, "grad_norm": 0.044873586553780806, "kl": 0.005619180388748646, "learning_rate": 6.637168141592921e-07, "loss": 0.0, "num_tokens": 2373422.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9478777647018433, "sampling/importance_sampling_ratio/mean": 1.0008846521377563, "sampling/importance_sampling_ratio/min": 0.45570361614227295, "sampling/sampling_logp_difference/max": 0.7859126925468445, "sampling/sampling_logp_difference/mean": 0.015898194164037704, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 65.640625, "completions/mean_terminated_length": 65.640625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.30609267950057983, "epoch": 0.26902654867256637, "frac_reward_zero_std": 0.75, "grad_norm": 11.061460167266631, "kl": 0.010558098554611206, "learning_rate": 6.68141592920354e-07, "loss": -0.0973, "num_tokens": 2389335.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5556261539459229, "sampling/importance_sampling_ratio/mean": 1.0001894235610962, "sampling/importance_sampling_ratio/min": 0.572333037853241, "sampling/sampling_logp_difference/max": 0.5580341815948486, "sampling/sampling_logp_difference/mean": 0.014833297580480576, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 36.953125, "completions/mean_terminated_length": 36.953125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.216139018535614, "epoch": 0.27079646017699116, "frac_reward_zero_std": 1.0, "grad_norm": 0.14097604253434792, "kl": 0.017395276576280594, "learning_rate": 6.72566371681416e-07, "loss": 0.0001, "num_tokens": 2404404.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.879258394241333, "sampling/importance_sampling_ratio/mean": 0.9984070062637329, "sampling/importance_sampling_ratio/min": 0.6819742321968079, "sampling/sampling_logp_difference/max": 0.6308772563934326, "sampling/sampling_logp_difference/mean": 0.012710087932646275, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 48.265625, "completions/mean_terminated_length": 48.265625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1130940243601799, "epoch": 0.27256637168141595, "frac_reward_zero_std": 1.0, "grad_norm": 0.12575044725568155, "kl": 0.007728739641606808, "learning_rate": 6.769911504424779e-07, "loss": 0.0, "num_tokens": 2420661.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4918367862701416, "sampling/importance_sampling_ratio/mean": 0.9995026588439941, "sampling/importance_sampling_ratio/min": 0.6138349175453186, "sampling/sampling_logp_difference/max": 0.48802924156188965, "sampling/sampling_logp_difference/mean": 0.020763784646987915, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 59.953125, "completions/mean_terminated_length": 59.953125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.21759197115898132, "epoch": 0.2743362831858407, "frac_reward_zero_std": 0.5, "grad_norm": 18.00910866136154, "kl": 0.010943894274532795, "learning_rate": 6.814159292035397e-07, "loss": -0.221, "num_tokens": 2433986.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.969313383102417, "sampling/importance_sampling_ratio/mean": 1.002643346786499, "sampling/importance_sampling_ratio/min": 0.538426399230957, "sampling/sampling_logp_difference/max": 0.677685022354126, "sampling/sampling_logp_difference/mean": 0.024278851225972176, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 41.84375, "completions/mean_terminated_length": 41.84375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.24851931631565094, "epoch": 0.2761061946902655, "frac_reward_zero_std": 0.75, "grad_norm": 19.56733698517216, "kl": 0.0038888424169272184, "learning_rate": 6.858407079646017e-07, "loss": -0.2139, "num_tokens": 2447512.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.663854956626892, "sampling/importance_sampling_ratio/mean": 1.0001685619354248, "sampling/importance_sampling_ratio/min": 0.46025779843330383, "sampling/sampling_logp_difference/max": 0.7759685516357422, "sampling/sampling_logp_difference/mean": 0.012126958929002285, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 82.734375, "completions/mean_terminated_length": 82.734375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.24554727971553802, "epoch": 0.2778761061946903, "frac_reward_zero_std": 1.0, "grad_norm": 0.02984062761107293, "kl": 0.001689649187028408, "learning_rate": 6.902654867256636e-07, "loss": 0.0, "num_tokens": 2464983.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4411574602127075, "sampling/importance_sampling_ratio/mean": 1.000152349472046, "sampling/importance_sampling_ratio/min": 0.6802230477333069, "sampling/sampling_logp_difference/max": 0.38533449172973633, "sampling/sampling_logp_difference/mean": 0.013160878792405128, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 88.640625, "completions/mean_terminated_length": 88.640625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2844574451446533, "epoch": 0.27964601769911507, "frac_reward_zero_std": 0.5, "grad_norm": 8.589166739359197, "kl": 0.0038004093803465366, "learning_rate": 6.946902654867256e-07, "loss": 0.0108, "num_tokens": 2481360.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5664699077606201, "sampling/importance_sampling_ratio/mean": 0.9985457062721252, "sampling/importance_sampling_ratio/min": 0.5478703379631042, "sampling/sampling_logp_difference/max": 0.6017166376113892, "sampling/sampling_logp_difference/mean": 0.017418205738067627, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 29.828125, "completions/mean_terminated_length": 29.828125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.13992829620838165, "epoch": 0.2814159292035398, "frac_reward_zero_std": 0.75, "grad_norm": 22.822138577571543, "kl": 0.007147709373384714, "learning_rate": 6.991150442477876e-07, "loss": -0.2478, "num_tokens": 2494325.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.9513019323349, "sampling/importance_sampling_ratio/mean": 0.9980819225311279, "sampling/importance_sampling_ratio/min": 0.49336642026901245, "sampling/sampling_logp_difference/max": 0.70650315284729, "sampling/sampling_logp_difference/mean": 0.01531197689473629, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 36.28125, "completions/mean_terminated_length": 36.28125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1655530482530594, "epoch": 0.2831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 9.220842673661625, "kl": 0.005232499912381172, "learning_rate": 7.035398230088495e-07, "loss": 0.0035, "num_tokens": 2508055.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6507823467254639, "sampling/importance_sampling_ratio/mean": 1.0039724111557007, "sampling/importance_sampling_ratio/min": 0.5558677315711975, "sampling/sampling_logp_difference/max": 0.5872249603271484, "sampling/sampling_logp_difference/mean": 0.01936100609600544, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 37.234375, "completions/mean_terminated_length": 37.234375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.14813996851444244, "epoch": 0.2849557522123894, "frac_reward_zero_std": 1.0, "grad_norm": 0.06153866209942643, "kl": 0.0026678910944610834, "learning_rate": 7.079646017699115e-07, "loss": 0.0, "num_tokens": 2521126.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.862612009048462, "sampling/importance_sampling_ratio/mean": 0.9947205781936646, "sampling/importance_sampling_ratio/min": 0.5880116820335388, "sampling/sampling_logp_difference/max": 0.621979832649231, "sampling/sampling_logp_difference/mean": 0.024796247482299805, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 97.875, "completions/mean_terminated_length": 97.875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4262969493865967, "epoch": 0.2867256637168142, "frac_reward_zero_std": 1.0, "grad_norm": 0.013023578679145853, "kl": 0.0014449837617576122, "learning_rate": 7.123893805309734e-07, "loss": 0.0, "num_tokens": 2538334.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.434495449066162, "sampling/importance_sampling_ratio/mean": 0.9998641014099121, "sampling/importance_sampling_ratio/min": 0.613407552242279, "sampling/sampling_logp_difference/max": 0.4887256622314453, "sampling/sampling_logp_difference/mean": 0.017236698418855667, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 48.375, "completions/mean_terminated_length": 48.375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.21288928389549255, "epoch": 0.2884955752212389, "frac_reward_zero_std": 0.75, "grad_norm": 15.800928761698737, "kl": 0.004314340651035309, "learning_rate": 7.168141592920353e-07, "loss": 0.37, "num_tokens": 2554374.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3523565530776978, "sampling/importance_sampling_ratio/mean": 1.0010709762573242, "sampling/importance_sampling_ratio/min": 0.6108425259590149, "sampling/sampling_logp_difference/max": 0.4929161071777344, "sampling/sampling_logp_difference/mean": 0.012879086658358574, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 97.3125, "completions/mean_terminated_length": 97.3125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.21863287687301636, "epoch": 0.2902654867256637, "frac_reward_zero_std": 1.0, "grad_norm": 0.01218772305508651, "kl": 0.0015893366653472185, "learning_rate": 7.212389380530973e-07, "loss": 0.0, "num_tokens": 2569834.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000678539276123, "sampling/importance_sampling_ratio/min": 0.5868110060691833, "sampling/sampling_logp_difference/max": 0.9002819061279297, "sampling/sampling_logp_difference/mean": 0.012089526280760765, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 85.046875, "completions/mean_terminated_length": 85.046875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.303574800491333, "epoch": 0.2920353982300885, "frac_reward_zero_std": 0.5, "grad_norm": 5.2945564536762735, "kl": 0.003153447061777115, "learning_rate": 7.256637168141593e-07, "loss": 0.1266, "num_tokens": 2587805.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7874300479888916, "sampling/importance_sampling_ratio/mean": 0.9995529651641846, "sampling/importance_sampling_ratio/min": 0.6549264788627625, "sampling/sampling_logp_difference/max": 0.5807788372039795, "sampling/sampling_logp_difference/mean": 0.020131688565015793, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 62.4375, "completions/mean_terminated_length": 62.4375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3352992534637451, "epoch": 0.2938053097345133, "frac_reward_zero_std": 1.0, "grad_norm": 0.05527729473556852, "kl": 0.006250219885259867, "learning_rate": 7.300884955752212e-07, "loss": 0.0, "num_tokens": 2603433.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.315501093864441, "sampling/importance_sampling_ratio/mean": 0.9993723630905151, "sampling/importance_sampling_ratio/min": 0.65906822681427, "sampling/sampling_logp_difference/max": 0.4169282913208008, "sampling/sampling_logp_difference/mean": 0.01699444092810154, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 122.328125, "completions/mean_terminated_length": 122.328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.34051042795181274, "epoch": 0.29557522123893804, "frac_reward_zero_std": 0.75, "grad_norm": 3.2065456771565835, "kl": 0.004882376175373793, "learning_rate": 7.345132743362832e-07, "loss": 0.1907, "num_tokens": 2620878.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4985473155975342, "sampling/importance_sampling_ratio/mean": 0.9992001056671143, "sampling/importance_sampling_ratio/min": 0.5491763353347778, "sampling/sampling_logp_difference/max": 0.5993356704711914, "sampling/sampling_logp_difference/mean": 0.015716129913926125, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 83.671875, "completions/mean_terminated_length": 83.671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4155896306037903, "epoch": 0.2973451327433628, "frac_reward_zero_std": 1.0, "grad_norm": 0.02412913910928674, "kl": 0.002413155511021614, "learning_rate": 7.389380530973452e-07, "loss": 0.0, "num_tokens": 2635865.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5505778789520264, "sampling/importance_sampling_ratio/mean": 0.9991369247436523, "sampling/importance_sampling_ratio/min": 0.5488505959510803, "sampling/sampling_logp_difference/max": 0.5999289751052856, "sampling/sampling_logp_difference/mean": 0.018513940274715424, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 95.734375, "completions/mean_terminated_length": 95.734375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.31833508610725403, "epoch": 0.2991150442477876, "frac_reward_zero_std": 1.0, "grad_norm": 0.018637248678998186, "kl": 0.0012480122968554497, "learning_rate": 7.433628318584071e-07, "loss": 0.0, "num_tokens": 2652520.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5856602191925049, "sampling/importance_sampling_ratio/mean": 1.0013822317123413, "sampling/importance_sampling_ratio/min": 0.573755145072937, "sampling/sampling_logp_difference/max": 0.55555260181427, "sampling/sampling_logp_difference/mean": 0.015990041196346283, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 66.3125, "completions/mean_terminated_length": 66.3125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.36399298906326294, "epoch": 0.3008849557522124, "frac_reward_zero_std": 0.5, "grad_norm": 11.85054564249182, "kl": 0.0069623468443751335, "learning_rate": 7.477876106194691e-07, "loss": 0.3686, "num_tokens": 2670876.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.423884391784668, "sampling/importance_sampling_ratio/mean": 0.999387264251709, "sampling/importance_sampling_ratio/min": 0.5508221983909607, "sampling/sampling_logp_difference/max": 0.5963431596755981, "sampling/sampling_logp_difference/mean": 0.01819230616092682, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 65.265625, "completions/mean_terminated_length": 65.265625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3241598606109619, "epoch": 0.30265486725663715, "frac_reward_zero_std": 1.0, "grad_norm": 0.03064381044639263, "kl": 0.0015449854545295238, "learning_rate": 7.522123893805308e-07, "loss": 0.0, "num_tokens": 2690733.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9951161742210388, "sampling/importance_sampling_ratio/min": 0.5531994104385376, "sampling/sampling_logp_difference/max": 0.7696962356567383, "sampling/sampling_logp_difference/mean": 0.02130197361111641, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 60.546875, "completions/mean_terminated_length": 60.546875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.34832802414894104, "epoch": 0.30442477876106194, "frac_reward_zero_std": 1.0, "grad_norm": 0.01942212919039001, "kl": 0.0010704933665692806, "learning_rate": 7.566371681415928e-07, "loss": 0.0, "num_tokens": 2710400.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3826576471328735, "sampling/importance_sampling_ratio/mean": 0.9970792531967163, "sampling/importance_sampling_ratio/min": 0.6549003720283508, "sampling/sampling_logp_difference/max": 0.42327213287353516, "sampling/sampling_logp_difference/mean": 0.02165486104786396, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 99.46875, "completions/mean_terminated_length": 99.46875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.5507562160491943, "epoch": 0.30619469026548674, "frac_reward_zero_std": 0.75, "grad_norm": 3.2067772660198575, "kl": 0.0015644296072423458, "learning_rate": 7.610619469026548e-07, "loss": 0.1168, "num_tokens": 2725870.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6677830219268799, "sampling/importance_sampling_ratio/mean": 0.9991893768310547, "sampling/importance_sampling_ratio/min": 0.3073960840702057, "sampling/sampling_logp_difference/max": 1.1796181201934814, "sampling/sampling_logp_difference/mean": 0.0195973701775074, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 65.59375, "completions/mean_terminated_length": 65.59375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2421538382768631, "epoch": 0.30796460176991153, "frac_reward_zero_std": 1.0, "grad_norm": 0.029068109786864894, "kl": 0.0013104917015880346, "learning_rate": 7.654867256637167e-07, "loss": 0.0, "num_tokens": 2740260.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995702505111694, "sampling/importance_sampling_ratio/min": 0.6254555583000183, "sampling/sampling_logp_difference/max": 0.7899044752120972, "sampling/sampling_logp_difference/mean": 0.022774621844291687, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 75.921875, "completions/mean_terminated_length": 75.921875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3197605013847351, "epoch": 0.30973451327433627, "frac_reward_zero_std": 0.75, "grad_norm": 3.6400337164230887, "kl": 0.0015748351579532027, "learning_rate": 7.699115044247787e-07, "loss": -0.0426, "num_tokens": 2753055.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5528416633605957, "sampling/importance_sampling_ratio/mean": 0.9991353750228882, "sampling/importance_sampling_ratio/min": 0.6129763126373291, "sampling/sampling_logp_difference/max": 0.4894289970397949, "sampling/sampling_logp_difference/mean": 0.013405116274952888, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1282852590084076, "epoch": 0.31150442477876106, "frac_reward_zero_std": 0.75, "grad_norm": 5.965701958805946, "kl": 0.0016409424133598804, "learning_rate": 7.743362831858407e-07, "loss": 0.0365, "num_tokens": 2766895.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.306551456451416, "sampling/importance_sampling_ratio/mean": 1.00221848487854, "sampling/importance_sampling_ratio/min": 0.6547200679779053, "sampling/sampling_logp_difference/max": 0.42354750633239746, "sampling/sampling_logp_difference/mean": 0.012759683653712273, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 86.859375, "completions/mean_terminated_length": 86.859375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.3257979154586792, "epoch": 0.31327433628318585, "frac_reward_zero_std": 0.75, "grad_norm": 2.1971454945435758, "kl": 0.0011299046454951167, "learning_rate": 7.787610619469026e-07, "loss": 0.0277, "num_tokens": 2784630.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.28737211227417, "sampling/importance_sampling_ratio/mean": 1.0015079975128174, "sampling/importance_sampling_ratio/min": 0.6338574886322021, "sampling/sampling_logp_difference/max": 0.4559311866760254, "sampling/sampling_logp_difference/mean": 0.014637185260653496, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 35.921875, "completions/mean_terminated_length": 35.921875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.24986636638641357, "epoch": 0.31504424778761064, "frac_reward_zero_std": 0.75, "grad_norm": 7.890195329991731, "kl": 0.006909585557878017, "learning_rate": 7.831858407079646e-07, "loss": 0.2545, "num_tokens": 2797281.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.8822228908538818, "sampling/importance_sampling_ratio/mean": 0.999498724937439, "sampling/importance_sampling_ratio/min": 0.277087539434433, "sampling/sampling_logp_difference/max": 1.2834217548370361, "sampling/sampling_logp_difference/mean": 0.01401694305241108, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 71.921875, "completions/mean_terminated_length": 71.921875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3881117105484009, "epoch": 0.3168141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0174497927871592, "kl": 0.001206007320433855, "learning_rate": 7.876106194690266e-07, "loss": 0.0, "num_tokens": 2812412.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.54652738571167, "sampling/importance_sampling_ratio/mean": 1.0000944137573242, "sampling/importance_sampling_ratio/min": 0.4926236569881439, "sampling/sampling_logp_difference/max": 0.7080097198486328, "sampling/sampling_logp_difference/mean": 0.01670251041650772, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 81.234375, "completions/mean_terminated_length": 81.234375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.37518447637557983, "epoch": 0.3185840707964602, "frac_reward_zero_std": 0.75, "grad_norm": 11.298833240783317, "kl": 0.0012631581630557775, "learning_rate": 7.920353982300884e-07, "loss": 0.1495, "num_tokens": 2829723.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9986304640769958, "sampling/importance_sampling_ratio/min": 0.6893054246902466, "sampling/sampling_logp_difference/max": 0.7487752437591553, "sampling/sampling_logp_difference/mean": 0.01565636694431305, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 68.59375, "completions/mean_terminated_length": 68.59375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.19516851007938385, "epoch": 0.32035398230088497, "frac_reward_zero_std": 1.0, "grad_norm": 0.022769574972825203, "kl": 0.0016036881133913994, "learning_rate": 7.964601769911504e-07, "loss": 0.0, "num_tokens": 2843313.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.887073040008545, "sampling/importance_sampling_ratio/mean": 0.9991527795791626, "sampling/importance_sampling_ratio/min": 0.5863591432571411, "sampling/sampling_logp_difference/max": 0.6350269317626953, "sampling/sampling_logp_difference/mean": 0.019246196374297142, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 39.78125, "completions/mean_terminated_length": 39.78125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2743300795555115, "epoch": 0.32212389380530976, "frac_reward_zero_std": 1.0, "grad_norm": 0.03835264627824913, "kl": 0.0019648310262709856, "learning_rate": 8.008849557522124e-07, "loss": 0.0, "num_tokens": 2858579.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.516083836555481, "sampling/importance_sampling_ratio/mean": 1.0001496076583862, "sampling/importance_sampling_ratio/min": 0.688202440738678, "sampling/sampling_logp_difference/max": 0.4161306619644165, "sampling/sampling_logp_difference/mean": 0.014625566080212593, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 90.515625, "completions/mean_terminated_length": 90.515625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.40749549865722656, "epoch": 0.3238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.021686218872358942, "kl": 0.0016722225118428469, "learning_rate": 8.053097345132743e-07, "loss": 0.0, "num_tokens": 2875172.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6169039011001587, "sampling/importance_sampling_ratio/mean": 1.0007729530334473, "sampling/importance_sampling_ratio/min": 0.6497799158096313, "sampling/sampling_logp_difference/max": 0.4805130958557129, "sampling/sampling_logp_difference/mean": 0.014663636684417725, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 96.640625, "completions/mean_terminated_length": 96.640625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4288424253463745, "epoch": 0.3256637168141593, "frac_reward_zero_std": 1.0, "grad_norm": 0.032833059064123725, "kl": 0.0040725404396653175, "learning_rate": 8.097345132743363e-07, "loss": 0.0, "num_tokens": 2891581.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008609294891357, "sampling/importance_sampling_ratio/min": 0.6394206285476685, "sampling/sampling_logp_difference/max": 0.7237358093261719, "sampling/sampling_logp_difference/mean": 0.02272997982800007, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 82.953125, "completions/mean_terminated_length": 82.953125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4191206097602844, "epoch": 0.3274336283185841, "frac_reward_zero_std": 1.0, "grad_norm": 0.017823889993753107, "kl": 0.0016019068425521255, "learning_rate": 8.141592920353983e-07, "loss": 0.0, "num_tokens": 2906906.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3651630878448486, "sampling/importance_sampling_ratio/mean": 0.998401403427124, "sampling/importance_sampling_ratio/min": 0.6447701454162598, "sampling/sampling_logp_difference/max": 0.4388613700866699, "sampling/sampling_logp_difference/mean": 0.020062603056430817, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 67.65625, "completions/mean_terminated_length": 67.65625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3816743493080139, "epoch": 0.3292035398230089, "frac_reward_zero_std": 1.0, "grad_norm": 0.018024805681682764, "kl": 0.0013134465552866459, "learning_rate": 8.185840707964602e-07, "loss": 0.0, "num_tokens": 2922388.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.706222653388977, "sampling/importance_sampling_ratio/mean": 0.9995259046554565, "sampling/importance_sampling_ratio/min": 0.5038576722145081, "sampling/sampling_logp_difference/max": 0.6854615211486816, "sampling/sampling_logp_difference/mean": 0.016284022480249405, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 88.015625, "completions/mean_terminated_length": 88.015625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.4469081163406372, "epoch": 0.3309734513274336, "frac_reward_zero_std": 0.5, "grad_norm": 6.371433628774608, "kl": 0.00285714166238904, "learning_rate": 8.230088495575221e-07, "loss": -0.0155, "num_tokens": 2938437.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4655020236968994, "sampling/importance_sampling_ratio/mean": 0.9996187686920166, "sampling/importance_sampling_ratio/min": 0.6309859752655029, "sampling/sampling_logp_difference/max": 0.46047163009643555, "sampling/sampling_logp_difference/mean": 0.01801963523030281, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 96.90625, "completions/mean_terminated_length": 96.90625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.21593278646469116, "epoch": 0.3327433628318584, "frac_reward_zero_std": 0.75, "grad_norm": 4.2014978144866415, "kl": 0.0016711852513253689, "learning_rate": 8.274336283185839e-07, "loss": 0.0088, "num_tokens": 2955631.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4248566627502441, "sampling/importance_sampling_ratio/mean": 1.0010021924972534, "sampling/importance_sampling_ratio/min": 0.3327132761478424, "sampling/sampling_logp_difference/max": 1.1004741191864014, "sampling/sampling_logp_difference/mean": 0.015623951330780983, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 96.140625, "completions/mean_terminated_length": 96.140625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.45695868134498596, "epoch": 0.3345132743362832, "frac_reward_zero_std": 0.75, "grad_norm": 3.4257601631699997, "kl": 0.0024920753203332424, "learning_rate": 8.318584070796459e-07, "loss": 0.0294, "num_tokens": 2972056.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6508358716964722, "sampling/importance_sampling_ratio/mean": 1.0013859272003174, "sampling/importance_sampling_ratio/min": 0.5403315424919128, "sampling/sampling_logp_difference/max": 0.615572452545166, "sampling/sampling_logp_difference/mean": 0.019048312678933144, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 37.03125, "completions/mean_terminated_length": 37.03125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.13859203457832336, "epoch": 0.336283185840708, "frac_reward_zero_std": 0.75, "grad_norm": 8.097821127529105, "kl": 0.005911725573241711, "learning_rate": 8.362831858407079e-07, "loss": 0.2139, "num_tokens": 2985642.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4755970239639282, "sampling/importance_sampling_ratio/mean": 1.0006300210952759, "sampling/importance_sampling_ratio/min": 0.6624336838722229, "sampling/sampling_logp_difference/max": 0.41183483600616455, "sampling/sampling_logp_difference/mean": 0.01723603904247284, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 53.203125, "completions/mean_terminated_length": 53.203125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3490672707557678, "epoch": 0.3380530973451327, "frac_reward_zero_std": 1.0, "grad_norm": 0.07868897743366499, "kl": 0.005042393691837788, "learning_rate": 8.407079646017698e-07, "loss": 0.0, "num_tokens": 2999239.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3826329708099365, "sampling/importance_sampling_ratio/mean": 0.9992291927337646, "sampling/importance_sampling_ratio/min": 0.5984382033348083, "sampling/sampling_logp_difference/max": 0.5134320259094238, "sampling/sampling_logp_difference/mean": 0.021959085017442703, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 71.734375, "completions/mean_terminated_length": 71.734375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.307967871427536, "epoch": 0.3398230088495575, "frac_reward_zero_std": 1.0, "grad_norm": 0.025977733696619907, "kl": 0.0022195237688720226, "learning_rate": 8.451327433628318e-07, "loss": 0.0, "num_tokens": 3016038.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5421792268753052, "sampling/importance_sampling_ratio/mean": 1.0004302263259888, "sampling/importance_sampling_ratio/min": 0.5187493562698364, "sampling/sampling_logp_difference/max": 0.656334400177002, "sampling/sampling_logp_difference/mean": 0.012343363836407661, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 65.75, "completions/mean_terminated_length": 65.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3567613661289215, "epoch": 0.3415929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0460271782944802, "kl": 0.016741083934903145, "learning_rate": 8.495575221238938e-07, "loss": 0.0001, "num_tokens": 3031846.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0010594129562378, "sampling/importance_sampling_ratio/min": 0.5105498433113098, "sampling/sampling_logp_difference/max": 0.7371149063110352, "sampling/sampling_logp_difference/mean": 0.019204776734113693, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 66.328125, "completions/mean_terminated_length": 66.328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3064947724342346, "epoch": 0.3433628318584071, "frac_reward_zero_std": 0.75, "grad_norm": 5.828728528219838, "kl": 0.003697988111525774, "learning_rate": 8.539823008849557e-07, "loss": -0.1107, "num_tokens": 3046859.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001063346862793, "sampling/importance_sampling_ratio/min": 0.6153205633163452, "sampling/sampling_logp_difference/max": 0.8466815948486328, "sampling/sampling_logp_difference/mean": 0.01753907836973667, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 95.890625, "completions/mean_terminated_length": 95.890625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.45068109035491943, "epoch": 0.34513274336283184, "frac_reward_zero_std": 1.0, "grad_norm": 0.027973118552110467, "kl": 0.002571109915152192, "learning_rate": 8.584070796460177e-07, "loss": 0.0, "num_tokens": 3062788.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000136137008667, "sampling/importance_sampling_ratio/min": 0.6952780485153198, "sampling/sampling_logp_difference/max": 0.8257927894592285, "sampling/sampling_logp_difference/mean": 0.015602972358465195, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 47.3125, "completions/mean_terminated_length": 47.3125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.29853731393814087, "epoch": 0.34690265486725663, "frac_reward_zero_std": 1.0, "grad_norm": 0.07086573487077374, "kl": 0.035713452845811844, "learning_rate": 8.628318584070797e-07, "loss": 0.0001, "num_tokens": 3076456.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4578688144683838, "sampling/importance_sampling_ratio/mean": 0.9997751712799072, "sampling/importance_sampling_ratio/min": 0.3986743092536926, "sampling/sampling_logp_difference/max": 0.9196105003356934, "sampling/sampling_logp_difference/mean": 0.020105183124542236, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 50.34375, "completions/mean_terminated_length": 50.34375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3566172122955322, "epoch": 0.3486725663716814, "frac_reward_zero_std": 1.0, "grad_norm": 0.050854542449691606, "kl": 0.02495628222823143, "learning_rate": 8.672566371681415e-07, "loss": 0.0001, "num_tokens": 3090718.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6565725803375244, "sampling/importance_sampling_ratio/mean": 1.0001130104064941, "sampling/importance_sampling_ratio/min": 0.6099073886871338, "sampling/sampling_logp_difference/max": 0.5047507286071777, "sampling/sampling_logp_difference/mean": 0.017076190561056137, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 88.65625, "completions/mean_terminated_length": 88.65625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.6213746070861816, "epoch": 0.3504424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 12.629178819001702, "kl": 0.003796565579250455, "learning_rate": 8.716814159292035e-07, "loss": 0.0921, "num_tokens": 3106456.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5601270198822021, "sampling/importance_sampling_ratio/mean": 0.9996728897094727, "sampling/importance_sampling_ratio/min": 0.6842789053916931, "sampling/sampling_logp_difference/max": 0.4447672367095947, "sampling/sampling_logp_difference/mean": 0.021474361419677734, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 84.3125, "completions/mean_terminated_length": 84.3125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3975464701652527, "epoch": 0.35221238938053095, "frac_reward_zero_std": 0.75, "grad_norm": 6.019198297739022, "kl": 0.007026151288300753, "learning_rate": 8.761061946902655e-07, "loss": 0.2102, "num_tokens": 3122780.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.674494743347168, "sampling/importance_sampling_ratio/mean": 1.0001417398452759, "sampling/importance_sampling_ratio/min": 0.3294057250022888, "sampling/sampling_logp_difference/max": 1.1104650497436523, "sampling/sampling_logp_difference/mean": 0.020129112526774406, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 40.71875, "completions/mean_terminated_length": 40.71875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.19297020137310028, "epoch": 0.35398230088495575, "frac_reward_zero_std": 1.0, "grad_norm": 0.03618304098894548, "kl": 0.001933026360347867, "learning_rate": 8.805309734513274e-07, "loss": 0.0, "num_tokens": 3143066.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5666741132736206, "sampling/importance_sampling_ratio/mean": 0.9981307983398438, "sampling/importance_sampling_ratio/min": 0.40572628378868103, "sampling/sampling_logp_difference/max": 0.9020766019821167, "sampling/sampling_logp_difference/mean": 0.021881964057683945, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.6875, "completions/mean_terminated_length": 13.6875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.06337769329547882, "epoch": 0.35575221238938054, "frac_reward_zero_std": 1.0, "grad_norm": 0.3730626436437873, "kl": 0.005040082149207592, "learning_rate": 8.849557522123894e-07, "loss": 0.0001, "num_tokens": 3153878.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5257240533828735, "sampling/importance_sampling_ratio/mean": 0.9977389574050903, "sampling/importance_sampling_ratio/min": 0.4783879518508911, "sampling/sampling_logp_difference/max": 0.7373332977294922, "sampling/sampling_logp_difference/mean": 0.012821320444345474, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 20.921875, "completions/mean_terminated_length": 20.921875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.16319364309310913, "epoch": 0.35752212389380533, "frac_reward_zero_std": 0.75, "grad_norm": 29.22328448711984, "kl": 0.04330842196941376, "learning_rate": 8.893805309734513e-07, "loss": -0.4998, "num_tokens": 3168065.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6684596538543701, "sampling/importance_sampling_ratio/mean": 0.9991862177848816, "sampling/importance_sampling_ratio/min": 0.6727373003959656, "sampling/sampling_logp_difference/max": 0.5119008421897888, "sampling/sampling_logp_difference/mean": 0.022439446300268173, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 38.34375, "completions/mean_terminated_length": 38.34375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.19670294225215912, "epoch": 0.35929203539823007, "frac_reward_zero_std": 1.0, "grad_norm": 0.1867018254244633, "kl": 0.03805701807141304, "learning_rate": 8.938053097345132e-07, "loss": 0.0002, "num_tokens": 3180439.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3781383037567139, "sampling/importance_sampling_ratio/mean": 1.00034761428833, "sampling/importance_sampling_ratio/min": 0.6061520576477051, "sampling/sampling_logp_difference/max": 0.500624418258667, "sampling/sampling_logp_difference/mean": 0.016402604058384895, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 16.984375, "completions/mean_terminated_length": 16.984375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.08758561313152313, "epoch": 0.36106194690265486, "frac_reward_zero_std": 0.75, "grad_norm": 67.96663604069774, "kl": 0.03069724701344967, "learning_rate": 8.982300884955752e-07, "loss": -0.3958, "num_tokens": 3194870.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.944921612739563, "sampling/importance_sampling_ratio/mean": 1.0006206035614014, "sampling/importance_sampling_ratio/min": 0.6622530221939087, "sampling/sampling_logp_difference/max": 0.6652216911315918, "sampling/sampling_logp_difference/mean": 0.010742882266640663, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.328125, "completions/mean_terminated_length": 15.328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.06518861651420593, "epoch": 0.36283185840707965, "frac_reward_zero_std": 1.0, "grad_norm": 0.09771435391875398, "kl": 0.02997596189379692, "learning_rate": 9.026548672566371e-07, "loss": 0.0002, "num_tokens": 3208795.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0926145315170288, "sampling/importance_sampling_ratio/mean": 0.999653160572052, "sampling/importance_sampling_ratio/min": 0.6137433052062988, "sampling/sampling_logp_difference/max": 0.4881784915924072, "sampling/sampling_logp_difference/mean": 0.005968471057713032, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 58.0625, "completions/mean_terminated_length": 58.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2882225811481476, "epoch": 0.36460176991150445, "frac_reward_zero_std": 1.0, "grad_norm": 0.055160220710038725, "kl": 0.003100259928032756, "learning_rate": 9.07079646017699e-07, "loss": 0.0, "num_tokens": 3222911.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3295564651489258, "sampling/importance_sampling_ratio/mean": 0.9999579191207886, "sampling/importance_sampling_ratio/min": 0.6964197754859924, "sampling/sampling_logp_difference/max": 0.36180269718170166, "sampling/sampling_logp_difference/mean": 0.010741502977907658, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 68.4375, "completions/mean_terminated_length": 68.4375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3214592933654785, "epoch": 0.3663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 6.336265981679781, "kl": 0.005980383604764938, "learning_rate": 9.11504424778761e-07, "loss": -0.0132, "num_tokens": 3237387.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.3671600818634033, "sampling/importance_sampling_ratio/mean": 1.0011777877807617, "sampling/importance_sampling_ratio/min": 0.6046066284179688, "sampling/sampling_logp_difference/max": 0.5031771659851074, "sampling/sampling_logp_difference/mean": 0.015451844781637192, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 108.15625, "completions/mean_terminated_length": 108.15625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.35494041442871094, "epoch": 0.368141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 2.6888265767189843, "kl": 0.002561464672908187, "learning_rate": 9.159292035398229e-07, "loss": 0.0946, "num_tokens": 3254645.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9973541498184204, "sampling/importance_sampling_ratio/min": 0.6656482815742493, "sampling/sampling_logp_difference/max": 0.7017228603363037, "sampling/sampling_logp_difference/mean": 0.015536784194409847, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 106.453125, "completions/mean_terminated_length": 106.453125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.5013563632965088, "epoch": 0.36991150442477877, "frac_reward_zero_std": 1.0, "grad_norm": 0.05157514700743391, "kl": 0.0033573838882148266, "learning_rate": 9.203539823008849e-07, "loss": 0.0, "num_tokens": 3270530.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6783130168914795, "sampling/importance_sampling_ratio/mean": 0.9998264312744141, "sampling/importance_sampling_ratio/min": 0.5489054918289185, "sampling/sampling_logp_difference/max": 0.5998289585113525, "sampling/sampling_logp_difference/mean": 0.018359629437327385, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 44.453125, "completions/mean_terminated_length": 44.453125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1675224006175995, "epoch": 0.37168141592920356, "frac_reward_zero_std": 1.0, "grad_norm": 0.03374225208507583, "kl": 0.001434197649359703, "learning_rate": 9.247787610619469e-07, "loss": 0.0, "num_tokens": 3285727.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7428385019302368, "sampling/importance_sampling_ratio/mean": 0.9988299608230591, "sampling/importance_sampling_ratio/min": 0.5252789855003357, "sampling/sampling_logp_difference/max": 0.6438257694244385, "sampling/sampling_logp_difference/mean": 0.01490543782711029, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 40.21875, "completions/mean_terminated_length": 40.21875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.18517419695854187, "epoch": 0.3734513274336283, "frac_reward_zero_std": 1.0, "grad_norm": 0.050757562647429574, "kl": 0.002568627241998911, "learning_rate": 9.292035398230088e-07, "loss": 0.0, "num_tokens": 3300285.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.381851315498352, "sampling/importance_sampling_ratio/mean": 0.9986093044281006, "sampling/importance_sampling_ratio/min": 0.609358549118042, "sampling/sampling_logp_difference/max": 0.4953484535217285, "sampling/sampling_logp_difference/mean": 0.012276686728000641, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 71.78125, "completions/mean_terminated_length": 71.78125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2278621345758438, "epoch": 0.3752212389380531, "frac_reward_zero_std": 1.0, "grad_norm": 0.03204199727428532, "kl": 0.003042126540094614, "learning_rate": 9.336283185840708e-07, "loss": 0.0, "num_tokens": 3316879.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.281581997871399, "sampling/importance_sampling_ratio/mean": 0.9995130896568298, "sampling/importance_sampling_ratio/min": 0.6369761228561401, "sampling/sampling_logp_difference/max": 0.4510231018066406, "sampling/sampling_logp_difference/mean": 0.014066744595766068, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 38.34375, "completions/mean_terminated_length": 38.34375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.17596986889839172, "epoch": 0.3769911504424779, "frac_reward_zero_std": 1.0, "grad_norm": 0.09589961445803016, "kl": 0.004617048427462578, "learning_rate": 9.380530973451328e-07, "loss": 0.0, "num_tokens": 3331989.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.341760516166687, "sampling/importance_sampling_ratio/mean": 0.9994263648986816, "sampling/importance_sampling_ratio/min": 0.7102124094963074, "sampling/sampling_logp_difference/max": 0.342191219329834, "sampling/sampling_logp_difference/mean": 0.010547621175646782, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 102.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.46252167224884033, "epoch": 0.3787610619469027, "frac_reward_zero_std": 1.0, "grad_norm": 0.03145555998263438, "kl": 0.003119957633316517, "learning_rate": 9.424778761061947e-07, "loss": 0.0, "num_tokens": 3348373.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6221009492874146, "sampling/importance_sampling_ratio/mean": 0.9998818039894104, "sampling/importance_sampling_ratio/min": 0.20803289115428925, "sampling/sampling_logp_difference/max": 1.570059061050415, "sampling/sampling_logp_difference/mean": 0.01729355938732624, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 62.59375, "completions/mean_terminated_length": 62.59375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2621881365776062, "epoch": 0.3805309734513274, "frac_reward_zero_std": 1.0, "grad_norm": 0.056861081675358534, "kl": 0.005555910989642143, "learning_rate": 9.469026548672566e-07, "loss": 0.0, "num_tokens": 3362795.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7976553440093994, "sampling/importance_sampling_ratio/mean": 1.002416729927063, "sampling/importance_sampling_ratio/min": 0.5128350853919983, "sampling/sampling_logp_difference/max": 0.667801022529602, "sampling/sampling_logp_difference/mean": 0.01444577518850565, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 59.53125, "completions/mean_terminated_length": 59.53125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3704109191894531, "epoch": 0.3823008849557522, "frac_reward_zero_std": 1.0, "grad_norm": 0.1242170837294012, "kl": 0.005496586672961712, "learning_rate": 9.513274336283185e-07, "loss": 0.0001, "num_tokens": 3378029.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5919092893600464, "sampling/importance_sampling_ratio/mean": 1.0035631656646729, "sampling/importance_sampling_ratio/min": 0.7009298801422119, "sampling/sampling_logp_difference/max": 0.4649341106414795, "sampling/sampling_logp_difference/mean": 0.020102720707654953, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 98.078125, "completions/mean_terminated_length": 98.078125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3987380266189575, "epoch": 0.384070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.045585197120377674, "kl": 0.003970324993133545, "learning_rate": 9.557522123893805e-07, "loss": 0.0, "num_tokens": 3394690.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998895525932312, "sampling/importance_sampling_ratio/min": 0.5717587471008301, "sampling/sampling_logp_difference/max": 0.921055793762207, "sampling/sampling_logp_difference/mean": 0.01475785207003355, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 34.296875, "completions/mean_terminated_length": 34.296875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.19475924968719482, "epoch": 0.3858407079646018, "frac_reward_zero_std": 1.0, "grad_norm": 0.11738437086607027, "kl": 0.010515974834561348, "learning_rate": 9.601769911504426e-07, "loss": 0.0001, "num_tokens": 3407157.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5994064807891846, "sampling/importance_sampling_ratio/mean": 0.9995347261428833, "sampling/importance_sampling_ratio/min": 0.6554685831069946, "sampling/sampling_logp_difference/max": 0.469632625579834, "sampling/sampling_logp_difference/mean": 0.011333119124174118, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 54.765625, "completions/mean_terminated_length": 54.765625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.33620890974998474, "epoch": 0.38761061946902653, "frac_reward_zero_std": 1.0, "grad_norm": 0.10464694120487733, "kl": 0.004381977953016758, "learning_rate": 9.646017699115042e-07, "loss": 0.0001, "num_tokens": 3421606.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5475236177444458, "sampling/importance_sampling_ratio/mean": 1.0017409324645996, "sampling/importance_sampling_ratio/min": 0.6921407580375671, "sampling/sampling_logp_difference/max": 0.43665599822998047, "sampling/sampling_logp_difference/mean": 0.022906150668859482, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 52.09375, "completions/mean_terminated_length": 52.09375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.17895236611366272, "epoch": 0.3893805309734513, "frac_reward_zero_std": 0.75, "grad_norm": 8.487661476567311, "kl": 0.002079170662909746, "learning_rate": 9.690265486725663e-07, "loss": 0.184, "num_tokens": 3435900.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4371217489242554, "sampling/importance_sampling_ratio/mean": 0.9985246658325195, "sampling/importance_sampling_ratio/min": 0.5759603381156921, "sampling/sampling_logp_difference/max": 0.5517165660858154, "sampling/sampling_logp_difference/mean": 0.017181089147925377, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 41.65625, "completions/mean_terminated_length": 41.65625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.19378438591957092, "epoch": 0.3911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.057238480728486814, "kl": 0.0032039829529821873, "learning_rate": 9.734513274336282e-07, "loss": 0.0, "num_tokens": 3450294.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4914631843566895, "sampling/importance_sampling_ratio/mean": 0.9990211129188538, "sampling/importance_sampling_ratio/min": 0.6217343211174011, "sampling/sampling_logp_difference/max": 0.47524237632751465, "sampling/sampling_logp_difference/mean": 0.01241573877632618, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 62.703125, "completions/mean_terminated_length": 62.703125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3654828667640686, "epoch": 0.3929203539823009, "frac_reward_zero_std": 1.0, "grad_norm": 0.035383096163377675, "kl": 0.0059638191014528275, "learning_rate": 9.778761061946902e-07, "loss": 0.0, "num_tokens": 3465411.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5720010995864868, "sampling/importance_sampling_ratio/mean": 0.9991636276245117, "sampling/importance_sampling_ratio/min": 0.6438614726066589, "sampling/sampling_logp_difference/max": 0.4523494243621826, "sampling/sampling_logp_difference/mean": 0.01645643450319767, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 32.265625, "completions/mean_terminated_length": 32.265625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.12503893673419952, "epoch": 0.39469026548672564, "frac_reward_zero_std": 1.0, "grad_norm": 0.029294659331895984, "kl": 0.0009783116402104497, "learning_rate": 9.82300884955752e-07, "loss": 0.0, "num_tokens": 3479988.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2597873210906982, "sampling/importance_sampling_ratio/mean": 1.0013203620910645, "sampling/importance_sampling_ratio/min": 0.7212660312652588, "sampling/sampling_logp_difference/max": 0.32674723863601685, "sampling/sampling_logp_difference/mean": 0.009833626449108124, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 57.65625, "completions/mean_terminated_length": 57.65625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.242695152759552, "epoch": 0.39646017699115044, "frac_reward_zero_std": 1.0, "grad_norm": 0.02587311644919685, "kl": 0.0017175667453557253, "learning_rate": 9.867256637168142e-07, "loss": 0.0, "num_tokens": 3495246.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.328568935394287, "sampling/importance_sampling_ratio/mean": 1.0001909732818604, "sampling/importance_sampling_ratio/min": 0.7182667851448059, "sampling/sampling_logp_difference/max": 0.3309142589569092, "sampling/sampling_logp_difference/mean": 0.008995315060019493, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 96.734375, "completions/mean_terminated_length": 96.734375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.39507365226745605, "epoch": 0.39823008849557523, "frac_reward_zero_std": 0.75, "grad_norm": 3.7512447226554966, "kl": 0.004321282729506493, "learning_rate": 9.91150442477876e-07, "loss": -0.0746, "num_tokens": 3511581.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.593572974205017, "sampling/importance_sampling_ratio/mean": 1.0000005960464478, "sampling/importance_sampling_ratio/min": 0.6780757904052734, "sampling/sampling_logp_difference/max": 0.46597862243652344, "sampling/sampling_logp_difference/mean": 0.01681409776210785, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 35.171875, "completions/mean_terminated_length": 35.171875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.22788852453231812, "epoch": 0.4, "frac_reward_zero_std": 1.0, "grad_norm": 0.05385673391397239, "kl": 0.004239211790263653, "learning_rate": 9.95575221238938e-07, "loss": 0.0, "num_tokens": 3523800.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0043925046920776, "sampling/importance_sampling_ratio/min": 0.7580122351646423, "sampling/sampling_logp_difference/max": 0.705294132232666, "sampling/sampling_logp_difference/mean": 0.016978923231363297, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 43.015625, "completions/mean_terminated_length": 43.015625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1527884155511856, "epoch": 0.40176991150442476, "frac_reward_zero_std": 0.75, "grad_norm": 21.641425147437015, "kl": 0.004469120409339666, "learning_rate": 1e-06, "loss": -0.1624, "num_tokens": 3537321.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0010275840759277, "sampling/importance_sampling_ratio/min": 0.4817606806755066, "sampling/sampling_logp_difference/max": 0.7303078174591064, "sampling/sampling_logp_difference/mean": 0.017776301130652428, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 30.71875, "completions/mean_terminated_length": 30.71875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.19440095126628876, "epoch": 0.40353982300884955, "frac_reward_zero_std": 0.75, "grad_norm": 36.71682744138212, "kl": 0.010755006223917007, "learning_rate": 9.999994035998135e-07, "loss": 0.3461, "num_tokens": 3550775.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5073049068450928, "sampling/importance_sampling_ratio/mean": 0.9993104934692383, "sampling/importance_sampling_ratio/min": 0.6966415047645569, "sampling/sampling_logp_difference/max": 0.41032326221466064, "sampling/sampling_logp_difference/mean": 0.01622116006910801, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 107.890625, "completions/mean_terminated_length": 107.890625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.4642508029937744, "epoch": 0.40530973451327434, "frac_reward_zero_std": 1.0, "grad_norm": 0.054119781405429675, "kl": 0.0034284957218915224, "learning_rate": 9.99997614400677e-07, "loss": 0.0, "num_tokens": 3567168.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001689195632935, "sampling/importance_sampling_ratio/min": 0.6158477663993835, "sampling/sampling_logp_difference/max": 0.7310385704040527, "sampling/sampling_logp_difference/mean": 0.01859310269355774, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 52.40625, "completions/mean_terminated_length": 52.40625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2456488460302353, "epoch": 0.40707964601769914, "frac_reward_zero_std": 1.0, "grad_norm": 0.062557665823258, "kl": 0.002922304905951023, "learning_rate": 9.999946324068587e-07, "loss": 0.0, "num_tokens": 3580618.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.969736933708191, "sampling/importance_sampling_ratio/mean": 0.9975132942199707, "sampling/importance_sampling_ratio/min": 0.6378993988037109, "sampling/sampling_logp_difference/max": 0.677899956703186, "sampling/sampling_logp_difference/mean": 0.019860180094838142, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 39.734375, "completions/mean_terminated_length": 39.734375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.18301242589950562, "epoch": 0.4088495575221239, "frac_reward_zero_std": 1.0, "grad_norm": 0.04358872525227839, "kl": 0.0017693135887384415, "learning_rate": 9.999904576254724e-07, "loss": 0.0, "num_tokens": 3595081.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3643784523010254, "sampling/importance_sampling_ratio/mean": 1.0005431175231934, "sampling/importance_sampling_ratio/min": 0.6537138223648071, "sampling/sampling_logp_difference/max": 0.4250856637954712, "sampling/sampling_logp_difference/mean": 0.013198630884289742, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 73.890625, "completions/mean_terminated_length": 73.890625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2257222831249237, "epoch": 0.41061946902654867, "frac_reward_zero_std": 1.0, "grad_norm": 0.08980511226821004, "kl": 0.001283166348002851, "learning_rate": 9.999850900664773e-07, "loss": 0.0, "num_tokens": 3611618.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2656625509262085, "sampling/importance_sampling_ratio/mean": 0.9989427924156189, "sampling/importance_sampling_ratio/min": 0.512908935546875, "sampling/sampling_logp_difference/max": 0.6676568984985352, "sampling/sampling_logp_difference/mean": 0.012095805257558823, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 32.453125, "completions/mean_terminated_length": 32.453125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.294692724943161, "epoch": 0.41238938053097346, "frac_reward_zero_std": 0.75, "grad_norm": 21.452015476936687, "kl": 0.023687120527029037, "learning_rate": 9.999785297426788e-07, "loss": -0.1992, "num_tokens": 3623407.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6505494117736816, "sampling/importance_sampling_ratio/mean": 0.9979889988899231, "sampling/importance_sampling_ratio/min": 0.6730009913444519, "sampling/sampling_logp_difference/max": 0.5011081695556641, "sampling/sampling_logp_difference/mean": 0.01914994977414608, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 37.53125, "completions/mean_terminated_length": 37.53125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.23396769165992737, "epoch": 0.41415929203539825, "frac_reward_zero_std": 1.0, "grad_norm": 0.04800585746953848, "kl": 0.002836022526025772, "learning_rate": 9.999707766697265e-07, "loss": 0.0, "num_tokens": 3637537.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4313642978668213, "sampling/importance_sampling_ratio/mean": 1.0034034252166748, "sampling/importance_sampling_ratio/min": 0.6590465307235718, "sampling/sampling_logp_difference/max": 0.4169611930847168, "sampling/sampling_logp_difference/mean": 0.019834768027067184, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 42.234375, "completions/mean_terminated_length": 42.234375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.15960168838500977, "epoch": 0.415929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.026987595816708707, "kl": 0.0025480028707534075, "learning_rate": 9.999618308661168e-07, "loss": 0.0, "num_tokens": 3650912.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5073378086090088, "sampling/importance_sampling_ratio/mean": 0.9996541738510132, "sampling/importance_sampling_ratio/min": 0.6348318457603455, "sampling/sampling_logp_difference/max": 0.454395055770874, "sampling/sampling_logp_difference/mean": 0.01629040017724037, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 90.703125, "completions/mean_terminated_length": 90.703125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.38684922456741333, "epoch": 0.4176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 3.010045925952474, "kl": 0.0022131020668894053, "learning_rate": 9.999516923531906e-07, "loss": 0.0773, "num_tokens": 3668093.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.644107699394226, "sampling/importance_sampling_ratio/mean": 1.0001626014709473, "sampling/importance_sampling_ratio/min": 0.6234187483787537, "sampling/sampling_logp_difference/max": 0.4971977472305298, "sampling/sampling_logp_difference/mean": 0.017248239368200302, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 86.953125, "completions/mean_terminated_length": 86.953125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.476330429315567, "epoch": 0.4194690265486726, "frac_reward_zero_std": 0.75, "grad_norm": 7.72996716584878, "kl": 0.0019192376639693975, "learning_rate": 9.99940361155134e-07, "loss": 0.0898, "num_tokens": 3682634.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4441698789596558, "sampling/importance_sampling_ratio/mean": 1.0006109476089478, "sampling/importance_sampling_ratio/min": 0.6419512033462524, "sampling/sampling_logp_difference/max": 0.44324302673339844, "sampling/sampling_logp_difference/mean": 0.016820615157485008, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 79.15625, "completions/mean_terminated_length": 79.15625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.27041104435920715, "epoch": 0.42123893805309737, "frac_reward_zero_std": 1.0, "grad_norm": 0.019962387727511497, "kl": 0.0013378332369029522, "learning_rate": 9.99927837298979e-07, "loss": 0.0, "num_tokens": 3698772.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7013784646987915, "sampling/importance_sampling_ratio/mean": 0.9985455274581909, "sampling/importance_sampling_ratio/min": 0.6555176377296448, "sampling/sampling_logp_difference/max": 0.5314388275146484, "sampling/sampling_logp_difference/mean": 0.012149603106081486, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 54.84375, "completions/mean_terminated_length": 54.84375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.20714148879051208, "epoch": 0.4230088495575221, "frac_reward_zero_std": 1.0, "grad_norm": 0.04988395372967355, "kl": 0.003713001497089863, "learning_rate": 9.999141208146027e-07, "loss": 0.0, "num_tokens": 3712506.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3076030015945435, "sampling/importance_sampling_ratio/mean": 0.9982582330703735, "sampling/importance_sampling_ratio/min": 0.5380278825759888, "sampling/sampling_logp_difference/max": 0.619844913482666, "sampling/sampling_logp_difference/mean": 0.014127662405371666, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 37.328125, "completions/mean_terminated_length": 37.328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1810479760169983, "epoch": 0.4247787610619469, "frac_reward_zero_std": 1.0, "grad_norm": 0.14025310662941104, "kl": 0.006155351176857948, "learning_rate": 9.99899211734727e-07, "loss": 0.0, "num_tokens": 3727135.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4445059299468994, "sampling/importance_sampling_ratio/mean": 1.0028483867645264, "sampling/importance_sampling_ratio/min": 0.5484683513641357, "sampling/sampling_logp_difference/max": 0.60062575340271, "sampling/sampling_logp_difference/mean": 0.020090894773602486, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.234375, "completions/mean_terminated_length": 13.234375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.06778696179389954, "epoch": 0.4265486725663717, "frac_reward_zero_std": 1.0, "grad_norm": 0.32877278340164795, "kl": 0.003949641715735197, "learning_rate": 9.998831100949186e-07, "loss": 0.0, "num_tokens": 3738222.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4646263122558594, "sampling/importance_sampling_ratio/mean": 0.9988141059875488, "sampling/importance_sampling_ratio/min": 0.781985878944397, "sampling/sampling_logp_difference/max": 0.38160014152526855, "sampling/sampling_logp_difference/mean": 0.009022011421620846, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 36.21875, "completions/mean_terminated_length": 36.21875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.14866621792316437, "epoch": 0.4283185840707965, "frac_reward_zero_std": 1.0, "grad_norm": 0.030079433302675318, "kl": 0.0009830165654420853, "learning_rate": 9.998658159335901e-07, "loss": 0.0, "num_tokens": 3754716.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.250900387763977, "sampling/importance_sampling_ratio/mean": 0.9993738532066345, "sampling/importance_sampling_ratio/min": 0.7676602005958557, "sampling/sampling_logp_difference/max": 0.2644081115722656, "sampling/sampling_logp_difference/mean": 0.009230071678757668, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 85.328125, "completions/mean_terminated_length": 85.328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.38343095779418945, "epoch": 0.4300884955752212, "frac_reward_zero_std": 0.75, "grad_norm": 3.0171205491946504, "kl": 0.0024617710150778294, "learning_rate": 9.998473292919985e-07, "loss": -0.0413, "num_tokens": 3771553.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6167631149291992, "sampling/importance_sampling_ratio/mean": 1.0004315376281738, "sampling/importance_sampling_ratio/min": 0.3154713213443756, "sampling/sampling_logp_difference/max": 1.1536874771118164, "sampling/sampling_logp_difference/mean": 0.01735665649175644, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 37.078125, "completions/mean_terminated_length": 37.078125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.18088294565677643, "epoch": 0.431858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.12771523381725922, "kl": 0.006010980345308781, "learning_rate": 9.998276502142454e-07, "loss": 0.0, "num_tokens": 3788790.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2638511657714844, "sampling/importance_sampling_ratio/mean": 0.999843180179596, "sampling/importance_sampling_ratio/min": 0.6756933331489563, "sampling/sampling_logp_difference/max": 0.3920159339904785, "sampling/sampling_logp_difference/mean": 0.0102929025888443, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 74.46875, "completions/mean_terminated_length": 74.46875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3529541492462158, "epoch": 0.4336283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.03759443190383923, "kl": 0.003588553052395582, "learning_rate": 9.99806778747277e-07, "loss": 0.0, "num_tokens": 3802644.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.421198844909668, "sampling/importance_sampling_ratio/mean": 0.9967384934425354, "sampling/importance_sampling_ratio/min": 0.5428851842880249, "sampling/sampling_logp_difference/max": 0.6108574867248535, "sampling/sampling_logp_difference/mean": 0.024649962782859802, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 67.40625, "completions/mean_terminated_length": 67.40625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3470374047756195, "epoch": 0.4353982300884956, "frac_reward_zero_std": 1.0, "grad_norm": 0.051173374001041765, "kl": 0.004685951862484217, "learning_rate": 9.997847149408844e-07, "loss": 0.0, "num_tokens": 3817582.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.598049521446228, "sampling/importance_sampling_ratio/mean": 0.999319314956665, "sampling/importance_sampling_ratio/min": 0.5836055874824524, "sampling/sampling_logp_difference/max": 0.5385298728942871, "sampling/sampling_logp_difference/mean": 0.015922270715236664, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 62.484375, "completions/mean_terminated_length": 62.484375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.2849852740764618, "epoch": 0.43716814159292033, "frac_reward_zero_std": 1.0, "grad_norm": 0.050583000758105276, "kl": 0.007301662117242813, "learning_rate": 9.997614588477033e-07, "loss": 0.0, "num_tokens": 3831645.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3781495094299316, "sampling/importance_sampling_ratio/mean": 1.0004756450653076, "sampling/importance_sampling_ratio/min": 0.46459150314331055, "sampling/sampling_logp_difference/max": 0.766596794128418, "sampling/sampling_logp_difference/mean": 0.014835800975561142, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 72.390625, "completions/mean_terminated_length": 72.390625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.25849053263664246, "epoch": 0.4389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 4.7871676009843105, "kl": 0.0029903594404459, "learning_rate": 9.997370105232132e-07, "loss": -0.1178, "num_tokens": 3847062.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6574006080627441, "sampling/importance_sampling_ratio/mean": 1.0001531839370728, "sampling/importance_sampling_ratio/min": 0.5268588662147522, "sampling/sampling_logp_difference/max": 0.6408225297927856, "sampling/sampling_logp_difference/mean": 0.012619627639651299, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 43.890625, "completions/mean_terminated_length": 43.890625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.16508735716342926, "epoch": 0.4407079646017699, "frac_reward_zero_std": 1.0, "grad_norm": 0.06435464819757303, "kl": 0.003507372457534075, "learning_rate": 9.99711370025738e-07, "loss": 0.0, "num_tokens": 3861119.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8435330390930176, "sampling/importance_sampling_ratio/mean": 1.0004816055297852, "sampling/importance_sampling_ratio/min": 0.5295119285583496, "sampling/sampling_logp_difference/max": 0.6357995271682739, "sampling/sampling_logp_difference/mean": 0.0132959159091115, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 36.046875, "completions/mean_terminated_length": 36.046875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.15764935314655304, "epoch": 0.4424778761061947, "frac_reward_zero_std": 1.0, "grad_norm": 0.05396769395261247, "kl": 0.00200331280939281, "learning_rate": 9.99684537416446e-07, "loss": 0.0, "num_tokens": 3879634.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4567588567733765, "sampling/importance_sampling_ratio/mean": 1.0007212162017822, "sampling/importance_sampling_ratio/min": 0.5999128818511963, "sampling/sampling_logp_difference/max": 0.5109708309173584, "sampling/sampling_logp_difference/mean": 0.008045634254813194, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 95.9375, "completions/mean_terminated_length": 95.9375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3375559151172638, "epoch": 0.44424778761061945, "frac_reward_zero_std": 1.0, "grad_norm": 0.022391294297959205, "kl": 0.0018936173291876912, "learning_rate": 9.996565127593489e-07, "loss": 0.0, "num_tokens": 3899038.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6154776811599731, "sampling/importance_sampling_ratio/mean": 0.9995601773262024, "sampling/importance_sampling_ratio/min": 0.6189666986465454, "sampling/sampling_logp_difference/max": 0.4797039031982422, "sampling/sampling_logp_difference/mean": 0.014130129478871822, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 82.203125, "completions/mean_terminated_length": 82.203125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.27879437804222107, "epoch": 0.44601769911504424, "frac_reward_zero_std": 1.0, "grad_norm": 0.05656063312383896, "kl": 0.0035494635812938213, "learning_rate": 9.996272961213022e-07, "loss": 0.0, "num_tokens": 3915467.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5963265895843506, "sampling/importance_sampling_ratio/mean": 0.9994061589241028, "sampling/importance_sampling_ratio/min": 0.4906884729862213, "sampling/sampling_logp_difference/max": 0.7119457721710205, "sampling/sampling_logp_difference/mean": 0.01221383735537529, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 77.484375, "completions/mean_terminated_length": 77.484375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.27256277203559875, "epoch": 0.44778761061946903, "frac_reward_zero_std": 0.75, "grad_norm": 4.370429715106762, "kl": 0.004128012806177139, "learning_rate": 9.995968875720051e-07, "loss": 0.0854, "num_tokens": 3931418.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5513712167739868, "sampling/importance_sampling_ratio/mean": 0.9996453523635864, "sampling/importance_sampling_ratio/min": 0.6445860862731934, "sampling/sampling_logp_difference/max": 0.4391469955444336, "sampling/sampling_logp_difference/mean": 0.013168378733098507, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 104.703125, "completions/mean_terminated_length": 104.703125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.39987921714782715, "epoch": 0.4495575221238938, "frac_reward_zero_std": 1.0, "grad_norm": 0.02226855702470426, "kl": 0.002329135313630104, "learning_rate": 9.995652871840006e-07, "loss": 0.0, "num_tokens": 3948695.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6002646684646606, "sampling/importance_sampling_ratio/mean": 1.0006269216537476, "sampling/importance_sampling_ratio/min": 0.6878425478935242, "sampling/sampling_logp_difference/max": 0.4701690673828125, "sampling/sampling_logp_difference/mean": 0.014145614579319954, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 42.234375, "completions/mean_terminated_length": 42.234375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2794543504714966, "epoch": 0.45132743362831856, "frac_reward_zero_std": 0.75, "grad_norm": 23.872798817189192, "kl": 0.0077172527089715, "learning_rate": 9.995324950326745e-07, "loss": 0.2481, "num_tokens": 3961158.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.440090298652649, "sampling/importance_sampling_ratio/mean": 0.9973623752593994, "sampling/importance_sampling_ratio/min": 0.5127883553504944, "sampling/sampling_logp_difference/max": 0.6678920984268188, "sampling/sampling_logp_difference/mean": 0.016313213855028152, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 76.859375, "completions/mean_terminated_length": 76.859375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.366608589887619, "epoch": 0.45309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 6.752881151929045, "kl": 0.0026165214367210865, "learning_rate": 9.994985111962555e-07, "loss": -0.0774, "num_tokens": 3976269.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.617125153541565, "sampling/importance_sampling_ratio/mean": 0.9991705417633057, "sampling/importance_sampling_ratio/min": 0.5433637499809265, "sampling/sampling_logp_difference/max": 0.6099762916564941, "sampling/sampling_logp_difference/mean": 0.018060646951198578, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 39.828125, "completions/mean_terminated_length": 39.828125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.13335466384887695, "epoch": 0.45486725663716815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10586924061032081, "kl": 0.03460051491856575, "learning_rate": 9.994633357558158e-07, "loss": 0.0001, "num_tokens": 3988818.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2890098094940186, "sampling/importance_sampling_ratio/mean": 1.0003187656402588, "sampling/importance_sampling_ratio/min": 0.530799150466919, "sampling/sampling_logp_difference/max": 0.6333715915679932, "sampling/sampling_logp_difference/mean": 0.012708578258752823, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 82.9375, "completions/mean_terminated_length": 82.9375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4873947501182556, "epoch": 0.45663716814159294, "frac_reward_zero_std": 1.0, "grad_norm": 0.046335198779911754, "kl": 0.004279019311070442, "learning_rate": 9.994269687952698e-07, "loss": 0.0, "num_tokens": 4005070.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5861549377441406, "sampling/importance_sampling_ratio/mean": 0.9983864426612854, "sampling/importance_sampling_ratio/min": 0.583560585975647, "sampling/sampling_logp_difference/max": 0.5386070013046265, "sampling/sampling_logp_difference/mean": 0.019403012469410896, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.28078144788742065, "epoch": 0.4584070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.051731198908428394, "kl": 0.00406224001199007, "learning_rate": 9.993894104013746e-07, "loss": 0.0, "num_tokens": 4031454.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4271976947784424, "sampling/importance_sampling_ratio/mean": 0.9989052414894104, "sampling/importance_sampling_ratio/min": 0.6807091236114502, "sampling/sampling_logp_difference/max": 0.38462018966674805, "sampling/sampling_logp_difference/mean": 0.012434137985110283, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 38.859375, "completions/mean_terminated_length": 38.859375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.17825648188591003, "epoch": 0.46017699115044247, "frac_reward_zero_std": 1.0, "grad_norm": 0.09457567520972249, "kl": 0.004964154213666916, "learning_rate": 9.993506606637296e-07, "loss": 0.0001, "num_tokens": 4045093.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.430456519126892, "sampling/importance_sampling_ratio/mean": 0.9988031387329102, "sampling/importance_sampling_ratio/min": 0.662390947341919, "sampling/sampling_logp_difference/max": 0.4118993282318115, "sampling/sampling_logp_difference/mean": 0.012801151722669601, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 63.53125, "completions/mean_terminated_length": 63.53125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.24471867084503174, "epoch": 0.46194690265486726, "frac_reward_zero_std": 1.0, "grad_norm": 0.07442819078368382, "kl": 0.014636065810918808, "learning_rate": 9.993107196747758e-07, "loss": 0.0001, "num_tokens": 4059527.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4217251539230347, "sampling/importance_sampling_ratio/mean": 0.9983421564102173, "sampling/importance_sampling_ratio/min": 0.6172206997871399, "sampling/sampling_logp_difference/max": 0.4825286865234375, "sampling/sampling_logp_difference/mean": 0.014853391796350479, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 76.0625, "completions/mean_terminated_length": 76.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2663308382034302, "epoch": 0.46371681415929206, "frac_reward_zero_std": 1.0, "grad_norm": 0.03699252494847363, "kl": 0.002792743733152747, "learning_rate": 9.99269587529797e-07, "loss": 0.0, "num_tokens": 4076491.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4453238248825073, "sampling/importance_sampling_ratio/mean": 0.9987292289733887, "sampling/importance_sampling_ratio/min": 0.31267639994621277, "sampling/sampling_logp_difference/max": 1.1625864505767822, "sampling/sampling_logp_difference/mean": 0.010977178812026978, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 25.109375, "completions/mean_terminated_length": 25.109375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.22926849126815796, "epoch": 0.4654867256637168, "frac_reward_zero_std": 0.75, "grad_norm": 31.72844486467987, "kl": 0.02525678463280201, "learning_rate": 9.99227264326918e-07, "loss": 0.4846, "num_tokens": 4090498.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.595948576927185, "sampling/importance_sampling_ratio/mean": 0.9998789429664612, "sampling/importance_sampling_ratio/min": 0.6622171401977539, "sampling/sampling_logp_difference/max": 0.46746826171875, "sampling/sampling_logp_difference/mean": 0.016610635444521904, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.875, "completions/mean_terminated_length": 15.875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.0870765745639801, "epoch": 0.4672566371681416, "frac_reward_zero_std": 1.0, "grad_norm": 0.24809152797829495, "kl": 0.005051221698522568, "learning_rate": 9.991837501671048e-07, "loss": 0.0, "num_tokens": 4103130.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4061976671218872, "sampling/importance_sampling_ratio/mean": 1.0006778240203857, "sampling/importance_sampling_ratio/min": 0.730045735836029, "sampling/sampling_logp_difference/max": 0.3408893346786499, "sampling/sampling_logp_difference/mean": 0.02010222151875496, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 81.34375, "completions/mean_terminated_length": 81.34375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3357410430908203, "epoch": 0.4690265486725664, "frac_reward_zero_std": 1.0, "grad_norm": 0.02930875122187649, "kl": 0.0031524254009127617, "learning_rate": 9.991390451541648e-07, "loss": 0.0, "num_tokens": 4118880.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4591114521026611, "sampling/importance_sampling_ratio/mean": 0.9978489279747009, "sampling/importance_sampling_ratio/min": 0.5481137037277222, "sampling/sampling_logp_difference/max": 0.6012725830078125, "sampling/sampling_logp_difference/mean": 0.01842043176293373, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 76.6875, "completions/mean_terminated_length": 76.6875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.39841485023498535, "epoch": 0.47079646017699117, "frac_reward_zero_std": 1.0, "grad_norm": 0.09000805900037295, "kl": 0.09546725451946259, "learning_rate": 9.990931493947465e-07, "loss": 0.0002, "num_tokens": 4133548.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.588159203529358, "sampling/importance_sampling_ratio/mean": 0.9959626197814941, "sampling/importance_sampling_ratio/min": 0.4857686758041382, "sampling/sampling_logp_difference/max": 0.7220227718353271, "sampling/sampling_logp_difference/mean": 0.0178472138941288, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 47.296875, "completions/mean_terminated_length": 47.296875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.15155848860740662, "epoch": 0.4725663716814159, "frac_reward_zero_std": 1.0, "grad_norm": 0.025041931999307744, "kl": 0.006949643604457378, "learning_rate": 9.990460629983388e-07, "loss": 0.0, "num_tokens": 4147439.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.2860136032104492, "sampling/importance_sampling_ratio/mean": 0.9995124936103821, "sampling/importance_sampling_ratio/min": 0.6633288264274597, "sampling/sampling_logp_difference/max": 0.4104844331741333, "sampling/sampling_logp_difference/mean": 0.006444099824875593, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.07502971589565277, "epoch": 0.4743362831858407, "frac_reward_zero_std": 1.0, "grad_norm": 0.06736313727533422, "kl": 0.0005349120474420488, "learning_rate": 9.98997786077271e-07, "loss": 0.0, "num_tokens": 4161759.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3041719198226929, "sampling/importance_sampling_ratio/mean": 1.00235915184021, "sampling/importance_sampling_ratio/min": 0.8297764658927917, "sampling/sampling_logp_difference/max": 0.26556825637817383, "sampling/sampling_logp_difference/mean": 0.0092760119587183, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 61.265625, "completions/mean_terminated_length": 61.265625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.331500768661499, "epoch": 0.4761061946902655, "frac_reward_zero_std": 0.25, "grad_norm": 25.38060991543653, "kl": 0.06742088496685028, "learning_rate": 9.989483187467125e-07, "loss": -0.0358, "num_tokens": 4178736.0, "reward": 0.5, "reward_std": 0.5501632690429688, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.854530930519104, "sampling/importance_sampling_ratio/mean": 0.9997930526733398, "sampling/importance_sampling_ratio/min": 0.4544433653354645, "sampling/sampling_logp_difference/max": 0.7886819839477539, "sampling/sampling_logp_difference/mean": 0.018123114481568336, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 33.328125, "completions/mean_terminated_length": 33.328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.13617177307605743, "epoch": 0.4778761061946903, "frac_reward_zero_std": 0.75, "grad_norm": 29.547501296153417, "kl": 0.09512509405612946, "learning_rate": 9.988976611246728e-07, "loss": -0.7683, "num_tokens": 4190165.0, "reward": -0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.278420090675354, "sampling/importance_sampling_ratio/mean": 1.0012259483337402, "sampling/importance_sampling_ratio/min": 0.7977098822593689, "sampling/sampling_logp_difference/max": 0.24562501907348633, "sampling/sampling_logp_difference/mean": 0.00955257099121809, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 84.5, "completions/mean_terminated_length": 84.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3737594485282898, "epoch": 0.479646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 4.198167042441208, "kl": 0.002854880876839161, "learning_rate": 9.988458133320008e-07, "loss": -0.0762, "num_tokens": 4205285.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.346764087677002, "sampling/importance_sampling_ratio/mean": 0.9988805651664734, "sampling/importance_sampling_ratio/min": 0.5224089622497559, "sampling/sampling_logp_difference/max": 0.6493045091629028, "sampling/sampling_logp_difference/mean": 0.01678304933011532, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 82.8125, "completions/mean_terminated_length": 82.8125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.43366530537605286, "epoch": 0.4814159292035398, "frac_reward_zero_std": 1.0, "grad_norm": 0.031160634170031478, "kl": 0.004372789058834314, "learning_rate": 9.987927754923843e-07, "loss": 0.0, "num_tokens": 4224985.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6253509521484375, "sampling/importance_sampling_ratio/mean": 1.000022530555725, "sampling/importance_sampling_ratio/min": 0.417615681886673, "sampling/sampling_logp_difference/max": 0.8731937408447266, "sampling/sampling_logp_difference/mean": 0.017850469797849655, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 93.890625, "completions/mean_terminated_length": 93.890625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.30273085832595825, "epoch": 0.4831858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.03724891110348457, "kl": 0.003369373269379139, "learning_rate": 9.987385477323506e-07, "loss": 0.0, "num_tokens": 4242722.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4414142370224, "sampling/importance_sampling_ratio/mean": 1.0018047094345093, "sampling/importance_sampling_ratio/min": 0.5525964498519897, "sampling/sampling_logp_difference/max": 0.5931272506713867, "sampling/sampling_logp_difference/mean": 0.020217468962073326, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 103.703125, "completions/mean_terminated_length": 103.703125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3254879415035248, "epoch": 0.4849557522123894, "frac_reward_zero_std": 1.0, "grad_norm": 0.028170574077244014, "kl": 0.004011090844869614, "learning_rate": 9.986831301812655e-07, "loss": 0.0, "num_tokens": 4260655.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3963828086853027, "sampling/importance_sampling_ratio/mean": 0.9998421669006348, "sampling/importance_sampling_ratio/min": 0.6422039270401001, "sampling/sampling_logp_difference/max": 0.44284939765930176, "sampling/sampling_logp_difference/mean": 0.012491639703512192, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 103.25, "completions/mean_terminated_length": 103.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.4250674843788147, "epoch": 0.48672566371681414, "frac_reward_zero_std": 1.0, "grad_norm": 0.024067047052095888, "kl": 0.002488392870873213, "learning_rate": 9.98626522971333e-07, "loss": 0.0, "num_tokens": 4277503.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6527782678604126, "sampling/importance_sampling_ratio/mean": 0.9999735355377197, "sampling/importance_sampling_ratio/min": 0.5634852647781372, "sampling/sampling_logp_difference/max": 0.5736141204833984, "sampling/sampling_logp_difference/mean": 0.016514642164111137, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 57.203125, "completions/mean_terminated_length": 57.203125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.30962735414505005, "epoch": 0.48849557522123893, "frac_reward_zero_std": 1.0, "grad_norm": 0.08756911954230388, "kl": 0.010065382346510887, "learning_rate": 9.985687262375956e-07, "loss": 0.0, "num_tokens": 4300284.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5002374649047852, "sampling/importance_sampling_ratio/mean": 1.0002117156982422, "sampling/importance_sampling_ratio/min": 0.7399517297744751, "sampling/sampling_logp_difference/max": 0.4056234359741211, "sampling/sampling_logp_difference/mean": 0.014174298383295536, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 104.296875, "completions/mean_terminated_length": 104.296875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3918093144893646, "epoch": 0.4902654867256637, "frac_reward_zero_std": 1.0, "grad_norm": 0.026797512712986273, "kl": 0.0023234847467392683, "learning_rate": 9.985097401179333e-07, "loss": 0.0, "num_tokens": 4315727.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5206025838851929, "sampling/importance_sampling_ratio/mean": 1.0002238750457764, "sampling/importance_sampling_ratio/min": 0.6069608926773071, "sampling/sampling_logp_difference/max": 0.49929094314575195, "sampling/sampling_logp_difference/mean": 0.014753839001059532, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2540507912635803, "epoch": 0.4920353982300885, "frac_reward_zero_std": 1.0, "grad_norm": 0.09931648670257845, "kl": 0.012060517445206642, "learning_rate": 9.98449564753063e-07, "loss": 0.0001, "num_tokens": 4330927.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4970253705978394, "sampling/importance_sampling_ratio/mean": 0.999140202999115, "sampling/importance_sampling_ratio/min": 0.6079336404800415, "sampling/sampling_logp_difference/max": 0.49768948554992676, "sampling/sampling_logp_difference/mean": 0.012635363265872002, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 95.515625, "completions/mean_terminated_length": 95.515625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.29506152868270874, "epoch": 0.49380530973451325, "frac_reward_zero_std": 1.0, "grad_norm": 0.02588733266619697, "kl": 0.005474093370139599, "learning_rate": 9.98388200286539e-07, "loss": 0.0, "num_tokens": 4346944.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7718281745910645, "sampling/importance_sampling_ratio/mean": 0.9993443489074707, "sampling/importance_sampling_ratio/min": 0.6424316167831421, "sampling/sampling_logp_difference/max": 0.5720119476318359, "sampling/sampling_logp_difference/mean": 0.015047137625515461, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 120.96875, "completions/mean_terminated_length": 120.96875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.43145015835762024, "epoch": 0.49557522123893805, "frac_reward_zero_std": 1.0, "grad_norm": 0.15511521376513152, "kl": 0.007133021019399166, "learning_rate": 9.98325646864753e-07, "loss": 0.0001, "num_tokens": 4363742.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.998978853225708, "sampling/importance_sampling_ratio/min": 0.41871336102485657, "sampling/sampling_logp_difference/max": 0.8705687522888184, "sampling/sampling_logp_difference/mean": 0.01744624227285385, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 59.71875, "completions/mean_terminated_length": 59.71875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.33847692608833313, "epoch": 0.49734513274336284, "frac_reward_zero_std": 0.75, "grad_norm": 1.5322890577873676, "kl": 0.009394319728016853, "learning_rate": 9.98261904636932e-07, "loss": -0.0016, "num_tokens": 4377820.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.586092233657837, "sampling/importance_sampling_ratio/mean": 1.0006672143936157, "sampling/importance_sampling_ratio/min": 0.6987010836601257, "sampling/sampling_logp_difference/max": 0.461273193359375, "sampling/sampling_logp_difference/mean": 0.015468766912817955, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 30.8125, "completions/mean_terminated_length": 30.8125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.22856585681438446, "epoch": 0.49911504424778763, "frac_reward_zero_std": 1.0, "grad_norm": 0.20919807997035103, "kl": 0.013904878869652748, "learning_rate": 9.9819697375514e-07, "loss": 0.0001, "num_tokens": 4399776.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3514093160629272, "sampling/importance_sampling_ratio/mean": 1.0007905960083008, "sampling/importance_sampling_ratio/min": 0.7770977020263672, "sampling/sampling_logp_difference/max": 0.3011479377746582, "sampling/sampling_logp_difference/mean": 0.011299017816781998, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 41.3125, "completions/mean_terminated_length": 41.3125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.17480722069740295, "epoch": 0.5008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.033366070697314384, "kl": 0.0021748305298388004, "learning_rate": 9.981308543742756e-07, "loss": 0.0, "num_tokens": 4416100.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2915865182876587, "sampling/importance_sampling_ratio/mean": 0.9992952346801758, "sampling/importance_sampling_ratio/min": 0.737482488155365, "sampling/sampling_logp_difference/max": 0.30451297760009766, "sampling/sampling_logp_difference/mean": 0.01027563028037548, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 68.890625, "completions/mean_terminated_length": 68.890625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.24031007289886475, "epoch": 0.5026548672566372, "frac_reward_zero_std": 1.0, "grad_norm": 0.015676680037355565, "kl": 0.001302052871324122, "learning_rate": 9.980635466520736e-07, "loss": 0.0, "num_tokens": 4432957.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3076136112213135, "sampling/importance_sampling_ratio/mean": 1.0002350807189941, "sampling/importance_sampling_ratio/min": 0.7194289565086365, "sampling/sampling_logp_difference/max": 0.3292975425720215, "sampling/sampling_logp_difference/mean": 0.009223762899637222, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 72.375, "completions/mean_terminated_length": 72.375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3059983253479004, "epoch": 0.504424778761062, "frac_reward_zero_std": 1.0, "grad_norm": 0.03358499227826061, "kl": 0.004222136922180653, "learning_rate": 9.979950507491033e-07, "loss": 0.0, "num_tokens": 4453269.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9277665615081787, "sampling/importance_sampling_ratio/mean": 0.999069333076477, "sampling/importance_sampling_ratio/min": 0.7011988759040833, "sampling/sampling_logp_difference/max": 0.6563620567321777, "sampling/sampling_logp_difference/mean": 0.014185376465320587, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 100.25, "completions/mean_terminated_length": 100.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.46278679370880127, "epoch": 0.5061946902654867, "frac_reward_zero_std": 0.75, "grad_norm": 4.349333059405631, "kl": 0.00824448186904192, "learning_rate": 9.979253668287685e-07, "loss": 0.0758, "num_tokens": 4470949.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6193196773529053, "sampling/importance_sampling_ratio/mean": 1.000375747680664, "sampling/importance_sampling_ratio/min": 0.6411598324775696, "sampling/sampling_logp_difference/max": 0.4820060729980469, "sampling/sampling_logp_difference/mean": 0.017631113529205322, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 140.109375, "completions/mean_terminated_length": 140.109375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.3196713924407959, "epoch": 0.5079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 2.4901874515071363, "kl": 0.005851144902408123, "learning_rate": 9.978544950573073e-07, "loss": -0.0837, "num_tokens": 4489404.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992465376853943, "sampling/importance_sampling_ratio/min": 0.6388933062553406, "sampling/sampling_logp_difference/max": 0.8145382404327393, "sampling/sampling_logp_difference/mean": 0.01222514733672142, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 81.03125, "completions/mean_terminated_length": 81.03125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2995682954788208, "epoch": 0.5097345132743363, "frac_reward_zero_std": 1.0, "grad_norm": 0.036957311092894976, "kl": 0.006040679756551981, "learning_rate": 9.977824356037915e-07, "loss": 0.0, "num_tokens": 4504238.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.493669033050537, "sampling/importance_sampling_ratio/mean": 1.0006897449493408, "sampling/importance_sampling_ratio/min": 0.6473813652992249, "sampling/sampling_logp_difference/max": 0.43481969833374023, "sampling/sampling_logp_difference/mean": 0.013412510976195335, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 75.078125, "completions/mean_terminated_length": 75.078125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4171299338340759, "epoch": 0.511504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 15.761554871688615, "kl": 0.018951520323753357, "learning_rate": 9.97709188640126e-07, "loss": 0.1355, "num_tokens": 4521539.0, "reward": 0.625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.690384030342102, "sampling/importance_sampling_ratio/mean": 0.9999475479125977, "sampling/importance_sampling_ratio/min": 0.5168097615242004, "sampling/sampling_logp_difference/max": 0.6600804328918457, "sampling/sampling_logp_difference/mean": 0.01776520349085331, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 56.796875, "completions/mean_terminated_length": 56.796875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.3325667381286621, "epoch": 0.5132743362831859, "frac_reward_zero_std": 0.5, "grad_norm": 12.906319531110897, "kl": 0.01129073090851307, "learning_rate": 9.976347543410486e-07, "loss": -0.0576, "num_tokens": 4542166.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0011260509490967, "sampling/importance_sampling_ratio/min": 0.6028571724891663, "sampling/sampling_logp_difference/max": 0.7271323204040527, "sampling/sampling_logp_difference/mean": 0.014359983615577221, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 91.40625, "completions/mean_terminated_length": 91.40625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.27161622047424316, "epoch": 0.5150442477876106, "frac_reward_zero_std": 0.5, "grad_norm": 6.2989239760333975, "kl": 0.022020166739821434, "learning_rate": 9.975591328841304e-07, "loss": 0.1212, "num_tokens": 4559424.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005266666412354, "sampling/importance_sampling_ratio/min": 0.4944953918457031, "sampling/sampling_logp_difference/max": 0.7871992588043213, "sampling/sampling_logp_difference/mean": 0.013765428215265274, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 32.109375, "completions/mean_terminated_length": 32.109375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.20906458795070648, "epoch": 0.5168141592920354, "frac_reward_zero_std": 0.5, "grad_norm": 30.28759677520763, "kl": 0.030800532549619675, "learning_rate": 9.974823244497737e-07, "loss": 0.1248, "num_tokens": 4573079.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9987730979919434, "sampling/importance_sampling_ratio/min": 0.42857232689857483, "sampling/sampling_logp_difference/max": 0.8472957611083984, "sampling/sampling_logp_difference/mean": 0.016848154366016388, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 62.265625, "completions/mean_terminated_length": 62.265625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.275778591632843, "epoch": 0.5185840707964602, "frac_reward_zero_std": 0.5, "grad_norm": 11.562227028990126, "kl": 0.030032262206077576, "learning_rate": 9.974043292212127e-07, "loss": 0.2261, "num_tokens": 4587704.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.631435751914978, "sampling/importance_sampling_ratio/mean": 0.9993401765823364, "sampling/importance_sampling_ratio/min": 0.1340518444776535, "sampling/sampling_logp_difference/max": 2.009528636932373, "sampling/sampling_logp_difference/mean": 0.016510870307683945, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.359375, "completions/mean_terminated_length": 17.359375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.049905285239219666, "epoch": 0.5203539823008849, "frac_reward_zero_std": 1.0, "grad_norm": 0.29504253071905645, "kl": 0.008438438177108765, "learning_rate": 9.97325147384513e-07, "loss": 0.0001, "num_tokens": 4622127.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2581522464752197, "sampling/importance_sampling_ratio/mean": 0.9986610412597656, "sampling/importance_sampling_ratio/min": 0.8183937072753906, "sampling/sampling_logp_difference/max": 0.22964417934417725, "sampling/sampling_logp_difference/mean": 0.0056549059227108955, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 111.421875, "completions/mean_terminated_length": 111.421875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.5299802422523499, "epoch": 0.5221238938053098, "frac_reward_zero_std": 0.75, "grad_norm": 2.4250467422790787, "kl": 0.0054356371983885765, "learning_rate": 9.97244779128571e-07, "loss": -0.0261, "num_tokens": 4638570.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3723515272140503, "sampling/importance_sampling_ratio/mean": 0.9999358654022217, "sampling/importance_sampling_ratio/min": 0.6040018200874329, "sampling/sampling_logp_difference/max": 0.5041780471801758, "sampling/sampling_logp_difference/mean": 0.01785511150956154, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 60.34375, "completions/mean_terminated_length": 60.34375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.18186187744140625, "epoch": 0.5238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0889240132782423, "kl": 0.009175324812531471, "learning_rate": 9.971632246451127e-07, "loss": 0.0001, "num_tokens": 4652832.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.874014139175415, "sampling/importance_sampling_ratio/mean": 1.0008121728897095, "sampling/importance_sampling_ratio/min": 0.4280461370944977, "sampling/sampling_logp_difference/max": 0.8485243320465088, "sampling/sampling_logp_difference/mean": 0.008807593025267124, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 69.3125, "completions/mean_terminated_length": 69.3125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.29337161779403687, "epoch": 0.5256637168141592, "frac_reward_zero_std": 1.0, "grad_norm": 0.2845039152124611, "kl": 0.01000886783003807, "learning_rate": 9.970804841286953e-07, "loss": 0.0001, "num_tokens": 4668708.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.484485387802124, "sampling/importance_sampling_ratio/mean": 0.9988928437232971, "sampling/importance_sampling_ratio/min": 0.5810959935188293, "sampling/sampling_logp_difference/max": 0.5428392887115479, "sampling/sampling_logp_difference/mean": 0.012892980128526688, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 106.4375, "completions/mean_terminated_length": 106.4375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.44031625986099243, "epoch": 0.5274336283185841, "frac_reward_zero_std": 0.75, "grad_norm": 6.971494631759584, "kl": 0.006387860979884863, "learning_rate": 9.96996557776704e-07, "loss": -0.0907, "num_tokens": 4684560.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.8611406087875366, "sampling/importance_sampling_ratio/mean": 0.9997141361236572, "sampling/importance_sampling_ratio/min": 0.6056256890296936, "sampling/sampling_logp_difference/max": 0.6211895942687988, "sampling/sampling_logp_difference/mean": 0.01681620627641678, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 45.78125, "completions/mean_terminated_length": 45.78125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.20236334204673767, "epoch": 0.5292035398230088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0530022444996644, "kl": 0.008030351251363754, "learning_rate": 9.969114457893539e-07, "loss": 0.0001, "num_tokens": 4700482.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6515015363693237, "sampling/importance_sampling_ratio/mean": 0.9991205334663391, "sampling/importance_sampling_ratio/min": 0.5748081207275391, "sampling/sampling_logp_difference/max": 0.5537190437316895, "sampling/sampling_logp_difference/mean": 0.009639833122491837, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 53.359375, "completions/mean_terminated_length": 53.359375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.14972491562366486, "epoch": 0.5309734513274337, "frac_reward_zero_std": 1.0, "grad_norm": 0.2657145861144557, "kl": 0.02420363575220108, "learning_rate": 9.96825148369688e-07, "loss": 0.0001, "num_tokens": 4718153.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6632829904556274, "sampling/importance_sampling_ratio/mean": 0.9984308481216431, "sampling/importance_sampling_ratio/min": 0.6867097616195679, "sampling/sampling_logp_difference/max": 0.5087933540344238, "sampling/sampling_logp_difference/mean": 0.011596485041081905, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 86.640625, "completions/mean_terminated_length": 86.640625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.4426685571670532, "epoch": 0.5327433628318584, "frac_reward_zero_std": 1.0, "grad_norm": 0.05573498068166531, "kl": 0.0059372428804636, "learning_rate": 9.967376657235778e-07, "loss": 0.0001, "num_tokens": 4736258.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.352237582206726, "sampling/importance_sampling_ratio/mean": 1.0006613731384277, "sampling/importance_sampling_ratio/min": 0.5942684412002563, "sampling/sampling_logp_difference/max": 0.5204241275787354, "sampling/sampling_logp_difference/mean": 0.017739780247211456, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 75.59375, "completions/mean_terminated_length": 75.59375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3071618676185608, "epoch": 0.5345132743362832, "frac_reward_zero_std": 1.0, "grad_norm": 0.4380444782477624, "kl": 0.05999153107404709, "learning_rate": 9.966489980597217e-07, "loss": 0.0002, "num_tokens": 4753320.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008987188339233, "sampling/importance_sampling_ratio/min": 0.41841116547584534, "sampling/sampling_logp_difference/max": 0.8712906837463379, "sampling/sampling_logp_difference/mean": 0.01640094444155693, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 68.9375, "completions/mean_terminated_length": 68.9375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.3281650245189667, "epoch": 0.536283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.07769537274799407, "kl": 0.007740527857095003, "learning_rate": 9.965591455896455e-07, "loss": 0.0001, "num_tokens": 4770196.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6265430450439453, "sampling/importance_sampling_ratio/mean": 0.9986387491226196, "sampling/importance_sampling_ratio/min": 0.7393641471862793, "sampling/sampling_logp_difference/max": 0.48645687103271484, "sampling/sampling_logp_difference/mean": 0.017250174656510353, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 104.25, "completions/mean_terminated_length": 104.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3975277245044708, "epoch": 0.5380530973451327, "frac_reward_zero_std": 1.0, "grad_norm": 0.2827678240213584, "kl": 0.022672219201922417, "learning_rate": 9.964681085277011e-07, "loss": 0.0001, "num_tokens": 4786692.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001333475112915, "sampling/importance_sampling_ratio/min": 0.592957079410553, "sampling/sampling_logp_difference/max": 1.0330731868743896, "sampling/sampling_logp_difference/mean": 0.017484024167060852, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 158.265625, "completions/mean_terminated_length": 158.265625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.5590561628341675, "epoch": 0.5398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 2.08652906915803, "kl": 0.005347556434571743, "learning_rate": 9.96375887091067e-07, "loss": 0.0594, "num_tokens": 4807045.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7717801332473755, "sampling/importance_sampling_ratio/mean": 0.9994161128997803, "sampling/importance_sampling_ratio/min": 0.5527122616767883, "sampling/sampling_logp_difference/max": 0.5929176807403564, "sampling/sampling_logp_difference/mean": 0.01890292763710022, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 142.390625, "completions/mean_terminated_length": 142.390625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.36418986320495605, "epoch": 0.5415929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.03143902386920403, "kl": 0.004250286612659693, "learning_rate": 9.962824814997464e-07, "loss": 0.0, "num_tokens": 4827262.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0011675357818604, "sampling/importance_sampling_ratio/min": 0.6306570172309875, "sampling/sampling_logp_difference/max": 0.7148250341415405, "sampling/sampling_logp_difference/mean": 0.01493034791201353, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 60.234375, "completions/mean_terminated_length": 60.234375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.29062414169311523, "epoch": 0.5433628318584071, "frac_reward_zero_std": 1.0, "grad_norm": 0.07331000426106707, "kl": 0.011067867279052734, "learning_rate": 9.961878919765677e-07, "loss": 0.0001, "num_tokens": 4841629.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4982765913009644, "sampling/importance_sampling_ratio/mean": 0.9982985258102417, "sampling/importance_sampling_ratio/min": 0.5298459529876709, "sampling/sampling_logp_difference/max": 0.6351690292358398, "sampling/sampling_logp_difference/mean": 0.013999614864587784, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 79.125, "completions/mean_terminated_length": 79.125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2869476079940796, "epoch": 0.5451327433628319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0780303316547964, "kl": 0.014650448225438595, "learning_rate": 9.96092118747184e-07, "loss": 0.0001, "num_tokens": 4857221.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994796514511108, "sampling/importance_sampling_ratio/min": 0.49689343571662903, "sampling/sampling_logp_difference/max": 0.7066882848739624, "sampling/sampling_logp_difference/mean": 0.018858671188354492, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 72.328125, "completions/mean_terminated_length": 72.328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.25819018483161926, "epoch": 0.5469026548672566, "frac_reward_zero_std": 1.0, "grad_norm": 0.04189031557716253, "kl": 0.0059643518179655075, "learning_rate": 9.959951620400718e-07, "loss": 0.0001, "num_tokens": 4874138.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4584249258041382, "sampling/importance_sampling_ratio/mean": 0.9993717074394226, "sampling/importance_sampling_ratio/min": 0.6622897386550903, "sampling/sampling_logp_difference/max": 0.4120521545410156, "sampling/sampling_logp_difference/mean": 0.012498611584305763, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 68.3125, "completions/mean_terminated_length": 68.3125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.401637464761734, "epoch": 0.5486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 8.403602923707776, "kl": 0.013075219467282295, "learning_rate": 9.95897022086531e-07, "loss": 0.1173, "num_tokens": 4888862.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3958466053009033, "sampling/importance_sampling_ratio/mean": 0.9982624650001526, "sampling/importance_sampling_ratio/min": 0.6836552619934082, "sampling/sampling_logp_difference/max": 0.38030147552490234, "sampling/sampling_logp_difference/mean": 0.016478929668664932, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 49.859375, "completions/mean_terminated_length": 49.859375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3399659991264343, "epoch": 0.5504424778761062, "frac_reward_zero_std": 1.0, "grad_norm": 0.15924851242976146, "kl": 0.015980035066604614, "learning_rate": 9.957976991206845e-07, "loss": 0.0001, "num_tokens": 4904533.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4301775693893433, "sampling/importance_sampling_ratio/mean": 1.001241683959961, "sampling/importance_sampling_ratio/min": 0.6126782298088074, "sampling/sampling_logp_difference/max": 0.4899153709411621, "sampling/sampling_logp_difference/mean": 0.01730850338935852, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 83.75, "completions/mean_terminated_length": 83.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.41513192653656006, "epoch": 0.552212389380531, "frac_reward_zero_std": 1.0, "grad_norm": 0.04073152091838604, "kl": 0.006671360228210688, "learning_rate": 9.956971933794773e-07, "loss": 0.0001, "num_tokens": 4920229.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8718671798706055, "sampling/importance_sampling_ratio/mean": 1.0000343322753906, "sampling/importance_sampling_ratio/min": 0.5747321844100952, "sampling/sampling_logp_difference/max": 0.6269364356994629, "sampling/sampling_logp_difference/mean": 0.01486043632030487, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 135.65625, "completions/mean_terminated_length": 135.65625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.4970802664756775, "epoch": 0.5539823008849557, "frac_reward_zero_std": 1.0, "grad_norm": 0.021734362861208452, "kl": 0.0033400128595530987, "learning_rate": 9.955955051026758e-07, "loss": 0.0, "num_tokens": 4939343.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5506399869918823, "sampling/importance_sampling_ratio/mean": 1.0004029273986816, "sampling/importance_sampling_ratio/min": 0.710355281829834, "sampling/sampling_logp_difference/max": 0.43866777420043945, "sampling/sampling_logp_difference/mean": 0.01776507869362831, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 52.359375, "completions/mean_terminated_length": 52.359375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2804589867591858, "epoch": 0.5557522123893806, "frac_reward_zero_std": 1.0, "grad_norm": 0.06747453302786525, "kl": 0.007154546212404966, "learning_rate": 9.954926345328678e-07, "loss": 0.0001, "num_tokens": 4954166.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5472042560577393, "sampling/importance_sampling_ratio/mean": 1.0000067949295044, "sampling/importance_sampling_ratio/min": 0.6417877674102783, "sampling/sampling_logp_difference/max": 0.4434976577758789, "sampling/sampling_logp_difference/mean": 0.014697583392262459, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.84375, "completions/mean_terminated_length": 14.84375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.07681912183761597, "epoch": 0.5575221238938053, "frac_reward_zero_std": 1.0, "grad_norm": 0.7951193652292655, "kl": 0.022772781550884247, "learning_rate": 9.953885819154614e-07, "loss": 0.0002, "num_tokens": 4975244.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5425488948822021, "sampling/importance_sampling_ratio/mean": 0.9988242983818054, "sampling/importance_sampling_ratio/min": 0.5694617629051208, "sampling/sampling_logp_difference/max": 0.5630636215209961, "sampling/sampling_logp_difference/mean": 0.018749669194221497, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 106.96875, "completions/mean_terminated_length": 106.96875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4102153778076172, "epoch": 0.5592920353982301, "frac_reward_zero_std": 0.75, "grad_norm": 3.188538697896643, "kl": 0.003499663434922695, "learning_rate": 9.952833474986846e-07, "loss": 0.04, "num_tokens": 4992490.0, "reward": -0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5456247329711914, "sampling/importance_sampling_ratio/mean": 0.9988653063774109, "sampling/importance_sampling_ratio/min": 0.6146138310432434, "sampling/sampling_logp_difference/max": 0.48676109313964844, "sampling/sampling_logp_difference/mean": 0.020490184426307678, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 119.140625, "completions/mean_terminated_length": 119.140625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.6081708669662476, "epoch": 0.5610619469026549, "frac_reward_zero_std": 0.75, "grad_norm": 1.7009228075577634, "kl": 0.0045472802594304085, "learning_rate": 9.951769315335843e-07, "loss": 0.0164, "num_tokens": 5009779.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9710369110107422, "sampling/importance_sampling_ratio/mean": 1.0008913278579712, "sampling/importance_sampling_ratio/min": 0.5404453873634338, "sampling/sampling_logp_difference/max": 0.6785597801208496, "sampling/sampling_logp_difference/mean": 0.02135409787297249, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 79.640625, "completions/mean_terminated_length": 79.640625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4556015729904175, "epoch": 0.5628318584070796, "frac_reward_zero_std": 0.75, "grad_norm": 11.94893909065176, "kl": 0.004789283964782953, "learning_rate": 9.95069334274027e-07, "loss": 0.2709, "num_tokens": 5026012.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5947656631469727, "sampling/importance_sampling_ratio/mean": 1.0000662803649902, "sampling/importance_sampling_ratio/min": 0.6957854628562927, "sampling/sampling_logp_difference/max": 0.46672677993774414, "sampling/sampling_logp_difference/mean": 0.02015022188425064, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 41.609375, "completions/mean_terminated_length": 41.609375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.30406153202056885, "epoch": 0.5646017699115045, "frac_reward_zero_std": 0.75, "grad_norm": 12.714616664961063, "kl": 0.01399939227849245, "learning_rate": 9.949605559766967e-07, "loss": -0.1305, "num_tokens": 5041763.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4089725017547607, "sampling/importance_sampling_ratio/mean": 0.9992440938949585, "sampling/importance_sampling_ratio/min": 0.6184074282646179, "sampling/sampling_logp_difference/max": 0.4806077480316162, "sampling/sampling_logp_difference/mean": 0.012585079297423363, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 41.546875, "completions/mean_terminated_length": 41.546875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.17763113975524902, "epoch": 0.5663716814159292, "frac_reward_zero_std": 1.0, "grad_norm": 0.14469667542750955, "kl": 0.017358124256134033, "learning_rate": 9.94850596901095e-07, "loss": 0.0001, "num_tokens": 5056518.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4221454858779907, "sampling/importance_sampling_ratio/mean": 1.0001862049102783, "sampling/importance_sampling_ratio/min": 0.586487889289856, "sampling/sampling_logp_difference/max": 0.5336031913757324, "sampling/sampling_logp_difference/mean": 0.01157684437930584, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 76.484375, "completions/mean_terminated_length": 76.484375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3155892789363861, "epoch": 0.5681415929203539, "frac_reward_zero_std": 0.75, "grad_norm": 7.021400392096172, "kl": 0.010425843298435211, "learning_rate": 9.947394573095402e-07, "loss": 0.2305, "num_tokens": 5075909.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4295427799224854, "sampling/importance_sampling_ratio/mean": 0.9993923902511597, "sampling/importance_sampling_ratio/min": 0.6148527264595032, "sampling/sampling_logp_difference/max": 0.4863724708557129, "sampling/sampling_logp_difference/mean": 0.01577897183597088, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 86.734375, "completions/mean_terminated_length": 86.734375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3866017162799835, "epoch": 0.5699115044247788, "frac_reward_zero_std": 1.0, "grad_norm": 0.040077809721334394, "kl": 0.007752751465886831, "learning_rate": 9.94627137467167e-07, "loss": 0.0001, "num_tokens": 5090708.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6048550605773926, "sampling/importance_sampling_ratio/mean": 0.9996352195739746, "sampling/importance_sampling_ratio/min": 0.5155983567237854, "sampling/sampling_logp_difference/max": 0.6624271869659424, "sampling/sampling_logp_difference/mean": 0.014700477942824364, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 35.953125, "completions/mean_terminated_length": 35.953125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1371205598115921, "epoch": 0.5716814159292035, "frac_reward_zero_std": 1.0, "grad_norm": 0.2096955708072268, "kl": 0.019187739118933678, "learning_rate": 9.945136376419258e-07, "loss": 0.0001, "num_tokens": 5105873.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3945482969284058, "sampling/importance_sampling_ratio/mean": 0.9994967579841614, "sampling/importance_sampling_ratio/min": 0.47429314255714417, "sampling/sampling_logp_difference/max": 0.7459297180175781, "sampling/sampling_logp_difference/mean": 0.007329988293349743, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 33.78125, "completions/mean_terminated_length": 33.78125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.12136248499155045, "epoch": 0.5734513274336284, "frac_reward_zero_std": 1.0, "grad_norm": 0.20048100584716425, "kl": 0.018594108521938324, "learning_rate": 9.943989581045819e-07, "loss": 0.0001, "num_tokens": 5122099.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.266669511795044, "sampling/importance_sampling_ratio/mean": 0.9998998641967773, "sampling/importance_sampling_ratio/min": 0.7039791941642761, "sampling/sampling_logp_difference/max": 0.35100650787353516, "sampling/sampling_logp_difference/mean": 0.005552534945309162, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 55.84375, "completions/mean_terminated_length": 55.84375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3695143461227417, "epoch": 0.5752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 12.740230354540556, "kl": 0.00976649858057499, "learning_rate": 9.942830991287149e-07, "loss": 0.0784, "num_tokens": 5135977.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5773446559906006, "sampling/importance_sampling_ratio/mean": 0.9974552989006042, "sampling/importance_sampling_ratio/min": 0.6371076703071594, "sampling/sampling_logp_difference/max": 0.45574283599853516, "sampling/sampling_logp_difference/mean": 0.021933196112513542, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 97.875, "completions/mean_terminated_length": 97.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3761260509490967, "epoch": 0.5769911504424778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0306591512793739, "kl": 0.003819515462964773, "learning_rate": 9.94166060990718e-07, "loss": 0.0, "num_tokens": 5151105.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6236847639083862, "sampling/importance_sampling_ratio/mean": 1.000258207321167, "sampling/importance_sampling_ratio/min": 0.5582404732704163, "sampling/sampling_logp_difference/max": 0.5829653739929199, "sampling/sampling_logp_difference/mean": 0.01616375334560871, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 40.4375, "completions/mean_terminated_length": 40.4375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.21063324809074402, "epoch": 0.5787610619469027, "frac_reward_zero_std": 1.0, "grad_norm": 0.18340809261246058, "kl": 0.03675929456949234, "learning_rate": 9.940478439697972e-07, "loss": 0.0002, "num_tokens": 5165245.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6573799848556519, "sampling/importance_sampling_ratio/mean": 0.9981979727745056, "sampling/importance_sampling_ratio/min": 0.47290316224098206, "sampling/sampling_logp_difference/max": 0.7488646507263184, "sampling/sampling_logp_difference/mean": 0.012335125356912613, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 54.9375, "completions/mean_terminated_length": 54.9375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.35798150300979614, "epoch": 0.5805309734513274, "frac_reward_zero_std": 0.5, "grad_norm": 7.141605430777651, "kl": 0.05895555764436722, "learning_rate": 9.939284483479715e-07, "loss": 0.0999, "num_tokens": 5178681.0, "reward": 0.59375, "reward_std": 0.497555673122406, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5644574165344238, "sampling/importance_sampling_ratio/mean": 1.0009150505065918, "sampling/importance_sampling_ratio/min": 0.5880892872810364, "sampling/sampling_logp_difference/max": 0.5308763980865479, "sampling/sampling_logp_difference/mean": 0.02186429128050804, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.609375, "completions/mean_terminated_length": 14.609375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.05067729949951172, "epoch": 0.5823008849557522, "frac_reward_zero_std": 1.0, "grad_norm": 0.5404423741608648, "kl": 0.02486887201666832, "learning_rate": 9.93807874410071e-07, "loss": 0.0002, "num_tokens": 5191696.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.265113115310669, "sampling/importance_sampling_ratio/mean": 0.9971482753753662, "sampling/importance_sampling_ratio/min": 0.7563673257827759, "sampling/sampling_logp_difference/max": 0.279228150844574, "sampling/sampling_logp_difference/mean": 0.009500461630523205, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 15.828125, "completions/mean_terminated_length": 15.828125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1415385901927948, "epoch": 0.584070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.30440635525570386, "kl": 0.1242285966873169, "learning_rate": 9.936861224437372e-07, "loss": 0.0013, "num_tokens": 5205861.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5973737239837646, "sampling/importance_sampling_ratio/mean": 1.0021167993545532, "sampling/importance_sampling_ratio/min": 0.7834390997886658, "sampling/sampling_logp_difference/max": 0.46836090087890625, "sampling/sampling_logp_difference/mean": 0.008564083836972713, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.65625, "completions/mean_terminated_length": 13.65625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.062073707580566406, "epoch": 0.5858407079646017, "frac_reward_zero_std": 1.0, "grad_norm": 0.08148900210167391, "kl": 0.1352459043264389, "learning_rate": 9.935631927394214e-07, "loss": 0.0014, "num_tokens": 5218223.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6912966966629028, "sampling/importance_sampling_ratio/mean": 0.9999576807022095, "sampling/importance_sampling_ratio/min": 0.6084062457084656, "sampling/sampling_logp_difference/max": 0.5254955291748047, "sampling/sampling_logp_difference/mean": 0.011843099258840084, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.96875, "completions/mean_terminated_length": 14.96875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.07408523559570312, "epoch": 0.5876106194690266, "frac_reward_zero_std": 1.0, "grad_norm": 0.37575585050410487, "kl": 0.12586016952991486, "learning_rate": 9.934390855903852e-07, "loss": 0.0011, "num_tokens": 5233069.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.1157681941986084, "sampling/importance_sampling_ratio/mean": 0.9991582632064819, "sampling/importance_sampling_ratio/min": 0.8067606091499329, "sampling/sampling_logp_difference/max": 0.21472835540771484, "sampling/sampling_logp_difference/mean": 0.005963312461972237, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 14.796875, "completions/mean_terminated_length": 14.796875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.20496223866939545, "epoch": 0.5893805309734513, "frac_reward_zero_std": 1.0, "grad_norm": 0.2765369592767717, "kl": 0.2802959978580475, "learning_rate": 9.93313801292698e-07, "loss": 0.003, "num_tokens": 5243936.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.646456241607666, "sampling/importance_sampling_ratio/mean": 1.0015816688537598, "sampling/importance_sampling_ratio/min": 0.5725432634353638, "sampling/sampling_logp_difference/max": 0.5576670169830322, "sampling/sampling_logp_difference/mean": 0.008417494595050812, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 19.640625, "completions/mean_terminated_length": 19.640625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1658213883638382, "epoch": 0.5911504424778761, "frac_reward_zero_std": 0.75, "grad_norm": 38.715214411963814, "kl": 0.1244959682226181, "learning_rate": 9.93187340145239e-07, "loss": -0.506, "num_tokens": 5256345.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4853169918060303, "sampling/importance_sampling_ratio/mean": 0.99802565574646, "sampling/importance_sampling_ratio/min": 0.6318823099136353, "sampling/sampling_logp_difference/max": 0.45905208587646484, "sampling/sampling_logp_difference/mean": 0.017435431480407715, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.16205374896526337, "epoch": 0.5929203539823009, "frac_reward_zero_std": 0.75, "grad_norm": 45.914292343197125, "kl": 0.5228198170661926, "learning_rate": 9.93059702449693e-07, "loss": -0.1383, "num_tokens": 5267081.0, "reward": -0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.230634093284607, "sampling/importance_sampling_ratio/mean": 1.0003182888031006, "sampling/importance_sampling_ratio/min": 0.8197073936462402, "sampling/sampling_logp_difference/max": 0.20752954483032227, "sampling/sampling_logp_difference/mean": 0.0072051300667226315, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 15.390625, "completions/mean_terminated_length": 15.390625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1279073804616928, "epoch": 0.5946902654867257, "frac_reward_zero_std": 0.75, "grad_norm": 31.76875433357429, "kl": 0.5445804595947266, "learning_rate": 9.929308885105534e-07, "loss": -0.2191, "num_tokens": 5279250.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.1882487535476685, "sampling/importance_sampling_ratio/mean": 1.0010457038879395, "sampling/importance_sampling_ratio/min": 0.7639153599739075, "sampling/sampling_logp_difference/max": 0.2692983150482178, "sampling/sampling_logp_difference/mean": 0.00909061636775732, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 16.4375, "completions/mean_terminated_length": 16.4375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.12667866051197052, "epoch": 0.5964601769911504, "frac_reward_zero_std": 0.75, "grad_norm": 50.528642905457026, "kl": 0.21811896562576294, "learning_rate": 9.928008986351186e-07, "loss": -0.4598, "num_tokens": 5290814.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.3493925333023071, "sampling/importance_sampling_ratio/mean": 1.000293493270874, "sampling/importance_sampling_ratio/min": 0.7212910056114197, "sampling/sampling_logp_difference/max": 0.32671260833740234, "sampling/sampling_logp_difference/mean": 0.007266349159181118, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 78.5625, "completions/mean_terminated_length": 78.5625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.40841299295425415, "epoch": 0.5982300884955752, "frac_reward_zero_std": 0.0, "grad_norm": 22.944560152841685, "kl": 0.12588217854499817, "learning_rate": 9.926697331334924e-07, "loss": -0.3419, "num_tokens": 5306290.0, "reward": 0.15625, "reward_std": 0.7297805547714233, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.8606716394424438, "sampling/importance_sampling_ratio/mean": 1.0010006427764893, "sampling/importance_sampling_ratio/min": 0.4402218163013458, "sampling/sampling_logp_difference/max": 0.8204765319824219, "sampling/sampling_logp_difference/mean": 0.02122463285923004, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 69.5, "completions/mean_terminated_length": 69.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3494517505168915, "epoch": 0.6, "frac_reward_zero_std": 1.0, "grad_norm": 0.03945891901988461, "kl": 0.004555393010377884, "learning_rate": 9.925373923185834e-07, "loss": 0.0001, "num_tokens": 5320178.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2857304811477661, "sampling/importance_sampling_ratio/mean": 0.9992601871490479, "sampling/importance_sampling_ratio/min": 0.6282122731208801, "sampling/sampling_logp_difference/max": 0.4648771286010742, "sampling/sampling_logp_difference/mean": 0.012271528132259846, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 86.203125, "completions/mean_terminated_length": 86.203125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.544769287109375, "epoch": 0.6017699115044248, "frac_reward_zero_std": 0.75, "grad_norm": 10.513692113628164, "kl": 0.012212522327899933, "learning_rate": 9.92403876506104e-07, "loss": -0.1197, "num_tokens": 5336207.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5996742248535156, "sampling/importance_sampling_ratio/mean": 1.00087571144104, "sampling/importance_sampling_ratio/min": 0.7218353748321533, "sampling/sampling_logp_difference/max": 0.4697999954223633, "sampling/sampling_logp_difference/mean": 0.018725665286183357, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 50.984375, "completions/mean_terminated_length": 50.984375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.25952285528182983, "epoch": 0.6035398230088496, "frac_reward_zero_std": 1.0, "grad_norm": 0.13402526271688212, "kl": 0.025582045316696167, "learning_rate": 9.922691860145696e-07, "loss": 0.0001, "num_tokens": 5350654.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.56446373462677, "sampling/importance_sampling_ratio/mean": 1.0010714530944824, "sampling/importance_sampling_ratio/min": 0.6776019930839539, "sampling/sampling_logp_difference/max": 0.4475431442260742, "sampling/sampling_logp_difference/mean": 0.015527051873505116, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 51.3125, "completions/mean_terminated_length": 51.3125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.30644503235816956, "epoch": 0.6053097345132743, "frac_reward_zero_std": 1.0, "grad_norm": 0.28312367214821127, "kl": 0.05292034149169922, "learning_rate": 9.921333211652977e-07, "loss": 0.0002, "num_tokens": 5365426.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.364556074142456, "sampling/importance_sampling_ratio/mean": 1.0007081031799316, "sampling/importance_sampling_ratio/min": 0.559749960899353, "sampling/sampling_logp_difference/max": 0.5802650451660156, "sampling/sampling_logp_difference/mean": 0.014037737622857094, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 97.203125, "completions/mean_terminated_length": 97.203125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3141934275627136, "epoch": 0.6070796460176991, "frac_reward_zero_std": 1.0, "grad_norm": 0.040306133956537274, "kl": 0.008490835316479206, "learning_rate": 9.919962822824083e-07, "loss": 0.0001, "num_tokens": 5381807.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.52422034740448, "sampling/importance_sampling_ratio/mean": 1.0011824369430542, "sampling/importance_sampling_ratio/min": 0.6413918733596802, "sampling/sampling_logp_difference/max": 0.44411468505859375, "sampling/sampling_logp_difference/mean": 0.011814025230705738, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 112.453125, "completions/mean_terminated_length": 112.453125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3980545401573181, "epoch": 0.6088495575221239, "frac_reward_zero_std": 1.0, "grad_norm": 0.03816208034133788, "kl": 0.014165371656417847, "learning_rate": 9.918580696928205e-07, "loss": 0.0001, "num_tokens": 5400892.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8213403224945068, "sampling/importance_sampling_ratio/mean": 0.999814510345459, "sampling/importance_sampling_ratio/min": 0.5459651947021484, "sampling/sampling_logp_difference/max": 0.6052000522613525, "sampling/sampling_logp_difference/mean": 0.022808555513620377, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 65.78125, "completions/mean_terminated_length": 65.78125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3655658960342407, "epoch": 0.6106194690265486, "frac_reward_zero_std": 1.0, "grad_norm": 0.08708536054248218, "kl": 0.019657455384731293, "learning_rate": 9.91718683726255e-07, "loss": 0.0001, "num_tokens": 5417822.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4601846933364868, "sampling/importance_sampling_ratio/mean": 1.0012413263320923, "sampling/importance_sampling_ratio/min": 0.6977089643478394, "sampling/sampling_logp_difference/max": 0.37856292724609375, "sampling/sampling_logp_difference/mean": 0.016577202826738358, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 35.328125, "completions/mean_terminated_length": 35.328125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.22649946808815002, "epoch": 0.6123893805309735, "frac_reward_zero_std": 1.0, "grad_norm": 0.2680127798221453, "kl": 0.037691764533519745, "learning_rate": 9.915781247152308e-07, "loss": 0.0002, "num_tokens": 5431411.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6613584756851196, "sampling/importance_sampling_ratio/mean": 1.000566005706787, "sampling/importance_sampling_ratio/min": 0.6990428566932678, "sampling/sampling_logp_difference/max": 0.5076355934143066, "sampling/sampling_logp_difference/mean": 0.012610787525773048, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 80.25, "completions/mean_terminated_length": 80.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.38664960861206055, "epoch": 0.6141592920353982, "frac_reward_zero_std": 1.0, "grad_norm": 0.07839437680152708, "kl": 0.013993092812597752, "learning_rate": 9.914363929950657e-07, "loss": 0.0001, "num_tokens": 5447107.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008046627044678, "sampling/importance_sampling_ratio/min": 0.5363089442253113, "sampling/sampling_logp_difference/max": 1.106311321258545, "sampling/sampling_logp_difference/mean": 0.018526988103985786, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 88.0, "completions/mean_terminated_length": 88.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.35303232073783875, "epoch": 0.6159292035398231, "frac_reward_zero_std": 1.0, "grad_norm": 0.1290413396036071, "kl": 0.03378347307443619, "learning_rate": 9.91293488903875e-07, "loss": 0.0002, "num_tokens": 5464083.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5305931568145752, "sampling/importance_sampling_ratio/mean": 1.0008561611175537, "sampling/importance_sampling_ratio/min": 0.6892523765563965, "sampling/sampling_logp_difference/max": 0.4256553649902344, "sampling/sampling_logp_difference/mean": 0.012805117294192314, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 101.84375, "completions/mean_terminated_length": 101.84375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4014196991920471, "epoch": 0.6176991150442478, "frac_reward_zero_std": 1.0, "grad_norm": 0.05282522173640956, "kl": 0.012553641572594643, "learning_rate": 9.91149412782571e-07, "loss": 0.0001, "num_tokens": 5481881.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4656864404678345, "sampling/importance_sampling_ratio/mean": 0.9996223449707031, "sampling/importance_sampling_ratio/min": 0.6784743070602417, "sampling/sampling_logp_difference/max": 0.3879086971282959, "sampling/sampling_logp_difference/mean": 0.015167336910963058, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 121.859375, "completions/mean_terminated_length": 121.859375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.42137038707733154, "epoch": 0.6194690265486725, "frac_reward_zero_std": 1.0, "grad_norm": 0.17263349465215008, "kl": 0.021598845720291138, "learning_rate": 9.910041649748612e-07, "loss": 0.0002, "num_tokens": 5498240.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998193979263306, "sampling/importance_sampling_ratio/min": 0.42447274923324585, "sampling/sampling_logp_difference/max": 0.8569074869155884, "sampling/sampling_logp_difference/mean": 0.015872254967689514, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 120.828125, "completions/mean_terminated_length": 120.828125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5624607801437378, "epoch": 0.6212389380530974, "frac_reward_zero_std": 1.0, "grad_norm": 0.07907780312988107, "kl": 0.04852885752916336, "learning_rate": 9.908577458272495e-07, "loss": 0.0005, "num_tokens": 5517605.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.870368242263794, "sampling/importance_sampling_ratio/mean": 1.0003533363342285, "sampling/importance_sampling_ratio/min": 0.5663659572601318, "sampling/sampling_logp_difference/max": 0.6261353492736816, "sampling/sampling_logp_difference/mean": 0.021383430808782578, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 130.515625, "completions/mean_terminated_length": 130.515625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.510712742805481, "epoch": 0.6230088495575221, "frac_reward_zero_std": 1.0, "grad_norm": 0.14652883275587417, "kl": 0.037225618958473206, "learning_rate": 9.907101556890331e-07, "loss": 0.0004, "num_tokens": 5535334.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000260829925537, "sampling/importance_sampling_ratio/min": 0.6686792373657227, "sampling/sampling_logp_difference/max": 0.7186784744262695, "sampling/sampling_logp_difference/mean": 0.017669696360826492, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4618055820465088, "epoch": 0.6247787610619469, "frac_reward_zero_std": 1.0, "grad_norm": 0.0578213304417604, "kl": 0.034855760633945465, "learning_rate": 9.905613949123034e-07, "loss": 0.0004, "num_tokens": 5553918.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6922149658203125, "sampling/importance_sampling_ratio/mean": 0.9999423027038574, "sampling/importance_sampling_ratio/min": 0.6325134038925171, "sampling/sampling_logp_difference/max": 0.5260382890701294, "sampling/sampling_logp_difference/mean": 0.01687721721827984, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 122.09375, "completions/mean_terminated_length": 122.09375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6637468338012695, "epoch": 0.6265486725663717, "frac_reward_zero_std": 1.0, "grad_norm": 0.05067320244235177, "kl": 0.02216402254998684, "learning_rate": 9.904114638519443e-07, "loss": 0.0003, "num_tokens": 5573492.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7573411464691162, "sampling/importance_sampling_ratio/mean": 0.9993027448654175, "sampling/importance_sampling_ratio/min": 0.566085159778595, "sampling/sampling_logp_difference/max": 0.5690107345581055, "sampling/sampling_logp_difference/mean": 0.02189033478498459, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 108.984375, "completions/mean_terminated_length": 108.984375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.5064927339553833, "epoch": 0.6283185840707964, "frac_reward_zero_std": 1.0, "grad_norm": 0.022769992272464255, "kl": 0.003910430707037449, "learning_rate": 9.902603628656311e-07, "loss": 0.0, "num_tokens": 5590211.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4734493494033813, "sampling/importance_sampling_ratio/mean": 1.0005618333816528, "sampling/importance_sampling_ratio/min": 0.6874695420265198, "sampling/sampling_logp_difference/max": 0.387606143951416, "sampling/sampling_logp_difference/mean": 0.017741823568940163, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 146.703125, "completions/mean_terminated_length": 146.703125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5128886103630066, "epoch": 0.6300884955752213, "frac_reward_zero_std": 1.0, "grad_norm": 0.05910555125404528, "kl": 0.06985120475292206, "learning_rate": 9.901080923138308e-07, "loss": 0.0006, "num_tokens": 5622528.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000289797782898, "sampling/importance_sampling_ratio/min": 0.6196466088294983, "sampling/sampling_logp_difference/max": 0.8414640426635742, "sampling/sampling_logp_difference/mean": 0.01770433411002159, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 178.484375, "completions/mean_terminated_length": 178.484375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4746066629886627, "epoch": 0.631858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.05989841681630618, "kl": 0.027999769896268845, "learning_rate": 9.899546525597997e-07, "loss": 0.0003, "num_tokens": 5644367.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.527292013168335, "sampling/importance_sampling_ratio/mean": 0.9995212554931641, "sampling/importance_sampling_ratio/min": 0.44542253017425537, "sampling/sampling_logp_difference/max": 0.8087320327758789, "sampling/sampling_logp_difference/mean": 0.017029207199811935, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 112.890625, "completions/mean_terminated_length": 112.890625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.620051383972168, "epoch": 0.6336283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.026391207446395157, "kl": 0.005611338187009096, "learning_rate": 9.898000439695843e-07, "loss": 0.0001, "num_tokens": 5661352.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9880595207214355, "sampling/importance_sampling_ratio/mean": 1.0005333423614502, "sampling/importance_sampling_ratio/min": 0.7001907825469971, "sampling/sampling_logp_difference/max": 0.6871590614318848, "sampling/sampling_logp_difference/mean": 0.02070365846157074, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 122.03125, "completions/mean_terminated_length": 122.03125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.5711897611618042, "epoch": 0.6353982300884956, "frac_reward_zero_std": 1.0, "grad_norm": 0.36483039142888785, "kl": 0.0318559929728508, "learning_rate": 9.896442669120187e-07, "loss": 0.0003, "num_tokens": 5679354.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.430940866470337, "sampling/importance_sampling_ratio/mean": 0.9996042847633362, "sampling/importance_sampling_ratio/min": 0.4420183300971985, "sampling/sampling_logp_difference/max": 0.816403865814209, "sampling/sampling_logp_difference/mean": 0.019348707050085068, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 77.421875, "completions/mean_terminated_length": 77.421875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.31168049573898315, "epoch": 0.6371681415929203, "frac_reward_zero_std": 1.0, "grad_norm": 0.03541548343871878, "kl": 0.009070035070180893, "learning_rate": 9.894873217587245e-07, "loss": 0.0001, "num_tokens": 5697653.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5912870168685913, "sampling/importance_sampling_ratio/mean": 1.0003658533096313, "sampling/importance_sampling_ratio/min": 0.7069886326789856, "sampling/sampling_logp_difference/max": 0.4645431637763977, "sampling/sampling_logp_difference/mean": 0.01266229897737503, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 158.796875, "completions/mean_terminated_length": 158.796875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.5861371755599976, "epoch": 0.6389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 2.910836183478518, "kl": 0.04382926970720291, "learning_rate": 9.893292088841108e-07, "loss": 0.046, "num_tokens": 5719336.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6420366764068604, "sampling/importance_sampling_ratio/mean": 1.0004842281341553, "sampling/importance_sampling_ratio/min": 0.48981279134750366, "sampling/sampling_logp_difference/max": 0.7137320041656494, "sampling/sampling_logp_difference/mean": 0.01918848231434822, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 110.828125, "completions/mean_terminated_length": 110.828125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.431569904088974, "epoch": 0.6407079646017699, "frac_reward_zero_std": 0.75, "grad_norm": 4.212237165655271, "kl": 0.0400247722864151, "learning_rate": 9.891699286653712e-07, "loss": -0.0838, "num_tokens": 5740701.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.780729055404663, "sampling/importance_sampling_ratio/mean": 1.0001611709594727, "sampling/importance_sampling_ratio/min": 0.5716253519058228, "sampling/sampling_logp_difference/max": 0.5770227909088135, "sampling/sampling_logp_difference/mean": 0.01780252903699875, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 185.890625, "completions/mean_terminated_length": 185.890625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.48092374205589294, "epoch": 0.6424778761061947, "frac_reward_zero_std": 0.5, "grad_norm": 2.68364450863825, "kl": 0.029903601855039597, "learning_rate": 9.890094814824852e-07, "loss": -0.0082, "num_tokens": 5764518.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4594154357910156, "sampling/importance_sampling_ratio/mean": 1.0001177787780762, "sampling/importance_sampling_ratio/min": 0.4550616145133972, "sampling/sampling_logp_difference/max": 0.7873225212097168, "sampling/sampling_logp_difference/mean": 0.017083294689655304, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 98.375, "completions/mean_terminated_length": 98.375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4051584005355835, "epoch": 0.6442477876106195, "frac_reward_zero_std": 1.0, "grad_norm": 0.034731977046040044, "kl": 0.013031707145273685, "learning_rate": 9.888478677182154e-07, "loss": 0.0001, "num_tokens": 5782318.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3868625164031982, "sampling/importance_sampling_ratio/mean": 1.0004628896713257, "sampling/importance_sampling_ratio/min": 0.6924977898597717, "sampling/sampling_logp_difference/max": 0.3674502372741699, "sampling/sampling_logp_difference/mean": 0.013802760280668736, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 104.421875, "completions/mean_terminated_length": 104.421875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.4693533480167389, "epoch": 0.6460176991150443, "frac_reward_zero_std": 0.75, "grad_norm": 3.0831511432804044, "kl": 0.024606188759207726, "learning_rate": 9.886850877581078e-07, "loss": -0.1668, "num_tokens": 5801369.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9070464372634888, "sampling/importance_sampling_ratio/mean": 1.00021493434906, "sampling/importance_sampling_ratio/min": 0.4210282862186432, "sampling/sampling_logp_difference/max": 0.8650553226470947, "sampling/sampling_logp_difference/mean": 0.016472456976771355, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 86.09375, "completions/mean_terminated_length": 86.09375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.3831039369106293, "epoch": 0.647787610619469, "frac_reward_zero_std": 1.0, "grad_norm": 0.05046837503924455, "kl": 0.008188454434275627, "learning_rate": 9.885211419904903e-07, "loss": 0.0001, "num_tokens": 5816703.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4862701892852783, "sampling/importance_sampling_ratio/mean": 0.9999388456344604, "sampling/importance_sampling_ratio/min": 0.7058389782905579, "sampling/sampling_logp_difference/max": 0.3962697982788086, "sampling/sampling_logp_difference/mean": 0.018592428416013718, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 82.09375, "completions/mean_terminated_length": 82.09375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.25472283363342285, "epoch": 0.6495575221238938, "frac_reward_zero_std": 1.0, "grad_norm": 0.04646732187995595, "kl": 0.018464451655745506, "learning_rate": 9.883560308064722e-07, "loss": 0.0001, "num_tokens": 5834549.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5658870935440063, "sampling/importance_sampling_ratio/mean": 0.9994906187057495, "sampling/importance_sampling_ratio/min": 0.7226534485816956, "sampling/sampling_logp_difference/max": 0.4484524726867676, "sampling/sampling_logp_difference/mean": 0.013782997615635395, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 105.5625, "completions/mean_terminated_length": 105.5625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.3836281895637512, "epoch": 0.6513274336283186, "frac_reward_zero_std": 1.0, "grad_norm": 0.05150506273323067, "kl": 0.016738034784793854, "learning_rate": 9.881897545999429e-07, "loss": 0.0002, "num_tokens": 5852649.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4687012434005737, "sampling/importance_sampling_ratio/mean": 1.0002315044403076, "sampling/importance_sampling_ratio/min": 0.6318727731704712, "sampling/sampling_logp_difference/max": 0.4590672254562378, "sampling/sampling_logp_difference/mean": 0.013846302404999733, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 119.6875, "completions/mean_terminated_length": 119.6875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.4400315582752228, "epoch": 0.6530973451327433, "frac_reward_zero_std": 1.0, "grad_norm": 0.04148350669275565, "kl": 0.017446907237172127, "learning_rate": 9.880223137675707e-07, "loss": 0.0002, "num_tokens": 5871701.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.461326003074646, "sampling/importance_sampling_ratio/mean": 1.0014960765838623, "sampling/importance_sampling_ratio/min": 0.6738423109054565, "sampling/sampling_logp_difference/max": 0.3947591781616211, "sampling/sampling_logp_difference/mean": 0.017619699239730835, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 77.9375, "completions/mean_terminated_length": 77.9375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.16520312428474426, "epoch": 0.6548672566371682, "frac_reward_zero_std": 0.75, "grad_norm": 6.952300200899005, "kl": 0.03090791031718254, "learning_rate": 9.87853708708803e-07, "loss": 0.1817, "num_tokens": 5892465.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.707594871520996, "sampling/importance_sampling_ratio/mean": 0.9993278980255127, "sampling/importance_sampling_ratio/min": 0.6782475709915161, "sampling/sampling_logp_difference/max": 0.535085916519165, "sampling/sampling_logp_difference/mean": 0.012654388323426247, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 113.5625, "completions/mean_terminated_length": 113.5625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3643713891506195, "epoch": 0.6566371681415929, "frac_reward_zero_std": 0.75, "grad_norm": 7.294525677618649, "kl": 0.0199178084731102, "learning_rate": 9.876839398258639e-07, "loss": 0.1817, "num_tokens": 5911253.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5951619148254395, "sampling/importance_sampling_ratio/mean": 1.0007092952728271, "sampling/importance_sampling_ratio/min": 0.39645707607269287, "sampling/sampling_logp_difference/max": 0.9251875877380371, "sampling/sampling_logp_difference/mean": 0.014422750100493431, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 106.671875, "completions/mean_terminated_length": 106.671875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.39586547017097473, "epoch": 0.6584070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.02170165706755283, "kl": 0.00442938506603241, "learning_rate": 9.875130075237543e-07, "loss": 0.0, "num_tokens": 5928768.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6555458307266235, "sampling/importance_sampling_ratio/mean": 1.0000089406967163, "sampling/importance_sampling_ratio/min": 0.7011512517929077, "sampling/sampling_logp_difference/max": 0.5041307210922241, "sampling/sampling_logp_difference/mean": 0.014346872456371784, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 71.5, "completions/mean_terminated_length": 71.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.11664820462465286, "epoch": 0.6601769911504425, "frac_reward_zero_std": 0.75, "grad_norm": 6.413113126385533, "kl": 0.0214911587536335, "learning_rate": 9.873409122102503e-07, "loss": 0.185, "num_tokens": 5946384.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7430909872055054, "sampling/importance_sampling_ratio/mean": 0.9998141527175903, "sampling/importance_sampling_ratio/min": 0.7246072292327881, "sampling/sampling_logp_difference/max": 0.5556598901748657, "sampling/sampling_logp_difference/mean": 0.007141374982893467, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 99.90625, "completions/mean_terminated_length": 99.90625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.49403318762779236, "epoch": 0.6619469026548672, "frac_reward_zero_std": 0.75, "grad_norm": 2.065293362113313, "kl": 0.009674089029431343, "learning_rate": 9.87167654295903e-07, "loss": 0.083, "num_tokens": 5965722.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5598499774932861, "sampling/importance_sampling_ratio/mean": 0.9995803833007812, "sampling/importance_sampling_ratio/min": 0.5161864161491394, "sampling/sampling_logp_difference/max": 0.6612873077392578, "sampling/sampling_logp_difference/mean": 0.018438229337334633, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 49.8125, "completions/mean_terminated_length": 49.8125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.1185794249176979, "epoch": 0.6637168141592921, "frac_reward_zero_std": 1.0, "grad_norm": 0.0955250497511546, "kl": 0.015874072909355164, "learning_rate": 9.869932341940358e-07, "loss": 0.0001, "num_tokens": 5981086.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0023404359817505, "sampling/importance_sampling_ratio/min": 0.5194261074066162, "sampling/sampling_logp_difference/max": 0.9786150455474854, "sampling/sampling_logp_difference/mean": 0.023584650829434395, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 42.40625, "completions/mean_terminated_length": 42.40625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.18201380968093872, "epoch": 0.6654867256637168, "frac_reward_zero_std": 1.0, "grad_norm": 0.18587050428707652, "kl": 0.0745650976896286, "learning_rate": 9.868176523207463e-07, "loss": 0.0004, "num_tokens": 5994552.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3351178169250488, "sampling/importance_sampling_ratio/mean": 0.9991710186004639, "sampling/importance_sampling_ratio/min": 0.5561547875404358, "sampling/sampling_logp_difference/max": 0.586708664894104, "sampling/sampling_logp_difference/mean": 0.014691246673464775, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 74.796875, "completions/mean_terminated_length": 74.796875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.30755698680877686, "epoch": 0.6672566371681415, "frac_reward_zero_std": 1.0, "grad_norm": 0.10144206392078267, "kl": 0.0294176135212183, "learning_rate": 9.86640909094902e-07, "loss": 0.0001, "num_tokens": 6008331.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3973352909088135, "sampling/importance_sampling_ratio/mean": 0.9999659657478333, "sampling/importance_sampling_ratio/min": 0.40352514386177063, "sampling/sampling_logp_difference/max": 0.9075164794921875, "sampling/sampling_logp_difference/mean": 0.014053567312657833, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 66.953125, "completions/mean_terminated_length": 66.953125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.25754937529563904, "epoch": 0.6690265486725664, "frac_reward_zero_std": 0.75, "grad_norm": 5.230792858420169, "kl": 0.024930693209171295, "learning_rate": 9.864630049381424e-07, "loss": 0.0888, "num_tokens": 6022776.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4337999820709229, "sampling/importance_sampling_ratio/mean": 0.9992684721946716, "sampling/importance_sampling_ratio/min": 0.6020495295524597, "sampling/sampling_logp_difference/max": 0.5074155330657959, "sampling/sampling_logp_difference/mean": 0.009889904409646988, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 43.34375, "completions/mean_terminated_length": 43.34375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.17238153517246246, "epoch": 0.6707964601769911, "frac_reward_zero_std": 1.0, "grad_norm": 0.1524987210184169, "kl": 0.1290230005979538, "learning_rate": 9.862839402748753e-07, "loss": 0.0006, "num_tokens": 6036398.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5416227579116821, "sampling/importance_sampling_ratio/mean": 1.0005288124084473, "sampling/importance_sampling_ratio/min": 0.6389191746711731, "sampling/sampling_logp_difference/max": 0.44797730445861816, "sampling/sampling_logp_difference/mean": 0.010033435188233852, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 35.71875, "completions/mean_terminated_length": 35.71875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.1937960535287857, "epoch": 0.672566371681416, "frac_reward_zero_std": 0.75, "grad_norm": 8.898301167597465, "kl": 0.202432781457901, "learning_rate": 9.861037155322776e-07, "loss": -0.178, "num_tokens": 6049132.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.343531608581543, "sampling/importance_sampling_ratio/mean": 1.001887321472168, "sampling/importance_sampling_ratio/min": 0.6436159610748291, "sampling/sampling_logp_difference/max": 0.44065308570861816, "sampling/sampling_logp_difference/mean": 0.013763444498181343, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 38.84375, "completions/mean_terminated_length": 38.84375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.26760339736938477, "epoch": 0.6743362831858407, "frac_reward_zero_std": 0.5, "grad_norm": 30.52928589488721, "kl": 0.09609369188547134, "learning_rate": 9.859223311402936e-07, "loss": -0.4911, "num_tokens": 6062402.0, "reward": -0.03125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7370104789733887, "sampling/importance_sampling_ratio/mean": 1.0004743337631226, "sampling/importance_sampling_ratio/min": 0.7212648987770081, "sampling/sampling_logp_difference/max": 0.5521655082702637, "sampling/sampling_logp_difference/mean": 0.014504313468933105, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 22.125, "completions/mean_terminated_length": 22.125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.1711660772562027, "epoch": 0.6761061946902654, "frac_reward_zero_std": 0.75, "grad_norm": 24.017188586007887, "kl": 0.3548486530780792, "learning_rate": 9.85739787531634e-07, "loss": -0.3051, "num_tokens": 6074586.0, "reward": -0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6289836168289185, "sampling/importance_sampling_ratio/mean": 1.0003910064697266, "sampling/importance_sampling_ratio/min": 0.5489668846130371, "sampling/sampling_logp_difference/max": 0.5997171401977539, "sampling/sampling_logp_difference/mean": 0.013700541108846664, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 59.65625, "completions/mean_terminated_length": 59.65625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.17006395757198334, "epoch": 0.6778761061946903, "frac_reward_zero_std": 1.0, "grad_norm": 0.06476926527288644, "kl": 0.22285808622837067, "learning_rate": 9.85556085141775e-07, "loss": 0.0008, "num_tokens": 6088740.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5189270973205566, "sampling/importance_sampling_ratio/mean": 0.9982098340988159, "sampling/importance_sampling_ratio/min": 0.7894731760025024, "sampling/sampling_logp_difference/max": 0.41800427436828613, "sampling/sampling_logp_difference/mean": 0.009094467386603355, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 20.15625, "completions/mean_terminated_length": 20.15625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.15394248068332672, "epoch": 0.679646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 21.141641074846657, "kl": 0.3241913914680481, "learning_rate": 9.853712244089572e-07, "loss": 0.2209, "num_tokens": 6101086.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.425696611404419, "sampling/importance_sampling_ratio/mean": 1.0021278858184814, "sampling/importance_sampling_ratio/min": 0.6257392168045044, "sampling/sampling_logp_difference/max": 0.46882152557373047, "sampling/sampling_logp_difference/mean": 0.010466893203556538, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.1875, "completions/mean_terminated_length": 17.1875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.06128706410527229, "epoch": 0.6814159292035398, "frac_reward_zero_std": 1.0, "grad_norm": 0.4302172309995353, "kl": 0.2754594683647156, "learning_rate": 9.851852057741844e-07, "loss": 0.0027, "num_tokens": 6114986.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.1785469055175781, "sampling/importance_sampling_ratio/mean": 0.998342752456665, "sampling/importance_sampling_ratio/min": 0.8395922780036926, "sampling/sampling_logp_difference/max": 0.17483890056610107, "sampling/sampling_logp_difference/mean": 0.0034622247330844402, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.09375, "completions/mean_terminated_length": 17.09375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.07266160845756531, "epoch": 0.6831858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.32445920449965693, "kl": 0.22553634643554688, "learning_rate": 9.849980296812231e-07, "loss": 0.0022, "num_tokens": 6126784.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3831980228424072, "sampling/importance_sampling_ratio/mean": 0.9980471134185791, "sampling/importance_sampling_ratio/min": 0.7396202683448792, "sampling/sampling_logp_difference/max": 0.3243982791900635, "sampling/sampling_logp_difference/mean": 0.01246151514351368, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 50.0625, "completions/mean_terminated_length": 50.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.36946654319763184, "epoch": 0.6849557522123894, "frac_reward_zero_std": 0.75, "grad_norm": 10.48257294738574, "kl": 0.0802491158246994, "learning_rate": 9.848096965766002e-07, "loss": -0.1169, "num_tokens": 6141316.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.3859869241714478, "sampling/importance_sampling_ratio/mean": 0.9985344409942627, "sampling/importance_sampling_ratio/min": 0.561350405216217, "sampling/sampling_logp_difference/max": 0.5774099826812744, "sampling/sampling_logp_difference/mean": 0.017032448202371597, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 61.625, "completions/mean_terminated_length": 61.625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.19366797804832458, "epoch": 0.6867256637168142, "frac_reward_zero_std": 0.5, "grad_norm": 10.91991659893057, "kl": 0.08208681643009186, "learning_rate": 9.846202069096038e-07, "loss": -0.2283, "num_tokens": 6157004.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6599446535110474, "sampling/importance_sampling_ratio/mean": 1.0001928806304932, "sampling/importance_sampling_ratio/min": 0.2644866704940796, "sampling/sampling_logp_difference/max": 1.3299643993377686, "sampling/sampling_logp_difference/mean": 0.010109896771609783, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 45.34375, "completions/mean_terminated_length": 45.34375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.24957646429538727, "epoch": 0.6884955752212389, "frac_reward_zero_std": 0.75, "grad_norm": 7.634748076840416, "kl": 0.20094981789588928, "learning_rate": 9.844295611322803e-07, "loss": -0.1671, "num_tokens": 6174466.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3965059518814087, "sampling/importance_sampling_ratio/mean": 0.9998543858528137, "sampling/importance_sampling_ratio/min": 0.7887519001960754, "sampling/sampling_logp_difference/max": 0.33397340774536133, "sampling/sampling_logp_difference/mean": 0.0095017459243536, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 112.140625, "completions/mean_terminated_length": 112.140625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2890416085720062, "epoch": 0.6902654867256637, "frac_reward_zero_std": 0.25, "grad_norm": 7.721867954809243, "kl": 0.016003964468836784, "learning_rate": 9.842377596994344e-07, "loss": 0.9181, "num_tokens": 6194203.0, "reward": 0.875, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.59376060962677, "sampling/importance_sampling_ratio/mean": 1.0002254247665405, "sampling/importance_sampling_ratio/min": 0.6421988010406494, "sampling/sampling_logp_difference/max": 0.4660964012145996, "sampling/sampling_logp_difference/mean": 0.010162144899368286, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 61.90625, "completions/mean_terminated_length": 61.90625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.39551448822021484, "epoch": 0.6920353982300885, "frac_reward_zero_std": 0.75, "grad_norm": 9.681739496403345, "kl": 0.016538335010409355, "learning_rate": 9.84044803068628e-07, "loss": -0.1438, "num_tokens": 6208901.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.596825122833252, "sampling/importance_sampling_ratio/mean": 1.0015642642974854, "sampling/importance_sampling_ratio/min": 0.6489729285240173, "sampling/sampling_logp_difference/max": 0.4680173397064209, "sampling/sampling_logp_difference/mean": 0.018785130232572556, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 60.109375, "completions/mean_terminated_length": 60.109375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3214569389820099, "epoch": 0.6938053097345133, "frac_reward_zero_std": 1.0, "grad_norm": 0.057294202411554675, "kl": 0.011079341173171997, "learning_rate": 9.838506917001784e-07, "loss": 0.0001, "num_tokens": 6223164.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.2777345180511475, "sampling/importance_sampling_ratio/mean": 1.0013971328735352, "sampling/importance_sampling_ratio/min": 0.6055299639701843, "sampling/sampling_logp_difference/max": 0.5016512870788574, "sampling/sampling_logp_difference/mean": 0.013057949021458626, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 140.515625, "completions/mean_terminated_length": 140.515625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4342554211616516, "epoch": 0.695575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 2.046959634621339, "kl": 0.013974161818623543, "learning_rate": 9.836554260571577e-07, "loss": -0.0494, "num_tokens": 6242509.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4464900493621826, "sampling/importance_sampling_ratio/mean": 1.0002140998840332, "sampling/importance_sampling_ratio/min": 0.4558376967906952, "sampling/sampling_logp_difference/max": 0.7856185436248779, "sampling/sampling_logp_difference/mean": 0.016138717532157898, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 133.3125, "completions/mean_terminated_length": 133.3125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4057089686393738, "epoch": 0.6973451327433628, "frac_reward_zero_std": 0.75, "grad_norm": 3.5408018723691908, "kl": 0.008893290534615517, "learning_rate": 9.834590066053917e-07, "loss": 0.4581, "num_tokens": 6262081.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6015470027923584, "sampling/importance_sampling_ratio/mean": 0.9998269081115723, "sampling/importance_sampling_ratio/min": 0.540658175945282, "sampling/sampling_logp_difference/max": 0.6149680614471436, "sampling/sampling_logp_difference/mean": 0.013680309988558292, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 88.953125, "completions/mean_terminated_length": 88.953125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.446719229221344, "epoch": 0.6991150442477876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0329129135848926, "kl": 0.007592624984681606, "learning_rate": 9.832614338134595e-07, "loss": 0.0001, "num_tokens": 6276606.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4540531635284424, "sampling/importance_sampling_ratio/mean": 1.0001907348632812, "sampling/importance_sampling_ratio/min": 0.6069669127464294, "sampling/sampling_logp_difference/max": 0.4992809295654297, "sampling/sampling_logp_difference/mean": 0.01567765697836876, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 108.09375, "completions/mean_terminated_length": 108.09375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.41068732738494873, "epoch": 0.7008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.03007958844071075, "kl": 0.007306972984224558, "learning_rate": 9.8306270815269e-07, "loss": 0.0001, "num_tokens": 6293428.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7739129066467285, "sampling/importance_sampling_ratio/mean": 1.0002840757369995, "sampling/importance_sampling_ratio/min": 0.5545526742935181, "sampling/sampling_logp_difference/max": 0.5895934104919434, "sampling/sampling_logp_difference/mean": 0.016644949093461037, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 96.875, "completions/mean_terminated_length": 96.875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4401080906391144, "epoch": 0.7026548672566372, "frac_reward_zero_std": 0.75, "grad_norm": 3.4586871026462416, "kl": 0.010626133531332016, "learning_rate": 9.828628300971638e-07, "loss": -0.0859, "num_tokens": 6310460.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5749874114990234, "sampling/importance_sampling_ratio/mean": 1.0006617307662964, "sampling/importance_sampling_ratio/min": 0.5700552463531494, "sampling/sampling_logp_difference/max": 0.5620219707489014, "sampling/sampling_logp_difference/mean": 0.018485158681869507, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 48.0625, "completions/mean_terminated_length": 48.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3010576069355011, "epoch": 0.7044247787610619, "frac_reward_zero_std": 1.0, "grad_norm": 0.047274504860197054, "kl": 0.009327048435807228, "learning_rate": 9.826618001237099e-07, "loss": 0.0001, "num_tokens": 6324384.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6009161472320557, "sampling/importance_sampling_ratio/mean": 0.9997097253799438, "sampling/importance_sampling_ratio/min": 0.7120568752288818, "sampling/sampling_logp_difference/max": 0.47057604789733887, "sampling/sampling_logp_difference/mean": 0.011248650029301643, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 65.53125, "completions/mean_terminated_length": 65.53125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3446110188961029, "epoch": 0.7061946902654868, "frac_reward_zero_std": 0.75, "grad_norm": 13.730077407054516, "kl": 0.015162178315222263, "learning_rate": 9.82459618711906e-07, "loss": 0.275, "num_tokens": 6339490.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998742341995239, "sampling/importance_sampling_ratio/min": 0.5365955829620361, "sampling/sampling_logp_difference/max": 0.877489447593689, "sampling/sampling_logp_difference/mean": 0.014317413792014122, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 61.59375, "completions/mean_terminated_length": 61.59375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3405458927154541, "epoch": 0.7079646017699115, "frac_reward_zero_std": 1.0, "grad_norm": 0.11617255689871578, "kl": 0.023211687803268433, "learning_rate": 9.822562863440755e-07, "loss": 0.0002, "num_tokens": 6357672.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3631829023361206, "sampling/importance_sampling_ratio/mean": 1.0004823207855225, "sampling/importance_sampling_ratio/min": 0.6919498443603516, "sampling/sampling_logp_difference/max": 0.3682417869567871, "sampling/sampling_logp_difference/mean": 0.01271877158433199, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 91.984375, "completions/mean_terminated_length": 91.984375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3686937987804413, "epoch": 0.7097345132743362, "frac_reward_zero_std": 1.0, "grad_norm": 0.05523325223140905, "kl": 0.01296952087432146, "learning_rate": 9.820518035052889e-07, "loss": 0.0001, "num_tokens": 6374407.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4117615222930908, "sampling/importance_sampling_ratio/mean": 1.0009119510650635, "sampling/importance_sampling_ratio/min": 0.6341397166252136, "sampling/sampling_logp_difference/max": 0.4554860591888428, "sampling/sampling_logp_difference/mean": 0.013343364000320435, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 69.921875, "completions/mean_terminated_length": 69.921875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.32802262902259827, "epoch": 0.7115044247787611, "frac_reward_zero_std": 1.0, "grad_norm": 0.08481747708423726, "kl": 0.017437882721424103, "learning_rate": 9.818461706833602e-07, "loss": 0.0001, "num_tokens": 6388386.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3902863264083862, "sampling/importance_sampling_ratio/mean": 0.9997358322143555, "sampling/importance_sampling_ratio/min": 0.6693275570869446, "sampling/sampling_logp_difference/max": 0.4014817476272583, "sampling/sampling_logp_difference/mean": 0.013436712324619293, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 71.171875, "completions/mean_terminated_length": 71.171875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3199332654476166, "epoch": 0.7132743362831858, "frac_reward_zero_std": 0.75, "grad_norm": 11.113427158524635, "kl": 0.01687893643975258, "learning_rate": 9.816393883688475e-07, "loss": -0.3956, "num_tokens": 6403565.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.577026605606079, "sampling/importance_sampling_ratio/mean": 0.9993441700935364, "sampling/importance_sampling_ratio/min": 0.5030588507652283, "sampling/sampling_logp_difference/max": 0.6870481967926025, "sampling/sampling_logp_difference/mean": 0.013590632006525993, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 82.15625, "completions/mean_terminated_length": 82.15625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.34127122163772583, "epoch": 0.7150442477876107, "frac_reward_zero_std": 0.75, "grad_norm": 2.6826531843832764, "kl": 0.020201334729790688, "learning_rate": 9.814314570550505e-07, "loss": -0.0153, "num_tokens": 6424359.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.437758445739746, "sampling/importance_sampling_ratio/mean": 0.9995735883712769, "sampling/importance_sampling_ratio/min": 0.4791787564754486, "sampling/sampling_logp_difference/max": 0.7356815338134766, "sampling/sampling_logp_difference/mean": 0.013350830413401127, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 59.859375, "completions/mean_terminated_length": 59.859375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3270696699619293, "epoch": 0.7168141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.06489359238192514, "kl": 0.022430576384067535, "learning_rate": 9.812223772380105e-07, "loss": 0.0002, "num_tokens": 6443438.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5900388956069946, "sampling/importance_sampling_ratio/mean": 0.9994983077049255, "sampling/importance_sampling_ratio/min": 0.645928144454956, "sampling/sampling_logp_difference/max": 0.4637584686279297, "sampling/sampling_logp_difference/mean": 0.014710272662341595, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 82.078125, "completions/mean_terminated_length": 82.078125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.41037648916244507, "epoch": 0.7185840707964601, "frac_reward_zero_std": 1.0, "grad_norm": 0.04683548653083853, "kl": 0.02197330817580223, "learning_rate": 9.810121494165087e-07, "loss": 0.0001, "num_tokens": 6459123.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4226871728897095, "sampling/importance_sampling_ratio/mean": 0.9984689354896545, "sampling/importance_sampling_ratio/min": 0.6845190525054932, "sampling/sampling_logp_difference/max": 0.37903881072998047, "sampling/sampling_logp_difference/mean": 0.018058765679597855, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.29711028933525085, "epoch": 0.720353982300885, "frac_reward_zero_std": 1.0, "grad_norm": 0.07698501154185808, "kl": 0.024243097752332687, "learning_rate": 9.808007740920645e-07, "loss": 0.0001, "num_tokens": 6472723.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.2674616575241089, "sampling/importance_sampling_ratio/mean": 1.0002200603485107, "sampling/importance_sampling_ratio/min": 0.5505961775779724, "sampling/sampling_logp_difference/max": 0.5967535972595215, "sampling/sampling_logp_difference/mean": 0.012171144597232342, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 121.03125, "completions/mean_terminated_length": 121.03125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4948740005493164, "epoch": 0.7221238938053097, "frac_reward_zero_std": 0.5, "grad_norm": 7.680780315466812, "kl": 0.015606466680765152, "learning_rate": 9.80588251768935e-07, "loss": 0.1649, "num_tokens": 6491157.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3771916627883911, "sampling/importance_sampling_ratio/mean": 0.999821126461029, "sampling/importance_sampling_ratio/min": 0.7080368399620056, "sampling/sampling_logp_difference/max": 0.3452591896057129, "sampling/sampling_logp_difference/mean": 0.017087643966078758, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 67.546875, "completions/mean_terminated_length": 67.546875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2259364277124405, "epoch": 0.7238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.059875820545522006, "kl": 0.023707356303930283, "learning_rate": 9.803745829541137e-07, "loss": 0.0001, "num_tokens": 6507720.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2575604915618896, "sampling/importance_sampling_ratio/mean": 0.998282790184021, "sampling/importance_sampling_ratio/min": 0.3100183606147766, "sampling/sampling_logp_difference/max": 1.171123743057251, "sampling/sampling_logp_difference/mean": 0.011251486837863922, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 111.15625, "completions/mean_terminated_length": 111.15625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4915921688079834, "epoch": 0.7256637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 9.162438830181475, "kl": 0.011929306201636791, "learning_rate": 9.801597681573289e-07, "loss": -0.306, "num_tokens": 6524306.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002862215042114, "sampling/importance_sampling_ratio/min": 0.619353711605072, "sampling/sampling_logp_difference/max": 0.8039919137954712, "sampling/sampling_logp_difference/mean": 0.019170641899108887, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 94.578125, "completions/mean_terminated_length": 94.578125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.373543918132782, "epoch": 0.727433628318584, "frac_reward_zero_std": 1.0, "grad_norm": 0.05289658409649595, "kl": 0.017881829291582108, "learning_rate": 9.799438078910432e-07, "loss": 0.0001, "num_tokens": 6539895.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3026437759399414, "sampling/importance_sampling_ratio/mean": 1.0004127025604248, "sampling/importance_sampling_ratio/min": 0.6380023956298828, "sampling/sampling_logp_difference/max": 0.4494132995605469, "sampling/sampling_logp_difference/mean": 0.014169419184327126, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 94.734375, "completions/mean_terminated_length": 94.734375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.25864890217781067, "epoch": 0.7292035398230089, "frac_reward_zero_std": 1.0, "grad_norm": 0.03554165373600353, "kl": 0.011999612674117088, "learning_rate": 9.797267026704514e-07, "loss": 0.0001, "num_tokens": 6558230.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3425296545028687, "sampling/importance_sampling_ratio/mean": 0.9993278980255127, "sampling/importance_sampling_ratio/min": 0.6613734364509583, "sampling/sampling_logp_difference/max": 0.4134366512298584, "sampling/sampling_logp_difference/mean": 0.011325197294354439, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 96.0625, "completions/mean_terminated_length": 96.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.38927680253982544, "epoch": 0.7309734513274336, "frac_reward_zero_std": 1.0, "grad_norm": 0.03206544035807514, "kl": 0.009823622182011604, "learning_rate": 9.7950845301348e-07, "loss": 0.0001, "num_tokens": 6577162.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4312158823013306, "sampling/importance_sampling_ratio/mean": 1.0001397132873535, "sampling/importance_sampling_ratio/min": 0.41807880997657776, "sampling/sampling_logp_difference/max": 0.8720853328704834, "sampling/sampling_logp_difference/mean": 0.015506758354604244, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 95.359375, "completions/mean_terminated_length": 95.359375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2924111783504486, "epoch": 0.7327433628318584, "frac_reward_zero_std": 0.75, "grad_norm": 3.3881119270730435, "kl": 0.015888947993516922, "learning_rate": 9.792890594407855e-07, "loss": -0.0539, "num_tokens": 6601089.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.449302315711975, "sampling/importance_sampling_ratio/mean": 1.0000194311141968, "sampling/importance_sampling_ratio/min": 0.5437875986099243, "sampling/sampling_logp_difference/max": 0.6091965436935425, "sampling/sampling_logp_difference/mean": 0.011562496423721313, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 96.40625, "completions/mean_terminated_length": 96.40625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.29087594151496887, "epoch": 0.7345132743362832, "frac_reward_zero_std": 1.0, "grad_norm": 0.032274670059917146, "kl": 0.00947969127446413, "learning_rate": 9.790685224757532e-07, "loss": 0.0001, "num_tokens": 6623995.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4449760913848877, "sampling/importance_sampling_ratio/mean": 0.9994337558746338, "sampling/importance_sampling_ratio/min": 0.5143414735794067, "sampling/sampling_logp_difference/max": 0.6648678779602051, "sampling/sampling_logp_difference/mean": 0.016016721725463867, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 65.4375, "completions/mean_terminated_length": 65.4375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2695866525173187, "epoch": 0.736283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.10092692024078374, "kl": 0.025945590808987617, "learning_rate": 9.788468426444967e-07, "loss": 0.0001, "num_tokens": 6640055.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.255564570426941, "sampling/importance_sampling_ratio/mean": 1.000176191329956, "sampling/importance_sampling_ratio/min": 0.4881514608860016, "sampling/sampling_logp_difference/max": 0.7171295881271362, "sampling/sampling_logp_difference/mean": 0.01213115081191063, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 59.21875, "completions/mean_terminated_length": 59.21875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.27273327112197876, "epoch": 0.7380530973451327, "frac_reward_zero_std": 1.0, "grad_norm": 0.046918830851836506, "kl": 0.011121374554932117, "learning_rate": 9.786240204758552e-07, "loss": 0.0001, "num_tokens": 6655957.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3171130418777466, "sampling/importance_sampling_ratio/mean": 0.9994080662727356, "sampling/importance_sampling_ratio/min": 0.5929533839225769, "sampling/sampling_logp_difference/max": 0.5226395130157471, "sampling/sampling_logp_difference/mean": 0.012166490778326988, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 119.375, "completions/mean_terminated_length": 119.375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4039311110973358, "epoch": 0.7398230088495575, "frac_reward_zero_std": 0.5, "grad_norm": 4.238070768225619, "kl": 0.03022146411240101, "learning_rate": 9.784000565013933e-07, "loss": -0.0596, "num_tokens": 6675757.0, "reward": 0.15625, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4661486148834229, "sampling/importance_sampling_ratio/mean": 1.0014400482177734, "sampling/importance_sampling_ratio/min": 0.40373674035072327, "sampling/sampling_logp_difference/max": 0.9069922566413879, "sampling/sampling_logp_difference/mean": 0.016016965731978416, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 91.609375, "completions/mean_terminated_length": 91.609375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2584455907344818, "epoch": 0.7415929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.038183061521502, "kl": 0.012106215581297874, "learning_rate": 9.781749512553998e-07, "loss": 0.0001, "num_tokens": 6694612.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4827467203140259, "sampling/importance_sampling_ratio/mean": 0.9982657432556152, "sampling/importance_sampling_ratio/min": 0.6861007213592529, "sampling/sampling_logp_difference/max": 0.393896222114563, "sampling/sampling_logp_difference/mean": 0.015866706147789955, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 91.4375, "completions/mean_terminated_length": 91.4375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4179188907146454, "epoch": 0.7433628318584071, "frac_reward_zero_std": 1.0, "grad_norm": 0.02882059358884815, "kl": 0.006676926743239164, "learning_rate": 9.779487052748863e-07, "loss": 0.0001, "num_tokens": 6709984.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3979723453521729, "sampling/importance_sampling_ratio/mean": 0.9994974732398987, "sampling/importance_sampling_ratio/min": 0.6180513501167297, "sampling/sampling_logp_difference/max": 0.4811837673187256, "sampling/sampling_logp_difference/mean": 0.01743045449256897, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 92.171875, "completions/mean_terminated_length": 92.171875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.46604567766189575, "epoch": 0.7451327433628319, "frac_reward_zero_std": 1.0, "grad_norm": 0.029793712984509103, "kl": 0.008214936591684818, "learning_rate": 9.777213190995847e-07, "loss": 0.0001, "num_tokens": 6725323.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.729860782623291, "sampling/importance_sampling_ratio/mean": 1.0001142024993896, "sampling/importance_sampling_ratio/min": 0.6954634189605713, "sampling/sampling_logp_difference/max": 0.5480408668518066, "sampling/sampling_logp_difference/mean": 0.017748814076185226, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 135.203125, "completions/mean_terminated_length": 135.203125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.5472180843353271, "epoch": 0.7469026548672566, "frac_reward_zero_std": 0.25, "grad_norm": 3.990843197760746, "kl": 0.013447487726807594, "learning_rate": 9.774927932719482e-07, "loss": 0.0167, "num_tokens": 6743560.0, "reward": -0.03125, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002206563949585, "sampling/importance_sampling_ratio/min": 0.6289763450622559, "sampling/sampling_logp_difference/max": 1.2453601360321045, "sampling/sampling_logp_difference/mean": 0.018746210262179375, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 137.109375, "completions/mean_terminated_length": 137.109375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5125194191932678, "epoch": 0.7486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 2.2868626079139616, "kl": 0.010398737154901028, "learning_rate": 9.77263128337148e-07, "loss": -0.098, "num_tokens": 6761343.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3858596086502075, "sampling/importance_sampling_ratio/mean": 1.0002697706222534, "sampling/importance_sampling_ratio/min": 0.5133931636810303, "sampling/sampling_logp_difference/max": 0.6667132377624512, "sampling/sampling_logp_difference/mean": 0.017657287418842316, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 116.3125, "completions/mean_terminated_length": 116.3125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2789403796195984, "epoch": 0.7504424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 2.833428871020576, "kl": 0.010706520639359951, "learning_rate": 9.770323248430727e-07, "loss": -0.2037, "num_tokens": 6778371.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.3047248125076294, "sampling/importance_sampling_ratio/mean": 0.9989539980888367, "sampling/importance_sampling_ratio/min": 0.6344852447509766, "sampling/sampling_logp_difference/max": 0.4549412727355957, "sampling/sampling_logp_difference/mean": 0.011665822938084602, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 112.546875, "completions/mean_terminated_length": 112.546875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.528927206993103, "epoch": 0.7522123893805309, "frac_reward_zero_std": 0.75, "grad_norm": 7.167222849854241, "kl": 0.0181064922362566, "learning_rate": 9.768003833403276e-07, "loss": -0.0977, "num_tokens": 6796278.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.610059142112732, "sampling/importance_sampling_ratio/mean": 0.9996190667152405, "sampling/importance_sampling_ratio/min": 0.6272081136703491, "sampling/sampling_logp_difference/max": 0.4762709140777588, "sampling/sampling_logp_difference/mean": 0.01853884942829609, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 47.734375, "completions/mean_terminated_length": 47.734375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3304079473018646, "epoch": 0.7539823008849558, "frac_reward_zero_std": 1.0, "grad_norm": 0.05113450168112818, "kl": 0.01000053621828556, "learning_rate": 9.765673043822324e-07, "loss": 0.0001, "num_tokens": 6813573.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999748468399048, "sampling/importance_sampling_ratio/min": 0.7175012230873108, "sampling/sampling_logp_difference/max": 0.739088773727417, "sampling/sampling_logp_difference/mean": 0.014557043090462685, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 137.03125, "completions/mean_terminated_length": 137.03125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.4420037567615509, "epoch": 0.7557522123893805, "frac_reward_zero_std": 1.0, "grad_norm": 0.02800507514263248, "kl": 0.009223876520991325, "learning_rate": 9.763330885248204e-07, "loss": 0.0001, "num_tokens": 6831543.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4312554597854614, "sampling/importance_sampling_ratio/mean": 0.9998476505279541, "sampling/importance_sampling_ratio/min": 0.6370866894721985, "sampling/sampling_logp_difference/max": 0.4508495330810547, "sampling/sampling_logp_difference/mean": 0.015897046774625778, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 50.671875, "completions/mean_terminated_length": 50.671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1680561751127243, "epoch": 0.7575221238938054, "frac_reward_zero_std": 1.0, "grad_norm": 0.04842198314396375, "kl": 0.007218526676297188, "learning_rate": 9.760977363268373e-07, "loss": 0.0001, "num_tokens": 6846066.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2974542379379272, "sampling/importance_sampling_ratio/mean": 1.0001842975616455, "sampling/importance_sampling_ratio/min": 0.6183997988700867, "sampling/sampling_logp_difference/max": 0.4806201457977295, "sampling/sampling_logp_difference/mean": 0.011776283383369446, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 98.734375, "completions/mean_terminated_length": 98.734375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.21390385925769806, "epoch": 0.7592920353982301, "frac_reward_zero_std": 0.75, "grad_norm": 2.798618793037985, "kl": 0.010727117769420147, "learning_rate": 9.758612483497394e-07, "loss": 0.0569, "num_tokens": 6863473.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3427993059158325, "sampling/importance_sampling_ratio/mean": 0.999305248260498, "sampling/importance_sampling_ratio/min": 0.6970505714416504, "sampling/sampling_logp_difference/max": 0.3608973026275635, "sampling/sampling_logp_difference/mean": 0.010015236213803291, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 34.109375, "completions/mean_terminated_length": 34.109375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2121407836675644, "epoch": 0.7610619469026548, "frac_reward_zero_std": 1.0, "grad_norm": 0.04642061517941747, "kl": 0.008114354684948921, "learning_rate": 9.756236251576924e-07, "loss": 0.0001, "num_tokens": 6878456.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.271593451499939, "sampling/importance_sampling_ratio/mean": 1.000221848487854, "sampling/importance_sampling_ratio/min": 0.6872886419296265, "sampling/sampling_logp_difference/max": 0.3750009536743164, "sampling/sampling_logp_difference/mean": 0.008882062509655952, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 185.09375, "completions/mean_terminated_length": 185.09375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.44268298149108887, "epoch": 0.7628318584070797, "frac_reward_zero_std": 0.75, "grad_norm": 4.191665702774492, "kl": 0.04039476066827774, "learning_rate": 9.753848673175707e-07, "loss": -0.1335, "num_tokens": 6899774.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6913667917251587, "sampling/importance_sampling_ratio/mean": 0.9997708797454834, "sampling/importance_sampling_ratio/min": 0.46763890981674194, "sampling/sampling_logp_difference/max": 0.7600588798522949, "sampling/sampling_logp_difference/mean": 0.01565675251185894, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 43.234375, "completions/mean_terminated_length": 43.234375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1949003040790558, "epoch": 0.7646017699115044, "frac_reward_zero_std": 1.0, "grad_norm": 0.06551818397017477, "kl": 0.008975997567176819, "learning_rate": 9.751449753989546e-07, "loss": 0.0001, "num_tokens": 6915357.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4351768493652344, "sampling/importance_sampling_ratio/mean": 1.0001879930496216, "sampling/importance_sampling_ratio/min": 0.5978617668151855, "sampling/sampling_logp_difference/max": 0.5143957138061523, "sampling/sampling_logp_difference/mean": 0.011728991754353046, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 109.34375, "completions/mean_terminated_length": 109.34375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3204788267612457, "epoch": 0.7663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 3.4514060945140144, "kl": 0.016035258769989014, "learning_rate": 9.74903949974131e-07, "loss": 0.174, "num_tokens": 6933891.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3704556226730347, "sampling/importance_sampling_ratio/mean": 1.000302791595459, "sampling/importance_sampling_ratio/min": 0.370818167924881, "sampling/sampling_logp_difference/max": 0.9920434951782227, "sampling/sampling_logp_difference/mean": 0.012358264066278934, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 51.78125, "completions/mean_terminated_length": 51.78125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2988964915275574, "epoch": 0.768141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0646780144526037, "kl": 0.008315995335578918, "learning_rate": 9.746617916180905e-07, "loss": 0.0001, "num_tokens": 6948405.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2864738702774048, "sampling/importance_sampling_ratio/mean": 0.9995741844177246, "sampling/importance_sampling_ratio/min": 0.682759165763855, "sampling/sampling_logp_difference/max": 0.3816131353378296, "sampling/sampling_logp_difference/mean": 0.010760385543107986, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 149.9375, "completions/mean_terminated_length": 149.9375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.46958667039871216, "epoch": 0.7699115044247787, "frac_reward_zero_std": 0.5, "grad_norm": 3.2892048962797666, "kl": 0.014349989593029022, "learning_rate": 9.744185009085256e-07, "loss": 0.058, "num_tokens": 6967809.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.470845341682434, "sampling/importance_sampling_ratio/mean": 1.0003101825714111, "sampling/importance_sampling_ratio/min": 0.6346219778060913, "sampling/sampling_logp_difference/max": 0.4547257423400879, "sampling/sampling_logp_difference/mean": 0.01665390096604824, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 66.078125, "completions/mean_terminated_length": 66.078125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1759306788444519, "epoch": 0.7716814159292036, "frac_reward_zero_std": 0.75, "grad_norm": 4.144032453870604, "kl": 0.005398609209805727, "learning_rate": 9.741740784258311e-07, "loss": 0.189, "num_tokens": 6984070.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.762777328491211, "sampling/importance_sampling_ratio/mean": 0.998619794845581, "sampling/importance_sampling_ratio/min": 0.5538696050643921, "sampling/sampling_logp_difference/max": 0.5908260345458984, "sampling/sampling_logp_difference/mean": 0.01123537216335535, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 123.640625, "completions/mean_terminated_length": 123.640625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5412547588348389, "epoch": 0.7734513274336283, "frac_reward_zero_std": 0.75, "grad_norm": 1.8359115425008254, "kl": 0.013453925028443336, "learning_rate": 9.739285247531017e-07, "loss": 0.0471, "num_tokens": 7002815.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000159502029419, "sampling/importance_sampling_ratio/min": 0.44841375946998596, "sampling/sampling_logp_difference/max": 1.0421335697174072, "sampling/sampling_logp_difference/mean": 0.019044142216444016, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 108.671875, "completions/mean_terminated_length": 108.671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4487525224685669, "epoch": 0.7752212389380531, "frac_reward_zero_std": 1.0, "grad_norm": 0.03290813086570887, "kl": 0.011676624417304993, "learning_rate": 9.736818404761302e-07, "loss": 0.0001, "num_tokens": 7020698.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5356125831604004, "sampling/importance_sampling_ratio/mean": 0.9999417662620544, "sampling/importance_sampling_ratio/min": 0.6842086911201477, "sampling/sampling_logp_difference/max": 0.42892932891845703, "sampling/sampling_logp_difference/mean": 0.018000802025198936, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 142.734375, "completions/mean_terminated_length": 142.734375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.558229386806488, "epoch": 0.7769911504424779, "frac_reward_zero_std": 0.5, "grad_norm": 2.561573745299718, "kl": 0.01746143400669098, "learning_rate": 9.734340261834066e-07, "loss": -0.0123, "num_tokens": 7040329.0, "reward": -0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.3913923501968384, "sampling/importance_sampling_ratio/mean": 1.0002951622009277, "sampling/importance_sampling_ratio/min": 0.2626253366470337, "sampling/sampling_logp_difference/max": 1.337026834487915, "sampling/sampling_logp_difference/mean": 0.018860086798667908, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 84.484375, "completions/mean_terminated_length": 84.484375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.43578213453292847, "epoch": 0.7787610619469026, "frac_reward_zero_std": 0.5, "grad_norm": 8.487202414597359, "kl": 0.011191412806510925, "learning_rate": 9.73185082466117e-07, "loss": -0.1363, "num_tokens": 7054808.0, "reward": -0.1875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.597430944442749, "sampling/importance_sampling_ratio/mean": 1.0004340410232544, "sampling/importance_sampling_ratio/min": 0.5151932239532471, "sampling/sampling_logp_difference/max": 0.6632132530212402, "sampling/sampling_logp_difference/mean": 0.015007372945547104, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 68.921875, "completions/mean_terminated_length": 68.921875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.22991178929805756, "epoch": 0.7805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 4.0598608952177795, "kl": 0.021865490823984146, "learning_rate": 9.729350099181419e-07, "loss": -0.0892, "num_tokens": 7070659.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008248090744019, "sampling/importance_sampling_ratio/min": 0.4950145483016968, "sampling/sampling_logp_difference/max": 0.9080419540405273, "sampling/sampling_logp_difference/mean": 0.013804643414914608, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 105.8125, "completions/mean_terminated_length": 105.8125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.49717026948928833, "epoch": 0.7823008849557522, "frac_reward_zero_std": 1.0, "grad_norm": 0.04886234557938152, "kl": 0.01464010775089264, "learning_rate": 9.726838091360545e-07, "loss": 0.0001, "num_tokens": 7087271.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9791940450668335, "sampling/importance_sampling_ratio/mean": 0.9998542666435242, "sampling/importance_sampling_ratio/min": 0.69510418176651, "sampling/sampling_logp_difference/max": 0.6826896667480469, "sampling/sampling_logp_difference/mean": 0.017835965380072594, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 173.484375, "completions/mean_terminated_length": 173.484375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5604503154754639, "epoch": 0.784070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 4.387832890411488, "kl": 0.013209588825702667, "learning_rate": 9.724314807191196e-07, "loss": 0.1046, "num_tokens": 7108870.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5783559083938599, "sampling/importance_sampling_ratio/mean": 0.9999769926071167, "sampling/importance_sampling_ratio/min": 0.5818358659744263, "sampling/sampling_logp_difference/max": 0.5415668487548828, "sampling/sampling_logp_difference/mean": 0.018716178834438324, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 101.421875, "completions/mean_terminated_length": 101.421875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.42693156003952026, "epoch": 0.7858407079646018, "frac_reward_zero_std": 0.75, "grad_norm": 3.142223704463168, "kl": 0.01894482970237732, "learning_rate": 9.721780252692917e-07, "loss": -0.0856, "num_tokens": 7125889.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4481490850448608, "sampling/importance_sampling_ratio/mean": 1.0003788471221924, "sampling/importance_sampling_ratio/min": 0.7009619474411011, "sampling/sampling_logp_difference/max": 0.370286226272583, "sampling/sampling_logp_difference/mean": 0.01637919247150421, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 69.28125, "completions/mean_terminated_length": 69.28125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.376160204410553, "epoch": 0.7876106194690266, "frac_reward_zero_std": 0.75, "grad_norm": 3.813332613168112, "kl": 0.01751999743282795, "learning_rate": 9.719234433912146e-07, "loss": 0.1039, "num_tokens": 7140835.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3959933519363403, "sampling/importance_sampling_ratio/mean": 1.0019288063049316, "sampling/importance_sampling_ratio/min": 0.7030871510505676, "sampling/sampling_logp_difference/max": 0.35227441787719727, "sampling/sampling_logp_difference/mean": 0.01676108129322529, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 104.9375, "completions/mean_terminated_length": 104.9375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3870048522949219, "epoch": 0.7893805309734513, "frac_reward_zero_std": 1.0, "grad_norm": 0.0487475996446023, "kl": 0.016693545505404472, "learning_rate": 9.716677356922192e-07, "loss": 0.0001, "num_tokens": 7156863.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3446028232574463, "sampling/importance_sampling_ratio/mean": 0.999738335609436, "sampling/importance_sampling_ratio/min": 0.6871829628944397, "sampling/sampling_logp_difference/max": 0.3751547336578369, "sampling/sampling_logp_difference/mean": 0.01365131326019764, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 103.28125, "completions/mean_terminated_length": 103.28125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.19822871685028076, "epoch": 0.7911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.04623919445889071, "kl": 0.016442574560642242, "learning_rate": 9.714109027823216e-07, "loss": 0.0001, "num_tokens": 7174289.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3470251560211182, "sampling/importance_sampling_ratio/mean": 0.9989169836044312, "sampling/importance_sampling_ratio/min": 0.6370871663093567, "sampling/sampling_logp_difference/max": 0.4508488178253174, "sampling/sampling_logp_difference/mean": 0.007454845122992992, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 80.140625, "completions/mean_terminated_length": 80.140625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2729114592075348, "epoch": 0.7929203539823009, "frac_reward_zero_std": 0.75, "grad_norm": 3.5695436591573864, "kl": 0.021001402288675308, "learning_rate": 9.711529452742229e-07, "loss": 0.0173, "num_tokens": 7190314.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5101569890975952, "sampling/importance_sampling_ratio/mean": 0.9994341135025024, "sampling/importance_sampling_ratio/min": 0.6054678559303284, "sampling/sampling_logp_difference/max": 0.5017538070678711, "sampling/sampling_logp_difference/mean": 0.010070646181702614, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 91.5625, "completions/mean_terminated_length": 91.5625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.36996370553970337, "epoch": 0.7946902654867256, "frac_reward_zero_std": 1.0, "grad_norm": 0.053565450688631694, "kl": 0.03582077473402023, "learning_rate": 9.708938637833064e-07, "loss": 0.0002, "num_tokens": 7208782.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4157929420471191, "sampling/importance_sampling_ratio/mean": 0.9996511936187744, "sampling/importance_sampling_ratio/min": 0.5518012642860413, "sampling/sampling_logp_difference/max": 0.5945672988891602, "sampling/sampling_logp_difference/mean": 0.015002947300672531, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 101.875, "completions/mean_terminated_length": 101.875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5070844888687134, "epoch": 0.7964601769911505, "frac_reward_zero_std": 0.75, "grad_norm": 2.6667280847715173, "kl": 0.0295796487480402, "learning_rate": 9.706336589276374e-07, "loss": -0.0625, "num_tokens": 7226870.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3591194152832031, "sampling/importance_sampling_ratio/mean": 1.0007846355438232, "sampling/importance_sampling_ratio/min": 0.6882472634315491, "sampling/sampling_logp_difference/max": 0.37360715866088867, "sampling/sampling_logp_difference/mean": 0.016192808747291565, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 77.359375, "completions/mean_terminated_length": 77.359375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.27710431814193726, "epoch": 0.7982300884955752, "frac_reward_zero_std": 0.75, "grad_norm": 4.356774751619394, "kl": 0.03306623920798302, "learning_rate": 9.703723313279605e-07, "loss": 0.1667, "num_tokens": 7243117.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5509188175201416, "sampling/importance_sampling_ratio/mean": 1.0007847547531128, "sampling/importance_sampling_ratio/min": 0.6969705820083618, "sampling/sampling_logp_difference/max": 0.43884754180908203, "sampling/sampling_logp_difference/mean": 0.012798595242202282, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 60.84375, "completions/mean_terminated_length": 60.84375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3076930642127991, "epoch": 0.8, "frac_reward_zero_std": 1.0, "grad_norm": 0.09004043696286056, "kl": 0.025182174518704414, "learning_rate": 9.701098816076995e-07, "loss": 0.0002, "num_tokens": 7259971.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2837449312210083, "sampling/importance_sampling_ratio/mean": 1.0000991821289062, "sampling/importance_sampling_ratio/min": 0.4642953872680664, "sampling/sampling_logp_difference/max": 0.7672343254089355, "sampling/sampling_logp_difference/mean": 0.01526334322988987, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 70.25, "completions/mean_terminated_length": 70.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3211529552936554, "epoch": 0.8017699115044248, "frac_reward_zero_std": 1.0, "grad_norm": 0.08582681747073204, "kl": 0.03589726239442825, "learning_rate": 9.698463103929541e-07, "loss": 0.0002, "num_tokens": 7275139.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8302972316741943, "sampling/importance_sampling_ratio/mean": 1.0005204677581787, "sampling/importance_sampling_ratio/min": 0.6988446116447449, "sampling/sampling_logp_difference/max": 0.6044783592224121, "sampling/sampling_logp_difference/mean": 0.011284945532679558, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 38.78125, "completions/mean_terminated_length": 38.78125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.19094860553741455, "epoch": 0.8035398230088495, "frac_reward_zero_std": 1.0, "grad_norm": 0.11639956999007596, "kl": 0.0388774499297142, "learning_rate": 9.695816183125003e-07, "loss": 0.0003, "num_tokens": 7288709.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.43803071975708, "sampling/importance_sampling_ratio/mean": 0.9972746968269348, "sampling/importance_sampling_ratio/min": 0.6347212195396423, "sampling/sampling_logp_difference/max": 0.45456933975219727, "sampling/sampling_logp_difference/mean": 0.011178547516465187, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 104.15625, "completions/mean_terminated_length": 104.15625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.427107036113739, "epoch": 0.8053097345132744, "frac_reward_zero_std": 1.0, "grad_norm": 0.06527203937308146, "kl": 0.026374639943242073, "learning_rate": 9.693158059977877e-07, "loss": 0.0002, "num_tokens": 7306815.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5272523164749146, "sampling/importance_sampling_ratio/mean": 0.9991812705993652, "sampling/importance_sampling_ratio/min": 0.5485890507698059, "sampling/sampling_logp_difference/max": 0.6004055738449097, "sampling/sampling_logp_difference/mean": 0.016266994178295135, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 98.703125, "completions/mean_terminated_length": 98.703125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.31570273637771606, "epoch": 0.8070796460176991, "frac_reward_zero_std": 1.0, "grad_norm": 0.07527148416420061, "kl": 0.029133901000022888, "learning_rate": 9.690488740829383e-07, "loss": 0.0002, "num_tokens": 7324908.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3732151985168457, "sampling/importance_sampling_ratio/mean": 1.0015473365783691, "sampling/importance_sampling_ratio/min": 0.6976590752601624, "sampling/sampling_logp_difference/max": 0.36002469062805176, "sampling/sampling_logp_difference/mean": 0.014895064756274223, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 105.59375, "completions/mean_terminated_length": 105.59375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.38262939453125, "epoch": 0.8088495575221238, "frac_reward_zero_std": 1.0, "grad_norm": 0.06269113840649056, "kl": 0.021498549729585648, "learning_rate": 9.68780823204745e-07, "loss": 0.0002, "num_tokens": 7341666.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5470013618469238, "sampling/importance_sampling_ratio/mean": 0.9993369579315186, "sampling/importance_sampling_ratio/min": 0.685339093208313, "sampling/sampling_logp_difference/max": 0.43631839752197266, "sampling/sampling_logp_difference/mean": 0.014647518284618855, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 101.40625, "completions/mean_terminated_length": 101.40625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.29837697744369507, "epoch": 0.8106194690265487, "frac_reward_zero_std": 1.0, "grad_norm": 0.06559806490152259, "kl": 0.018434328958392143, "learning_rate": 9.685116540026701e-07, "loss": 0.0002, "num_tokens": 7358172.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.436000108718872, "sampling/importance_sampling_ratio/mean": 1.0009186267852783, "sampling/importance_sampling_ratio/min": 0.6362788081169128, "sampling/sampling_logp_difference/max": 0.4521183967590332, "sampling/sampling_logp_difference/mean": 0.011897927150130272, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 137.171875, "completions/mean_terminated_length": 137.171875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5502569675445557, "epoch": 0.8123893805309734, "frac_reward_zero_std": 0.75, "grad_norm": 3.735182949968868, "kl": 0.01877054199576378, "learning_rate": 9.682413671188444e-07, "loss": 0.3218, "num_tokens": 7379463.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.593226432800293, "sampling/importance_sampling_ratio/mean": 1.0002869367599487, "sampling/importance_sampling_ratio/min": 0.36082524061203003, "sampling/sampling_logp_difference/max": 1.0193616151809692, "sampling/sampling_logp_difference/mean": 0.017945023253560066, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 36.671875, "completions/mean_terminated_length": 36.671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.14537887275218964, "epoch": 0.8141592920353983, "frac_reward_zero_std": 0.75, "grad_norm": 5.268007704313631, "kl": 0.01473191287368536, "learning_rate": 9.679699631980637e-07, "loss": -0.0339, "num_tokens": 7391026.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.2735956907272339, "sampling/importance_sampling_ratio/mean": 0.9991209506988525, "sampling/importance_sampling_ratio/min": 0.68517005443573, "sampling/sampling_logp_difference/max": 0.37808823585510254, "sampling/sampling_logp_difference/mean": 0.00664309598505497, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 59.921875, "completions/mean_terminated_length": 59.921875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.257783442735672, "epoch": 0.815929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.06853751772765107, "kl": 0.012552602216601372, "learning_rate": 9.6769744288779e-07, "loss": 0.0002, "num_tokens": 7404413.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5852875709533691, "sampling/importance_sampling_ratio/mean": 0.9996552467346191, "sampling/importance_sampling_ratio/min": 0.6521279215812683, "sampling/sampling_logp_difference/max": 0.4607658386230469, "sampling/sampling_logp_difference/mean": 0.01270623505115509, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 16.140625, "completions/mean_terminated_length": 16.140625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.18410514295101166, "epoch": 0.8176991150442477, "frac_reward_zero_std": 1.0, "grad_norm": 0.17198806886123844, "kl": 0.02371949329972267, "learning_rate": 9.674238068381478e-07, "loss": 0.0003, "num_tokens": 7415350.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.637800693511963, "sampling/importance_sampling_ratio/mean": 0.9997628927230835, "sampling/importance_sampling_ratio/min": 0.42162972688674927, "sampling/sampling_logp_difference/max": 0.8636277914047241, "sampling/sampling_logp_difference/mean": 0.009763983078300953, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 110.875, "completions/mean_terminated_length": 110.875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.4821784496307373, "epoch": 0.8194690265486726, "frac_reward_zero_std": 1.0, "grad_norm": 0.07166451040935833, "kl": 0.017150908708572388, "learning_rate": 9.671490557019233e-07, "loss": 0.0002, "num_tokens": 7434286.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.764185905456543, "sampling/importance_sampling_ratio/mean": 1.0012612342834473, "sampling/importance_sampling_ratio/min": 0.6241204142570496, "sampling/sampling_logp_difference/max": 0.5676894187927246, "sampling/sampling_logp_difference/mean": 0.018654324114322662, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 125.703125, "completions/mean_terminated_length": 125.703125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.5399432182312012, "epoch": 0.8212389380530973, "frac_reward_zero_std": 1.0, "grad_norm": 0.07115499287856507, "kl": 0.020138971507549286, "learning_rate": 9.668731901345632e-07, "loss": 0.0002, "num_tokens": 7451675.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3973848819732666, "sampling/importance_sampling_ratio/mean": 1.000847578048706, "sampling/importance_sampling_ratio/min": 0.6998101472854614, "sampling/sampling_logp_difference/max": 0.3569462299346924, "sampling/sampling_logp_difference/mean": 0.01800524815917015, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 61.75, "completions/mean_terminated_length": 61.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.46671062707901, "epoch": 0.8230088495575221, "frac_reward_zero_std": 1.0, "grad_norm": 0.08836532913771333, "kl": 0.02342853508889675, "learning_rate": 9.665962107941724e-07, "loss": 0.0003, "num_tokens": 7468491.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6156078577041626, "sampling/importance_sampling_ratio/mean": 1.0004723072052002, "sampling/importance_sampling_ratio/min": 0.6833137273788452, "sampling/sampling_logp_difference/max": 0.47971129417419434, "sampling/sampling_logp_difference/mean": 0.017525987699627876, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 86.828125, "completions/mean_terminated_length": 86.828125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4605414867401123, "epoch": 0.8247787610619469, "frac_reward_zero_std": 1.0, "grad_norm": 0.06902940022709253, "kl": 0.015131618827581406, "learning_rate": 9.663181183415131e-07, "loss": 0.0002, "num_tokens": 7485584.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6584858894348145, "sampling/importance_sampling_ratio/mean": 0.9997375011444092, "sampling/importance_sampling_ratio/min": 0.5581821799278259, "sampling/sampling_logp_difference/max": 0.5830698013305664, "sampling/sampling_logp_difference/mean": 0.017058638855814934, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 77.765625, "completions/mean_terminated_length": 77.765625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.45054739713668823, "epoch": 0.8265486725663717, "frac_reward_zero_std": 0.75, "grad_norm": 11.732709836704123, "kl": 0.028658758848905563, "learning_rate": 9.660389134400033e-07, "loss": 0.0982, "num_tokens": 7503057.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000295639038086, "sampling/importance_sampling_ratio/min": 0.6674122214317322, "sampling/sampling_logp_difference/max": 1.2609937191009521, "sampling/sampling_logp_difference/mean": 0.021381115540862083, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 109.78125, "completions/mean_terminated_length": 109.78125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3166119456291199, "epoch": 0.8283185840707965, "frac_reward_zero_std": 1.0, "grad_norm": 0.17233483628358817, "kl": 0.018055278807878494, "learning_rate": 9.657585967557138e-07, "loss": 0.0002, "num_tokens": 7520851.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999561071395874, "sampling/importance_sampling_ratio/min": 0.29186415672302246, "sampling/sampling_logp_difference/max": 1.2314667701721191, "sampling/sampling_logp_difference/mean": 0.012147579342126846, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 108.1875, "completions/mean_terminated_length": 108.1875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.22439920902252197, "epoch": 0.8300884955752212, "frac_reward_zero_std": 0.5, "grad_norm": 5.521774332190136, "kl": 0.0083629060536623, "learning_rate": 9.654771689573684e-07, "loss": -0.1336, "num_tokens": 7537055.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.7783924341201782, "sampling/importance_sampling_ratio/mean": 1.00020170211792, "sampling/importance_sampling_ratio/min": 0.6649647355079651, "sampling/sampling_logp_difference/max": 0.5757098197937012, "sampling/sampling_logp_difference/mean": 0.010277085937559605, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1761656403541565, "epoch": 0.831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 5.292711007458864, "kl": 0.0075834584422409534, "learning_rate": 9.651946307163416e-07, "loss": 0.0957, "num_tokens": 7550127.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3057875633239746, "sampling/importance_sampling_ratio/mean": 1.001237154006958, "sampling/importance_sampling_ratio/min": 0.5409621000289917, "sampling/sampling_logp_difference/max": 0.6144061088562012, "sampling/sampling_logp_difference/mean": 0.013811590149998665, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 96.671875, "completions/mean_terminated_length": 96.671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5126727819442749, "epoch": 0.8336283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.1124198074532935, "kl": 0.027021564543247223, "learning_rate": 9.64910982706657e-07, "loss": 0.0002, "num_tokens": 7573178.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5440689325332642, "sampling/importance_sampling_ratio/mean": 0.9985802173614502, "sampling/importance_sampling_ratio/min": 0.6255274415016174, "sampling/sampling_logp_difference/max": 0.4691600799560547, "sampling/sampling_logp_difference/mean": 0.019156504422426224, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 79.421875, "completions/mean_terminated_length": 79.421875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3693407475948334, "epoch": 0.8353982300884956, "frac_reward_zero_std": 1.0, "grad_norm": 0.06026925116566931, "kl": 0.029721589758992195, "learning_rate": 9.646262256049852e-07, "loss": 0.0002, "num_tokens": 7586709.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3058912754058838, "sampling/importance_sampling_ratio/mean": 0.9994974136352539, "sampling/importance_sampling_ratio/min": 0.5555564761161804, "sampling/sampling_logp_difference/max": 0.587785005569458, "sampling/sampling_logp_difference/mean": 0.01478197705000639, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 59.390625, "completions/mean_terminated_length": 59.390625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.21782368421554565, "epoch": 0.8371681415929203, "frac_reward_zero_std": 1.0, "grad_norm": 0.06599991621803777, "kl": 0.012956816703081131, "learning_rate": 9.643403600906432e-07, "loss": 0.0002, "num_tokens": 7600206.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5796705484390259, "sampling/importance_sampling_ratio/mean": 0.9997380375862122, "sampling/importance_sampling_ratio/min": 0.6913318037986755, "sampling/sampling_logp_difference/max": 0.4572162628173828, "sampling/sampling_logp_difference/mean": 0.013016052544116974, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 72.265625, "completions/mean_terminated_length": 72.265625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.27213335037231445, "epoch": 0.8389380530973451, "frac_reward_zero_std": 1.0, "grad_norm": 0.06018047672571673, "kl": 0.010599360801279545, "learning_rate": 9.640533868455918e-07, "loss": 0.0002, "num_tokens": 7618191.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4041589498519897, "sampling/importance_sampling_ratio/mean": 0.9997031092643738, "sampling/importance_sampling_ratio/min": 0.6552351117134094, "sampling/sampling_logp_difference/max": 0.4227612018585205, "sampling/sampling_logp_difference/mean": 0.010823149234056473, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 68.75, "completions/mean_terminated_length": 68.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.37680357694625854, "epoch": 0.8407079646017699, "frac_reward_zero_std": 1.0, "grad_norm": 0.07623883161439017, "kl": 0.020373038947582245, "learning_rate": 9.637653065544349e-07, "loss": 0.0002, "num_tokens": 7634287.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6568858623504639, "sampling/importance_sampling_ratio/mean": 0.9996323585510254, "sampling/importance_sampling_ratio/min": 0.7048661112785339, "sampling/sampling_logp_difference/max": 0.5049397945404053, "sampling/sampling_logp_difference/mean": 0.015485817566514015, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 93.6875, "completions/mean_terminated_length": 93.6875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4273343086242676, "epoch": 0.8424778761061947, "frac_reward_zero_std": 1.0, "grad_norm": 0.08576728790420027, "kl": 0.02765519730746746, "learning_rate": 9.634761199044165e-07, "loss": 0.0002, "num_tokens": 7650235.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5675156116485596, "sampling/importance_sampling_ratio/mean": 1.0012264251708984, "sampling/importance_sampling_ratio/min": 0.28429707884788513, "sampling/sampling_logp_difference/max": 1.2577356100082397, "sampling/sampling_logp_difference/mean": 0.01669345051050186, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 57.390625, "completions/mean_terminated_length": 57.390625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1557103544473648, "epoch": 0.8442477876106195, "frac_reward_zero_std": 1.0, "grad_norm": 0.08774264129861918, "kl": 0.01975005678832531, "learning_rate": 9.63185827585421e-07, "loss": 0.0002, "num_tokens": 7666228.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.931877851486206, "sampling/importance_sampling_ratio/mean": 0.9998209476470947, "sampling/importance_sampling_ratio/min": 0.6257392168045044, "sampling/sampling_logp_difference/max": 0.6584925651550293, "sampling/sampling_logp_difference/mean": 0.010466136038303375, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 75.609375, "completions/mean_terminated_length": 75.609375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3157695531845093, "epoch": 0.8460176991150442, "frac_reward_zero_std": 1.0, "grad_norm": 0.06225484644973683, "kl": 0.022523541003465652, "learning_rate": 9.628944302899695e-07, "loss": 0.0002, "num_tokens": 7682395.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5499114990234375, "sampling/importance_sampling_ratio/mean": 1.0018589496612549, "sampling/importance_sampling_ratio/min": 0.7029415965080261, "sampling/sampling_logp_difference/max": 0.4381978511810303, "sampling/sampling_logp_difference/mean": 0.015560553409159184, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.06959155946969986, "epoch": 0.8477876106194691, "frac_reward_zero_std": 1.0, "grad_norm": 0.27239551020489455, "kl": 0.013529862277209759, "learning_rate": 9.6260192871322e-07, "loss": 0.0001, "num_tokens": 7695019.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7910892963409424, "sampling/importance_sampling_ratio/mean": 1.000964879989624, "sampling/importance_sampling_ratio/min": 0.7308072447776794, "sampling/sampling_logp_difference/max": 0.5828239917755127, "sampling/sampling_logp_difference/mean": 0.0050699952989816666, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 60.28125, "completions/mean_terminated_length": 60.28125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2767450213432312, "epoch": 0.8495575221238938, "frac_reward_zero_std": 1.0, "grad_norm": 0.05418953873731226, "kl": 0.009159858338534832, "learning_rate": 9.623083235529646e-07, "loss": 0.0001, "num_tokens": 7710733.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4479974508285522, "sampling/importance_sampling_ratio/mean": 0.9996775388717651, "sampling/importance_sampling_ratio/min": 0.6703086495399475, "sampling/sampling_logp_difference/max": 0.4000170826911926, "sampling/sampling_logp_difference/mean": 0.010580535978078842, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 67.609375, "completions/mean_terminated_length": 67.609375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.26749616861343384, "epoch": 0.8513274336283185, "frac_reward_zero_std": 1.0, "grad_norm": 0.041986870827581534, "kl": 0.007706194184720516, "learning_rate": 9.620136155096275e-07, "loss": 0.0001, "num_tokens": 7724564.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5395063161849976, "sampling/importance_sampling_ratio/mean": 0.9995875954627991, "sampling/importance_sampling_ratio/min": 0.5570306777954102, "sampling/sampling_logp_difference/max": 0.5851349830627441, "sampling/sampling_logp_difference/mean": 0.0122038209810853, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 62.09375, "completions/mean_terminated_length": 62.09375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.2679656147956848, "epoch": 0.8530973451327434, "frac_reward_zero_std": 1.0, "grad_norm": 0.06334004054578615, "kl": 0.013048840686678886, "learning_rate": 9.617178052862649e-07, "loss": 0.0001, "num_tokens": 7738714.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4340221881866455, "sampling/importance_sampling_ratio/mean": 1.0003671646118164, "sampling/importance_sampling_ratio/min": 0.6112040877342224, "sampling/sampling_logp_difference/max": 0.4923243522644043, "sampling/sampling_logp_difference/mean": 0.012869300320744514, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 79.59375, "completions/mean_terminated_length": 79.59375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3289504051208496, "epoch": 0.8548672566371681, "frac_reward_zero_std": 1.0, "grad_norm": 0.04462738289464286, "kl": 0.014411314390599728, "learning_rate": 9.614208935885614e-07, "loss": 0.0001, "num_tokens": 7754512.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3118646144866943, "sampling/importance_sampling_ratio/mean": 1.0010559558868408, "sampling/importance_sampling_ratio/min": 0.6392340064048767, "sampling/sampling_logp_difference/max": 0.44748473167419434, "sampling/sampling_logp_difference/mean": 0.01603720150887966, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 78.796875, "completions/mean_terminated_length": 78.796875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.28768643736839294, "epoch": 0.856637168141593, "frac_reward_zero_std": 1.0, "grad_norm": 0.040511614927051826, "kl": 0.007257409393787384, "learning_rate": 9.6112288112483e-07, "loss": 0.0001, "num_tokens": 7771427.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4626340866088867, "sampling/importance_sampling_ratio/mean": 0.9994089603424072, "sampling/importance_sampling_ratio/min": 0.6803320646286011, "sampling/sampling_logp_difference/max": 0.3851742744445801, "sampling/sampling_logp_difference/mean": 0.010980907827615738, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 110.84375, "completions/mean_terminated_length": 110.84375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.4556252956390381, "epoch": 0.8584070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.054893842974183746, "kl": 0.013409667648375034, "learning_rate": 9.608237686060097e-07, "loss": 0.0001, "num_tokens": 7788601.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3639452457427979, "sampling/importance_sampling_ratio/mean": 1.0000271797180176, "sampling/importance_sampling_ratio/min": 0.561255931854248, "sampling/sampling_logp_difference/max": 0.5775783061981201, "sampling/sampling_logp_difference/mean": 0.0168587788939476, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 62.03125, "completions/mean_terminated_length": 62.03125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3430402874946594, "epoch": 0.8601769911504424, "frac_reward_zero_std": 1.0, "grad_norm": 0.05995714263974936, "kl": 0.013722425326704979, "learning_rate": 9.605235567456635e-07, "loss": 0.0001, "num_tokens": 7803531.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6505627632141113, "sampling/importance_sampling_ratio/mean": 0.999412477016449, "sampling/importance_sampling_ratio/min": 0.6296095848083496, "sampling/sampling_logp_difference/max": 0.5011162757873535, "sampling/sampling_logp_difference/mean": 0.014488639310002327, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 100.546875, "completions/mean_terminated_length": 100.546875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3184402883052826, "epoch": 0.8619469026548673, "frac_reward_zero_std": 0.75, "grad_norm": 6.789551644989066, "kl": 0.014047389850020409, "learning_rate": 9.602222462599766e-07, "loss": 0.1634, "num_tokens": 7819838.0, "reward": -0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4881186485290527, "sampling/importance_sampling_ratio/mean": 1.001170039176941, "sampling/importance_sampling_ratio/min": 0.6323857307434082, "sampling/sampling_logp_difference/max": 0.4582557678222656, "sampling/sampling_logp_difference/mean": 0.013673178851604462, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 54.6875, "completions/mean_terminated_length": 54.6875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.36572539806365967, "epoch": 0.863716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 37.659219580786726, "kl": 0.07570485770702362, "learning_rate": 9.599198378677558e-07, "loss": 0.7396, "num_tokens": 7837898.0, "reward": 0.625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0011978149414062, "sampling/importance_sampling_ratio/min": 0.7209191918373108, "sampling/sampling_logp_difference/max": 0.7518572807312012, "sampling/sampling_logp_difference/mean": 0.013867147266864777, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.046875, "completions/mean_terminated_length": 15.046875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.06888234615325928, "epoch": 0.8654867256637168, "frac_reward_zero_std": 1.0, "grad_norm": 0.2142080578369344, "kl": 0.024057459086179733, "learning_rate": 9.596163322904269e-07, "loss": 0.0002, "num_tokens": 7851069.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3726508617401123, "sampling/importance_sampling_ratio/mean": 1.0013298988342285, "sampling/importance_sampling_ratio/min": 0.8065332174301147, "sampling/sampling_logp_difference/max": 0.3167438507080078, "sampling/sampling_logp_difference/mean": 0.006785606499761343, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 88.53125, "completions/mean_terminated_length": 88.53125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.35719120502471924, "epoch": 0.8672566371681416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0314355509412585, "kl": 0.011010359972715378, "learning_rate": 9.593117302520328e-07, "loss": 0.0001, "num_tokens": 7866143.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.511204719543457, "sampling/importance_sampling_ratio/mean": 0.9987002015113831, "sampling/importance_sampling_ratio/min": 0.6565293073654175, "sampling/sampling_logp_difference/max": 0.4207879304885864, "sampling/sampling_logp_difference/mean": 0.014091964811086655, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 92.375, "completions/mean_terminated_length": 92.375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4985180199146271, "epoch": 0.8690265486725663, "frac_reward_zero_std": 0.5, "grad_norm": 7.644053057921895, "kl": 0.018009070307016373, "learning_rate": 9.590060324792325e-07, "loss": 0.1329, "num_tokens": 7881895.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8456369638442993, "sampling/importance_sampling_ratio/mean": 1.0008618831634521, "sampling/importance_sampling_ratio/min": 0.6019213795661926, "sampling/sampling_logp_difference/max": 0.6128244400024414, "sampling/sampling_logp_difference/mean": 0.01921398565173149, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2719489336013794, "epoch": 0.8707964601769912, "frac_reward_zero_std": 1.0, "grad_norm": 0.08532220988810806, "kl": 0.010976046323776245, "learning_rate": 9.58699239701299e-07, "loss": 0.0001, "num_tokens": 7898663.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.421367883682251, "sampling/importance_sampling_ratio/mean": 1.0002838373184204, "sampling/importance_sampling_ratio/min": 0.6368826031684875, "sampling/sampling_logp_difference/max": 0.4511699676513672, "sampling/sampling_logp_difference/mean": 0.012832563370466232, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 58.21875, "completions/mean_terminated_length": 58.21875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1278567761182785, "epoch": 0.8725663716814159, "frac_reward_zero_std": 0.75, "grad_norm": 8.595419109387096, "kl": 0.01883358508348465, "learning_rate": 9.58391352650117e-07, "loss": -0.5106, "num_tokens": 7911909.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.2526459693908691, "sampling/importance_sampling_ratio/mean": 1.0006308555603027, "sampling/importance_sampling_ratio/min": 0.4962599277496338, "sampling/sampling_logp_difference/max": 0.700655460357666, "sampling/sampling_logp_difference/mean": 0.010099800303578377, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 40.125, "completions/mean_terminated_length": 40.125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.24204905331134796, "epoch": 0.8743362831858407, "frac_reward_zero_std": 1.0, "grad_norm": 0.06587748198923991, "kl": 0.009902874939143658, "learning_rate": 9.580823720601823e-07, "loss": 0.0001, "num_tokens": 7928429.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.445271372795105, "sampling/importance_sampling_ratio/mean": 1.0001397132873535, "sampling/importance_sampling_ratio/min": 0.7533186078071594, "sampling/sampling_logp_difference/max": 0.36829710006713867, "sampling/sampling_logp_difference/mean": 0.014522343873977661, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2897334098815918, "epoch": 0.8761061946902655, "frac_reward_zero_std": 0.75, "grad_norm": 7.1646682486535305, "kl": 0.1686052680015564, "learning_rate": 9.57772298668599e-07, "loss": -0.1076, "num_tokens": 7940925.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.336471676826477, "sampling/importance_sampling_ratio/mean": 1.0002723932266235, "sampling/importance_sampling_ratio/min": 0.7755419015884399, "sampling/sampling_logp_difference/max": 0.29003310203552246, "sampling/sampling_logp_difference/mean": 0.012305348180234432, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 71.4375, "completions/mean_terminated_length": 71.4375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.4769209027290344, "epoch": 0.8778761061946903, "frac_reward_zero_std": 0.75, "grad_norm": 10.18835702156589, "kl": 0.060292646288871765, "learning_rate": 9.57461133215079e-07, "loss": -0.1092, "num_tokens": 7956665.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.618024468421936, "sampling/importance_sampling_ratio/mean": 0.99906325340271, "sampling/importance_sampling_ratio/min": 0.6003147959709167, "sampling/sampling_logp_difference/max": 0.5103011131286621, "sampling/sampling_logp_difference/mean": 0.017318300902843475, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 58.671875, "completions/mean_terminated_length": 58.671875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2595871090888977, "epoch": 0.879646017699115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0416886480079275, "kl": 0.0072275553829967976, "learning_rate": 9.57148876441938e-07, "loss": 0.0001, "num_tokens": 7972820.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.45339035987854, "sampling/importance_sampling_ratio/mean": 1.000535011291504, "sampling/importance_sampling_ratio/min": 0.6839017868041992, "sampling/sampling_logp_difference/max": 0.3799409866333008, "sampling/sampling_logp_difference/mean": 0.010540075600147247, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 39.84375, "completions/mean_terminated_length": 39.84375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2132524847984314, "epoch": 0.8814159292035398, "frac_reward_zero_std": 1.0, "grad_norm": 0.05927996815786505, "kl": 0.005652535706758499, "learning_rate": 9.568355290940966e-07, "loss": 0.0001, "num_tokens": 7987946.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.584210753440857, "sampling/importance_sampling_ratio/mean": 1.0005481243133545, "sampling/importance_sampling_ratio/min": 0.655351996421814, "sampling/sampling_logp_difference/max": 0.4600863456726074, "sampling/sampling_logp_difference/mean": 0.010919488966464996, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 84.90625, "completions/mean_terminated_length": 84.90625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.296769917011261, "epoch": 0.8831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 6.735124506983188, "kl": 0.016414642333984375, "learning_rate": 9.565210919190763e-07, "loss": 0.015, "num_tokens": 8003700.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9363287687301636, "sampling/importance_sampling_ratio/mean": 0.9987481236457825, "sampling/importance_sampling_ratio/min": 0.7225105166435242, "sampling/sampling_logp_difference/max": 0.6607937812805176, "sampling/sampling_logp_difference/mean": 0.015974685549736023, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 38.03125, "completions/mean_terminated_length": 38.03125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.19154030084609985, "epoch": 0.8849557522123894, "frac_reward_zero_std": 1.0, "grad_norm": 0.08183335721489599, "kl": 0.007817339152097702, "learning_rate": 9.562055656669987e-07, "loss": 0.0001, "num_tokens": 8020886.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.598867416381836, "sampling/importance_sampling_ratio/mean": 1.0001122951507568, "sampling/importance_sampling_ratio/min": 0.5727236270904541, "sampling/sampling_logp_difference/max": 0.5573520660400391, "sampling/sampling_logp_difference/mean": 0.013924205675721169, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 97.796875, "completions/mean_terminated_length": 97.796875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3268697261810303, "epoch": 0.8867256637168142, "frac_reward_zero_std": 1.0, "grad_norm": 0.029052867098632782, "kl": 0.007557909470051527, "learning_rate": 9.558889510905835e-07, "loss": 0.0001, "num_tokens": 8039145.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.647871494293213, "sampling/importance_sampling_ratio/mean": 1.000300407409668, "sampling/importance_sampling_ratio/min": 0.6561062932014465, "sampling/sampling_logp_difference/max": 0.49948447942733765, "sampling/sampling_logp_difference/mean": 0.013082124292850494, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 94.796875, "completions/mean_terminated_length": 94.796875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.43946853280067444, "epoch": 0.8884955752212389, "frac_reward_zero_std": 1.0, "grad_norm": 0.04044782874748557, "kl": 0.058621086180210114, "learning_rate": 9.555712489451464e-07, "loss": 0.0002, "num_tokens": 8056556.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5867271423339844, "sampling/importance_sampling_ratio/mean": 0.9995195269584656, "sampling/importance_sampling_ratio/min": 0.6893123984336853, "sampling/sampling_logp_difference/max": 0.4616734981536865, "sampling/sampling_logp_difference/mean": 0.01584906503558159, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 105.546875, "completions/mean_terminated_length": 105.546875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5053263306617737, "epoch": 0.8902654867256637, "frac_reward_zero_std": 0.75, "grad_norm": 3.045677652748138, "kl": 0.2734725773334503, "learning_rate": 9.55252459988598e-07, "loss": 0.0963, "num_tokens": 8073807.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.57107675075531, "sampling/importance_sampling_ratio/mean": 0.9994295239448547, "sampling/importance_sampling_ratio/min": 0.5382452011108398, "sampling/sampling_logp_difference/max": 0.619441032409668, "sampling/sampling_logp_difference/mean": 0.018887978047132492, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 61.453125, "completions/mean_terminated_length": 61.453125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.23520615696907043, "epoch": 0.8920353982300885, "frac_reward_zero_std": 1.0, "grad_norm": 0.044142626143724856, "kl": 0.009435078129172325, "learning_rate": 9.549325849814418e-07, "loss": 0.0001, "num_tokens": 8089500.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4041417837142944, "sampling/importance_sampling_ratio/mean": 0.9998427629470825, "sampling/importance_sampling_ratio/min": 0.6066602468490601, "sampling/sampling_logp_difference/max": 0.499786376953125, "sampling/sampling_logp_difference/mean": 0.013650336302816868, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 128.921875, "completions/mean_terminated_length": 128.921875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3584873080253601, "epoch": 0.8938053097345132, "frac_reward_zero_std": 0.75, "grad_norm": 3.164030357296124, "kl": 0.006217114627361298, "learning_rate": 9.546116246867713e-07, "loss": 0.3262, "num_tokens": 8108295.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.430129885673523, "sampling/importance_sampling_ratio/mean": 1.0008875131607056, "sampling/importance_sampling_ratio/min": 0.6261174082756042, "sampling/sampling_logp_difference/max": 0.4682173728942871, "sampling/sampling_logp_difference/mean": 0.013849091716110706, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 48.671875, "completions/mean_terminated_length": 48.671875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.19223593175411224, "epoch": 0.8955752212389381, "frac_reward_zero_std": 0.75, "grad_norm": 9.298491366271188, "kl": 0.012916343286633492, "learning_rate": 9.542895798702701e-07, "loss": -0.1124, "num_tokens": 8121762.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.254530429840088, "sampling/importance_sampling_ratio/mean": 1.0021955966949463, "sampling/importance_sampling_ratio/min": 0.6728146076202393, "sampling/sampling_logp_difference/max": 0.3962855339050293, "sampling/sampling_logp_difference/mean": 0.012941524386405945, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 109.03125, "completions/mean_terminated_length": 109.03125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.4453309178352356, "epoch": 0.8973451327433628, "frac_reward_zero_std": 1.0, "grad_norm": 0.03895957066929234, "kl": 0.01071190182119608, "learning_rate": 9.539664513002084e-07, "loss": 0.0001, "num_tokens": 8138468.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3871991634368896, "sampling/importance_sampling_ratio/mean": 1.0000834465026855, "sampling/importance_sampling_ratio/min": 0.5638471245765686, "sampling/sampling_logp_difference/max": 0.5729721784591675, "sampling/sampling_logp_difference/mean": 0.016829367727041245, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 83.03125, "completions/mean_terminated_length": 83.03125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.34089598059654236, "epoch": 0.8991150442477877, "frac_reward_zero_std": 1.0, "grad_norm": 0.044027304289342664, "kl": 0.009923753328621387, "learning_rate": 9.536422397474418e-07, "loss": 0.0001, "num_tokens": 8153046.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4149080514907837, "sampling/importance_sampling_ratio/mean": 1.0008606910705566, "sampling/importance_sampling_ratio/min": 0.6992542147636414, "sampling/sampling_logp_difference/max": 0.3577408790588379, "sampling/sampling_logp_difference/mean": 0.013866678811609745, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 96.796875, "completions/mean_terminated_length": 96.796875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.3181552290916443, "epoch": 0.9008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.035081746701379146, "kl": 0.009072013199329376, "learning_rate": 9.533169459854098e-07, "loss": 0.0001, "num_tokens": 8169865.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.546746850013733, "sampling/importance_sampling_ratio/mean": 0.9996622204780579, "sampling/importance_sampling_ratio/min": 0.7007312774658203, "sampling/sampling_logp_difference/max": 0.4361538887023926, "sampling/sampling_logp_difference/mean": 0.012515220791101456, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 33.640625, "completions/mean_terminated_length": 33.640625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.16498476266860962, "epoch": 0.9026548672566371, "frac_reward_zero_std": 1.0, "grad_norm": 0.04012191510593879, "kl": 0.005203623324632645, "learning_rate": 9.529905707901333e-07, "loss": 0.0001, "num_tokens": 8182402.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2919236421585083, "sampling/importance_sampling_ratio/mean": 0.9999414086341858, "sampling/importance_sampling_ratio/min": 0.5896210074424744, "sampling/sampling_logp_difference/max": 0.5282753705978394, "sampling/sampling_logp_difference/mean": 0.008416219614446163, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 113.21875, "completions/mean_terminated_length": 113.21875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.3625057339668274, "epoch": 0.904424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 3.568032198967511, "kl": 0.006654072552919388, "learning_rate": 9.526631149402134e-07, "loss": 0.0482, "num_tokens": 8201504.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3120354413986206, "sampling/importance_sampling_ratio/mean": 1.0005406141281128, "sampling/importance_sampling_ratio/min": 0.5685505867004395, "sampling/sampling_logp_difference/max": 0.5646649599075317, "sampling/sampling_logp_difference/mean": 0.013463255017995834, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 114.828125, "completions/mean_terminated_length": 114.828125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3808782994747162, "epoch": 0.9061946902654867, "frac_reward_zero_std": 1.0, "grad_norm": 0.036424015039903675, "kl": 0.009641697630286217, "learning_rate": 9.523345792168288e-07, "loss": 0.0001, "num_tokens": 8218661.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5945333242416382, "sampling/importance_sampling_ratio/mean": 0.9990466237068176, "sampling/importance_sampling_ratio/min": 0.498073011636734, "sampling/sampling_logp_difference/max": 0.6970086097717285, "sampling/sampling_logp_difference/mean": 0.015809332951903343, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 89.890625, "completions/mean_terminated_length": 89.890625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.36214619874954224, "epoch": 0.9079646017699115, "frac_reward_zero_std": 1.0, "grad_norm": 0.02469736712140168, "kl": 0.009469065815210342, "learning_rate": 9.520049644037347e-07, "loss": 0.0001, "num_tokens": 8235598.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2883288860321045, "sampling/importance_sampling_ratio/mean": 1.0004262924194336, "sampling/importance_sampling_ratio/min": 0.5304144620895386, "sampling/sampling_logp_difference/max": 0.634096622467041, "sampling/sampling_logp_difference/mean": 0.011682311072945595, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 88.03125, "completions/mean_terminated_length": 88.03125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.44736549258232117, "epoch": 0.9097345132743363, "frac_reward_zero_std": 1.0, "grad_norm": 0.036895631890060225, "kl": 0.016795285046100616, "learning_rate": 9.516742712872605e-07, "loss": 0.0002, "num_tokens": 8259312.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6561634540557861, "sampling/importance_sampling_ratio/mean": 0.9994844198226929, "sampling/importance_sampling_ratio/min": 0.5980254411697388, "sampling/sampling_logp_difference/max": 0.5141220092773438, "sampling/sampling_logp_difference/mean": 0.017143195495009422, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 140.765625, "completions/mean_terminated_length": 140.765625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.46250495314598083, "epoch": 0.911504424778761, "frac_reward_zero_std": 0.75, "grad_norm": 2.5997216064046285, "kl": 0.008927903138101101, "learning_rate": 9.513425006563078e-07, "loss": -0.1194, "num_tokens": 8278049.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4426480531692505, "sampling/importance_sampling_ratio/mean": 1.0001449584960938, "sampling/importance_sampling_ratio/min": 0.6405063271522522, "sampling/sampling_logp_difference/max": 0.4454963207244873, "sampling/sampling_logp_difference/mean": 0.015760906040668488, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 96.828125, "completions/mean_terminated_length": 96.828125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.5690308809280396, "epoch": 0.9132743362831859, "frac_reward_zero_std": 1.0, "grad_norm": 0.0331973440363682, "kl": 0.009910114109516144, "learning_rate": 9.51009653302349e-07, "loss": 0.0001, "num_tokens": 8296374.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6122362613677979, "sampling/importance_sampling_ratio/mean": 0.9999167323112488, "sampling/importance_sampling_ratio/min": 0.5489570498466492, "sampling/sampling_logp_difference/max": 0.5997350215911865, "sampling/sampling_logp_difference/mean": 0.019404888153076172, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 154.234375, "completions/mean_terminated_length": 154.234375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.36337944865226746, "epoch": 0.9150442477876106, "frac_reward_zero_std": 0.75, "grad_norm": 1.664883462583671, "kl": 0.014331409707665443, "learning_rate": 9.506757300194248e-07, "loss": -0.0603, "num_tokens": 8318053.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5481709241867065, "sampling/importance_sampling_ratio/mean": 0.9998996257781982, "sampling/importance_sampling_ratio/min": 0.6269489526748657, "sampling/sampling_logp_difference/max": 0.4668901562690735, "sampling/sampling_logp_difference/mean": 0.012643365189433098, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 139.34375, "completions/mean_terminated_length": 139.34375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.43411684036254883, "epoch": 0.9168141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 2.140124683083626, "kl": 0.018226414918899536, "learning_rate": 9.50340731604143e-07, "loss": 0.2147, "num_tokens": 8336539.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.62480890750885, "sampling/importance_sampling_ratio/mean": 0.9991416931152344, "sampling/importance_sampling_ratio/min": 0.236353799700737, "sampling/sampling_logp_difference/max": 1.4424254894256592, "sampling/sampling_logp_difference/mean": 0.015893306583166122, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 95.546875, "completions/mean_terminated_length": 95.546875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.5186383128166199, "epoch": 0.9185840707964602, "frac_reward_zero_std": 1.0, "grad_norm": 0.039527739559405944, "kl": 0.011614596471190453, "learning_rate": 9.500046588556761e-07, "loss": 0.0001, "num_tokens": 8352046.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996851682662964, "sampling/importance_sampling_ratio/min": 0.7078847289085388, "sampling/sampling_logp_difference/max": 0.9716105461120605, "sampling/sampling_logp_difference/mean": 0.017761271446943283, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 107.421875, "completions/mean_terminated_length": 107.421875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3980678915977478, "epoch": 0.9203539823008849, "frac_reward_zero_std": 1.0, "grad_norm": 0.03050835034680889, "kl": 0.011641588993370533, "learning_rate": 9.496675125757594e-07, "loss": 0.0001, "num_tokens": 8368809.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9738656282424927, "sampling/importance_sampling_ratio/mean": 1.0001617670059204, "sampling/importance_sampling_ratio/min": 0.6098880767822266, "sampling/sampling_logp_difference/max": 0.6799938678741455, "sampling/sampling_logp_difference/mean": 0.014947582967579365, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 140.09375, "completions/mean_terminated_length": 140.09375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.42774447798728943, "epoch": 0.9221238938053097, "frac_reward_zero_std": 0.5, "grad_norm": 3.4898741180099533, "kl": 0.00820097140967846, "learning_rate": 9.493292935686894e-07, "loss": 0.261, "num_tokens": 8387039.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.515367865562439, "sampling/importance_sampling_ratio/mean": 1.0005155801773071, "sampling/importance_sampling_ratio/min": 0.6541255116462708, "sampling/sampling_logp_difference/max": 0.4244561195373535, "sampling/sampling_logp_difference/mean": 0.01558619737625122, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 118.859375, "completions/mean_terminated_length": 118.859375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.499830037355423, "epoch": 0.9238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0253552589765691, "kl": 0.009436726570129395, "learning_rate": 9.489900026413216e-07, "loss": 0.0001, "num_tokens": 8404118.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5663814544677734, "sampling/importance_sampling_ratio/mean": 1.0002120733261108, "sampling/importance_sampling_ratio/min": 0.4994365870952606, "sampling/sampling_logp_difference/max": 0.6942746639251709, "sampling/sampling_logp_difference/mean": 0.018683871254324913, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 91.046875, "completions/mean_terminated_length": 91.046875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4420931935310364, "epoch": 0.9256637168141593, "frac_reward_zero_std": 1.0, "grad_norm": 0.032238059689454056, "kl": 0.009348719380795956, "learning_rate": 9.486496406030685e-07, "loss": 0.0001, "num_tokens": 8421017.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4921983480453491, "sampling/importance_sampling_ratio/mean": 0.9986906051635742, "sampling/importance_sampling_ratio/min": 0.7028223276138306, "sampling/sampling_logp_difference/max": 0.4002504348754883, "sampling/sampling_logp_difference/mean": 0.01746968738734722, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 42.96875, "completions/mean_terminated_length": 42.96875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.21450337767601013, "epoch": 0.9274336283185841, "frac_reward_zero_std": 1.0, "grad_norm": 0.06758275124603391, "kl": 0.006579137407243252, "learning_rate": 9.483082082658982e-07, "loss": 0.0001, "num_tokens": 8439863.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.434173345565796, "sampling/importance_sampling_ratio/mean": 0.9990978240966797, "sampling/importance_sampling_ratio/min": 0.5833903551101685, "sampling/sampling_logp_difference/max": 0.5388987064361572, "sampling/sampling_logp_difference/mean": 0.012227091938257217, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 122.5, "completions/mean_terminated_length": 122.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.42718803882598877, "epoch": 0.9292035398230089, "frac_reward_zero_std": 0.75, "grad_norm": 3.389959562645312, "kl": 0.010070500895380974, "learning_rate": 9.479657064443321e-07, "loss": -0.1146, "num_tokens": 8459079.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6004486083984375, "sampling/importance_sampling_ratio/mean": 1.0002415180206299, "sampling/importance_sampling_ratio/min": 0.6741685271263123, "sampling/sampling_logp_difference/max": 0.47028398513793945, "sampling/sampling_logp_difference/mean": 0.016869788989424706, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 95.09375, "completions/mean_terminated_length": 95.09375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5120389461517334, "epoch": 0.9309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 2.8066848398685473, "kl": 0.02020437829196453, "learning_rate": 9.476221359554423e-07, "loss": 0.0886, "num_tokens": 8477517.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.464606523513794, "sampling/importance_sampling_ratio/mean": 0.9992693662643433, "sampling/importance_sampling_ratio/min": 0.7041698694229126, "sampling/sampling_logp_difference/max": 0.38158655166625977, "sampling/sampling_logp_difference/mean": 0.017616450786590576, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 70.546875, "completions/mean_terminated_length": 70.546875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.260747492313385, "epoch": 0.9327433628318584, "frac_reward_zero_std": 1.0, "grad_norm": 0.030913574652803144, "kl": 0.006850100588053465, "learning_rate": 9.472774976188513e-07, "loss": 0.0001, "num_tokens": 8493760.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6097594499588013, "sampling/importance_sampling_ratio/mean": 1.001157522201538, "sampling/importance_sampling_ratio/min": 0.7061507105827332, "sampling/sampling_logp_difference/max": 0.47608476877212524, "sampling/sampling_logp_difference/mean": 0.009453845210373402, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 37.4375, "completions/mean_terminated_length": 37.4375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.18946631252765656, "epoch": 0.9345132743362832, "frac_reward_zero_std": 1.0, "grad_norm": 0.06875360822757838, "kl": 0.009271876886487007, "learning_rate": 9.469317922567286e-07, "loss": 0.0002, "num_tokens": 8512924.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5250033140182495, "sampling/importance_sampling_ratio/mean": 1.0006039142608643, "sampling/importance_sampling_ratio/min": 0.6538918614387512, "sampling/sampling_logp_difference/max": 0.42481327056884766, "sampling/sampling_logp_difference/mean": 0.010327961295843124, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 70.53125, "completions/mean_terminated_length": 70.53125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.28202685713768005, "epoch": 0.9362831858407079, "frac_reward_zero_std": 0.75, "grad_norm": 11.06847389682932, "kl": 0.01626504212617874, "learning_rate": 9.465850206937887e-07, "loss": 0.1768, "num_tokens": 8530238.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993406534194946, "sampling/importance_sampling_ratio/min": 0.6998425126075745, "sampling/sampling_logp_difference/max": 0.7249191999435425, "sampling/sampling_logp_difference/mean": 0.012953529134392738, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 62.25, "completions/mean_terminated_length": 62.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2565949559211731, "epoch": 0.9380530973451328, "frac_reward_zero_std": 1.0, "grad_norm": 0.03577417046315069, "kl": 0.0072803376242518425, "learning_rate": 9.462371837572906e-07, "loss": 0.0001, "num_tokens": 8545950.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.40841805934906, "sampling/importance_sampling_ratio/mean": 1.0001753568649292, "sampling/importance_sampling_ratio/min": 0.691235363483429, "sampling/sampling_logp_difference/max": 0.3692748546600342, "sampling/sampling_logp_difference/mean": 0.010468710213899612, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 60.609375, "completions/mean_terminated_length": 60.609375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.26018762588500977, "epoch": 0.9398230088495575, "frac_reward_zero_std": 1.0, "grad_norm": 0.05116817286548551, "kl": 0.010109255090355873, "learning_rate": 9.45888282277034e-07, "loss": 0.0001, "num_tokens": 8566389.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2612920999526978, "sampling/importance_sampling_ratio/mean": 1.0020697116851807, "sampling/importance_sampling_ratio/min": 0.5850200653076172, "sampling/sampling_logp_difference/max": 0.5361090898513794, "sampling/sampling_logp_difference/mean": 0.013840936124324799, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 66.421875, "completions/mean_terminated_length": 66.421875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2860739529132843, "epoch": 0.9415929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.03885608048943523, "kl": 0.008966992609202862, "learning_rate": 9.455383170853585e-07, "loss": 0.0001, "num_tokens": 8582464.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9738166332244873, "sampling/importance_sampling_ratio/mean": 0.9996252059936523, "sampling/importance_sampling_ratio/min": 0.5545295476913452, "sampling/sampling_logp_difference/max": 0.679969072341919, "sampling/sampling_logp_difference/mean": 0.01339352410286665, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 108.578125, "completions/mean_terminated_length": 108.578125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.35880762338638306, "epoch": 0.9433628318584071, "frac_reward_zero_std": 1.0, "grad_norm": 0.05997824834894199, "kl": 0.016652042046189308, "learning_rate": 9.451872890171419e-07, "loss": 0.0002, "num_tokens": 8609493.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.435707449913025, "sampling/importance_sampling_ratio/mean": 0.999294638633728, "sampling/importance_sampling_ratio/min": 0.47342342138290405, "sampling/sampling_logp_difference/max": 0.747765064239502, "sampling/sampling_logp_difference/mean": 0.013728786259889603, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 88.546875, "completions/mean_terminated_length": 88.546875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.45613330602645874, "epoch": 0.9451327433628318, "frac_reward_zero_std": 1.0, "grad_norm": 0.09313465584163444, "kl": 0.015598511323332787, "learning_rate": 9.448351989097962e-07, "loss": 0.0002, "num_tokens": 8626280.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3734320402145386, "sampling/importance_sampling_ratio/mean": 0.9996883869171143, "sampling/importance_sampling_ratio/min": 0.5861338376998901, "sampling/sampling_logp_difference/max": 0.5342071056365967, "sampling/sampling_logp_difference/mean": 0.014164279215037823, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 70.375, "completions/mean_terminated_length": 70.375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.32646340131759644, "epoch": 0.9469026548672567, "frac_reward_zero_std": 1.0, "grad_norm": 0.03490572895714704, "kl": 0.00837424024939537, "learning_rate": 9.444820476032685e-07, "loss": 0.0001, "num_tokens": 8642800.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.33780038356781, "sampling/importance_sampling_ratio/mean": 0.9992144107818604, "sampling/importance_sampling_ratio/min": 0.6560214757919312, "sampling/sampling_logp_difference/max": 0.42156171798706055, "sampling/sampling_logp_difference/mean": 0.010708148591220379, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 111.046875, "completions/mean_terminated_length": 111.046875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5900201201438904, "epoch": 0.9486725663716814, "frac_reward_zero_std": 1.0, "grad_norm": 0.0580239595566989, "kl": 0.018893791362643242, "learning_rate": 9.441278359400364e-07, "loss": 0.0002, "num_tokens": 8659587.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4816340208053589, "sampling/importance_sampling_ratio/mean": 1.0002975463867188, "sampling/importance_sampling_ratio/min": 0.6631470322608948, "sampling/sampling_logp_difference/max": 0.41075849533081055, "sampling/sampling_logp_difference/mean": 0.021431345492601395, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 59.03125, "completions/mean_terminated_length": 59.03125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2766770124435425, "epoch": 0.9504424778761061, "frac_reward_zero_std": 1.0, "grad_norm": 0.08836968551118032, "kl": 0.00837495643645525, "learning_rate": 9.437725647651078e-07, "loss": 0.0001, "num_tokens": 8674293.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.759122371673584, "sampling/importance_sampling_ratio/mean": 0.9989175200462341, "sampling/importance_sampling_ratio/min": 0.5930877923965454, "sampling/sampling_logp_difference/max": 0.5648150444030762, "sampling/sampling_logp_difference/mean": 0.016760773956775665, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3292761445045471, "epoch": 0.952212389380531, "frac_reward_zero_std": 1.0, "grad_norm": 0.03191293281770827, "kl": 0.007732439786195755, "learning_rate": 9.434162349260178e-07, "loss": 0.0001, "num_tokens": 8688741.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.281243920326233, "sampling/importance_sampling_ratio/mean": 0.9997268915176392, "sampling/importance_sampling_ratio/min": 0.7160525321960449, "sampling/sampling_logp_difference/max": 0.3340017795562744, "sampling/sampling_logp_difference/mean": 0.0122067891061306, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 76.296875, "completions/mean_terminated_length": 76.296875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.35782527923583984, "epoch": 0.9539823008849557, "frac_reward_zero_std": 0.75, "grad_norm": 10.676094231660771, "kl": 0.010862261056900024, "learning_rate": 9.430588472728269e-07, "loss": 0.0578, "num_tokens": 8704760.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0010135173797607, "sampling/importance_sampling_ratio/min": 0.6073746085166931, "sampling/sampling_logp_difference/max": 0.8597202301025391, "sampling/sampling_logp_difference/mean": 0.013356505893170834, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 74.0, "completions/mean_terminated_length": 74.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3447580337524414, "epoch": 0.9557522123893806, "frac_reward_zero_std": 0.75, "grad_norm": 11.105585897945515, "kl": 0.013816621154546738, "learning_rate": 9.427004026581196e-07, "loss": 0.1718, "num_tokens": 8719640.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.652379035949707, "sampling/importance_sampling_ratio/mean": 1.001023769378662, "sampling/importance_sampling_ratio/min": 0.7009364366531372, "sampling/sampling_logp_difference/max": 0.502216100692749, "sampling/sampling_logp_difference/mean": 0.013380978256464005, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 123.703125, "completions/mean_terminated_length": 123.703125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3127603530883789, "epoch": 0.9575221238938053, "frac_reward_zero_std": 0.75, "grad_norm": 5.913535613821327, "kl": 0.015127960592508316, "learning_rate": 9.423409019370014e-07, "loss": -0.202, "num_tokens": 8738533.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.567948818206787, "sampling/importance_sampling_ratio/mean": 0.9999661445617676, "sampling/importance_sampling_ratio/min": 0.6265570521354675, "sampling/sampling_logp_difference/max": 0.4675154685974121, "sampling/sampling_logp_difference/mean": 0.012319693341851234, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 87.265625, "completions/mean_terminated_length": 87.265625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4443648159503937, "epoch": 0.95929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.08539988253972454, "kl": 0.018398810178041458, "learning_rate": 9.419803459670979e-07, "loss": 0.0002, "num_tokens": 8754758.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.574440360069275, "sampling/importance_sampling_ratio/mean": 0.9996380805969238, "sampling/importance_sampling_ratio/min": 0.5782991051673889, "sampling/sampling_logp_difference/max": 0.5476641654968262, "sampling/sampling_logp_difference/mean": 0.015303626656532288, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 105.0625, "completions/mean_terminated_length": 105.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.5941020250320435, "epoch": 0.9610619469026549, "frac_reward_zero_std": 1.0, "grad_norm": 0.04467205979015622, "kl": 0.015284831635653973, "learning_rate": 9.416187356085512e-07, "loss": 0.0002, "num_tokens": 8770970.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3732892274856567, "sampling/importance_sampling_ratio/mean": 1.0000840425491333, "sampling/importance_sampling_ratio/min": 0.27647536993026733, "sampling/sampling_logp_difference/max": 1.2856335639953613, "sampling/sampling_logp_difference/mean": 0.020597338676452637, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 135.796875, "completions/mean_terminated_length": 135.796875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.4649226665496826, "epoch": 0.9628318584070796, "frac_reward_zero_std": 1.0, "grad_norm": 0.0357517830995541, "kl": 0.014059186913073063, "learning_rate": 9.412560717240195e-07, "loss": 0.0001, "num_tokens": 8789613.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002797842025757, "sampling/importance_sampling_ratio/min": 0.5656505823135376, "sampling/sampling_logp_difference/max": 0.6993000507354736, "sampling/sampling_logp_difference/mean": 0.017499998211860657, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 33.875, "completions/mean_terminated_length": 33.875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1782006323337555, "epoch": 0.9646017699115044, "frac_reward_zero_std": 1.0, "grad_norm": 0.09206582105872552, "kl": 0.01672673039138317, "learning_rate": 9.408923551786742e-07, "loss": 0.0002, "num_tokens": 8804885.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4249764680862427, "sampling/importance_sampling_ratio/mean": 0.9994561672210693, "sampling/importance_sampling_ratio/min": 0.7034332752227783, "sampling/sampling_logp_difference/max": 0.3541553020477295, "sampling/sampling_logp_difference/mean": 0.011499391868710518, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 41.890625, "completions/mean_terminated_length": 41.890625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1523999273777008, "epoch": 0.9663716814159292, "frac_reward_zero_std": 1.0, "grad_norm": 0.037107556403945, "kl": 0.0039587770588696, "learning_rate": 9.405275868401974e-07, "loss": 0.0001, "num_tokens": 8821678.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4257996082305908, "sampling/importance_sampling_ratio/mean": 0.999606728553772, "sampling/importance_sampling_ratio/min": 0.725745677947998, "sampling/sampling_logp_difference/max": 0.3547327518463135, "sampling/sampling_logp_difference/mean": 0.005943705327808857, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 116.28125, "completions/mean_terminated_length": 116.28125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4968239665031433, "epoch": 0.968141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 6.457847220461041, "kl": 0.021092496812343597, "learning_rate": 9.40161767578781e-07, "loss": 0.1605, "num_tokens": 8838672.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.600094199180603, "sampling/importance_sampling_ratio/mean": 1.0008270740509033, "sampling/importance_sampling_ratio/min": 0.26678600907325745, "sampling/sampling_logp_difference/max": 1.3213083744049072, "sampling/sampling_logp_difference/mean": 0.018169770017266273, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2445429116487503, "epoch": 0.9699115044247788, "frac_reward_zero_std": 1.0, "grad_norm": 0.07124112019982765, "kl": 0.010958978906273842, "learning_rate": 9.397948982671236e-07, "loss": 0.0001, "num_tokens": 8852992.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6126388311386108, "sampling/importance_sampling_ratio/mean": 1.0011223554611206, "sampling/importance_sampling_ratio/min": 0.7174001336097717, "sampling/sampling_logp_difference/max": 0.4778718948364258, "sampling/sampling_logp_difference/mean": 0.010054649785161018, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 82.65625, "completions/mean_terminated_length": 82.65625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4400416612625122, "epoch": 0.9716814159292035, "frac_reward_zero_std": 0.75, "grad_norm": 6.758155387149215, "kl": 0.01893545500934124, "learning_rate": 9.394269797804288e-07, "loss": 0.1139, "num_tokens": 8870970.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4193644523620605, "sampling/importance_sampling_ratio/mean": 0.999426007270813, "sampling/importance_sampling_ratio/min": 0.4615151286125183, "sampling/sampling_logp_difference/max": 0.7732404470443726, "sampling/sampling_logp_difference/mean": 0.015387221239507198, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 92.0625, "completions/mean_terminated_length": 92.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4796140491962433, "epoch": 0.9734513274336283, "frac_reward_zero_std": 0.5, "grad_norm": 9.630481719584537, "kl": 0.016388213261961937, "learning_rate": 9.390580129964035e-07, "loss": 0.2398, "num_tokens": 8886062.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8056321144104004, "sampling/importance_sampling_ratio/mean": 0.9994021654129028, "sampling/importance_sampling_ratio/min": 0.6967775821685791, "sampling/sampling_logp_difference/max": 0.5909106731414795, "sampling/sampling_logp_difference/mean": 0.02025291696190834, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 49.71875, "completions/mean_terminated_length": 49.71875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.21508081257343292, "epoch": 0.9752212389380531, "frac_reward_zero_std": 1.0, "grad_norm": 0.066991866362209, "kl": 0.013176451437175274, "learning_rate": 9.386879987952549e-07, "loss": 0.0001, "num_tokens": 8899756.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4295161962509155, "sampling/importance_sampling_ratio/mean": 1.0003056526184082, "sampling/importance_sampling_ratio/min": 0.7088531851768494, "sampling/sampling_logp_difference/max": 0.35733604431152344, "sampling/sampling_logp_difference/mean": 0.009776454418897629, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 61.96875, "completions/mean_terminated_length": 61.96875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.325038880109787, "epoch": 0.9769911504424779, "frac_reward_zero_std": 0.75, "grad_norm": 13.860276512203823, "kl": 0.04639158397912979, "learning_rate": 9.383169380596892e-07, "loss": 0.1206, "num_tokens": 8916202.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.8170350790023804, "sampling/importance_sampling_ratio/mean": 0.9982390403747559, "sampling/importance_sampling_ratio/min": 0.6422437429428101, "sampling/sampling_logp_difference/max": 0.5972061157226562, "sampling/sampling_logp_difference/mean": 0.014898296445608139, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 76.328125, "completions/mean_terminated_length": 76.328125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2448478639125824, "epoch": 0.9787610619469026, "frac_reward_zero_std": 1.0, "grad_norm": 0.10604575756165804, "kl": 0.0240221805870533, "learning_rate": 9.37944831674909e-07, "loss": 0.0002, "num_tokens": 8930895.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4379768371582031, "sampling/importance_sampling_ratio/mean": 0.9995517134666443, "sampling/importance_sampling_ratio/min": 0.6523293852806091, "sampling/sampling_logp_difference/max": 0.4272056221961975, "sampling/sampling_logp_difference/mean": 0.010927535593509674, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 66.859375, "completions/mean_terminated_length": 66.859375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.26345235109329224, "epoch": 0.9805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 9.897098778736009, "kl": 0.02797047048807144, "learning_rate": 9.37571680528612e-07, "loss": -0.0969, "num_tokens": 8945126.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0015347003936768, "sampling/importance_sampling_ratio/min": 0.5551202893257141, "sampling/sampling_logp_difference/max": 0.7081301212310791, "sampling/sampling_logp_difference/mean": 0.010650092735886574, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 65.53125, "completions/mean_terminated_length": 65.53125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2612670361995697, "epoch": 0.9823008849557522, "frac_reward_zero_std": 1.0, "grad_norm": 0.09061518316154932, "kl": 0.015878070145845413, "learning_rate": 9.371974855109874e-07, "loss": 0.0002, "num_tokens": 8962232.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5268092155456543, "sampling/importance_sampling_ratio/mean": 1.0007729530334473, "sampling/importance_sampling_ratio/min": 0.6973515152931213, "sampling/sampling_logp_difference/max": 0.42318010330200195, "sampling/sampling_logp_difference/mean": 0.011717341840267181, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.10745927691459656, "epoch": 0.984070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 23.680701551151667, "kl": 0.20216985046863556, "learning_rate": 9.368222475147153e-07, "loss": -0.623, "num_tokens": 8973160.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8384283781051636, "sampling/importance_sampling_ratio/mean": 1.0007233619689941, "sampling/importance_sampling_ratio/min": 0.7110517024993896, "sampling/sampling_logp_difference/max": 0.6089110374450684, "sampling/sampling_logp_difference/mean": 0.01067114993929863, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.015625, "completions/mean_terminated_length": 17.015625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.06491994112730026, "epoch": 0.9858407079646018, "frac_reward_zero_std": 1.0, "grad_norm": 0.2432899230676877, "kl": 0.1308480203151703, "learning_rate": 9.36445967434964e-07, "loss": 0.0012, "num_tokens": 8989801.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.1004294157028198, "sampling/importance_sampling_ratio/mean": 1.0001790523529053, "sampling/importance_sampling_ratio/min": 0.9071494936943054, "sampling/sampling_logp_difference/max": 0.09744799137115479, "sampling/sampling_logp_difference/mean": 0.0038687689229846, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 91.8125, "completions/mean_terminated_length": 91.8125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.34381309151649475, "epoch": 0.9876106194690265, "frac_reward_zero_std": 0.75, "grad_norm": 3.125285207917033, "kl": 0.02384905517101288, "learning_rate": 9.360686461693872e-07, "loss": 0.1224, "num_tokens": 9005949.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4320323467254639, "sampling/importance_sampling_ratio/mean": 1.0000149011611938, "sampling/importance_sampling_ratio/min": 0.7160810232162476, "sampling/sampling_logp_difference/max": 0.35909461975097656, "sampling/sampling_logp_difference/mean": 0.01379399560391903, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 23.640625, "completions/mean_terminated_length": 23.640625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.25869446992874146, "epoch": 0.9893805309734514, "frac_reward_zero_std": 0.5, "grad_norm": 14.470531190670112, "kl": 0.21281038224697113, "learning_rate": 9.356902846181228e-07, "loss": 0.1498, "num_tokens": 9019494.0, "reward": 0.375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.56514573097229, "sampling/importance_sampling_ratio/mean": 1.0025410652160645, "sampling/importance_sampling_ratio/min": 0.7757983803749084, "sampling/sampling_logp_difference/max": 0.4479789733886719, "sampling/sampling_logp_difference/mean": 0.012579815462231636, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 61.671875, "completions/mean_terminated_length": 61.671875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.32114139199256897, "epoch": 0.9911504424778761, "frac_reward_zero_std": 0.75, "grad_norm": 5.609035510367293, "kl": 0.018139678984880447, "learning_rate": 9.353108836837905e-07, "loss": 0.093, "num_tokens": 9036113.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6116080284118652, "sampling/importance_sampling_ratio/mean": 1.0010817050933838, "sampling/importance_sampling_ratio/min": 0.6868107914924622, "sampling/sampling_logp_difference/max": 0.4772324562072754, "sampling/sampling_logp_difference/mean": 0.012453297153115273, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.59375, "completions/mean_terminated_length": 14.59375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.07570166885852814, "epoch": 0.9929203539823008, "frac_reward_zero_std": 1.0, "grad_norm": 0.20701388280592103, "kl": 0.19794254004955292, "learning_rate": 9.349304442714895e-07, "loss": 0.0019, "num_tokens": 9050007.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2850689888000488, "sampling/importance_sampling_ratio/mean": 0.99912029504776, "sampling/importance_sampling_ratio/min": 0.8150334358215332, "sampling/sampling_logp_difference/max": 0.2508124113082886, "sampling/sampling_logp_difference/mean": 0.011712642386555672, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 52.640625, "completions/mean_terminated_length": 52.640625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.32283660769462585, "epoch": 0.9946902654867257, "frac_reward_zero_std": 0.75, "grad_norm": 8.951665755188321, "kl": 0.12831877171993256, "learning_rate": 9.345489672887962e-07, "loss": -0.1108, "num_tokens": 9063488.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4442304372787476, "sampling/importance_sampling_ratio/mean": 0.999579906463623, "sampling/importance_sampling_ratio/min": 0.6357463002204895, "sampling/sampling_logp_difference/max": 0.4529557228088379, "sampling/sampling_logp_difference/mean": 0.013744027353823185, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 39.609375, "completions/mean_terminated_length": 39.609375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.23001480102539062, "epoch": 0.9964601769911504, "frac_reward_zero_std": 0.75, "grad_norm": 21.980215852115183, "kl": 0.34417223930358887, "learning_rate": 9.341664536457625e-07, "loss": -0.1346, "num_tokens": 9075783.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994174838066101, "sampling/importance_sampling_ratio/min": 0.685285210609436, "sampling/sampling_logp_difference/max": 1.5750985145568848, "sampling/sampling_logp_difference/mean": 0.015202976763248444, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.828125, "completions/mean_terminated_length": 15.828125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.08043765276670456, "epoch": 0.9982300884955753, "frac_reward_zero_std": 1.0, "grad_norm": 0.11158896786330862, "kl": 0.08205146342515945, "learning_rate": 9.337829042549133e-07, "loss": 0.0008, "num_tokens": 9087580.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4533406496047974, "sampling/importance_sampling_ratio/mean": 1.0018446445465088, "sampling/importance_sampling_ratio/min": 0.6155814528465271, "sampling/sampling_logp_difference/max": 0.48518800735473633, "sampling/sampling_logp_difference/mean": 0.01030774973332882, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 53.546875, "completions/mean_terminated_length": 53.546875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.3179180324077606, "epoch": 1.0, "frac_reward_zero_std": 0.75, "grad_norm": 9.110451794643717, "kl": 0.07360747456550598, "learning_rate": 9.33398320031244e-07, "loss": -0.2468, "num_tokens": 9100799.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5650666952133179, "sampling/importance_sampling_ratio/mean": 0.9979598522186279, "sampling/importance_sampling_ratio/min": 0.6005945205688477, "sampling/sampling_logp_difference/max": 0.5098352432250977, "sampling/sampling_logp_difference/mean": 0.017205916345119476, "step": 565 } ], "logging_steps": 1, "max_steps": 2260, "num_input_tokens_seen": 9100799, "num_train_epochs": 4, "save_steps": 565, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }