Files
test_gin_rummy_qwen_2-5_3B/trainer_state.json
ModelHub XC 92d21beda9 初始化项目,由ModelHub XC社区提供模型
Model: bimabk/test_gin_rummy_qwen_2-5_3B
Source: Original Platform
2026-05-06 13:37:50 +08:00

1955 lines
79 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00075,
"eval_steps": 500,
"global_step": 75,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10096.0,
"completions/max_terminated_length": 10096.0,
"completions/mean_length": 8672.71875,
"completions/mean_terminated_length": 8672.71875,
"completions/min_length": 3020.0,
"completions/min_terminated_length": 3020.0,
"entropy": 0.49113161116838455,
"epoch": 1e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.241949200630188,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0633,
"num_tokens": 306152.0,
"reward": -0.4408680200576782,
"reward_std": 0.3989785313606262,
"rewards/rollout_eval_reward_func/mean": 0.11064532399177551,
"rewards/rollout_eval_reward_func/std": 0.21571724116802216,
"rewards/rollout_reward_func/mean": -0.4408680200576782,
"rewards/rollout_reward_func/std": 0.44763946533203125,
"sampling/importance_sampling_ratio/max": 1.2819759845733643,
"sampling/importance_sampling_ratio/mean": 0.9992397427558899,
"sampling/importance_sampling_ratio/min": 0.7715137004852295,
"sampling/sampling_logp_difference/max": 0.2594008445739746,
"sampling/sampling_logp_difference/mean": 0.01546277105808258,
"step": 1,
"step_time": 73.26994180099973
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.49113161116838455,
"epoch": 2e-05,
"grad_norm": 1.2400784492492676,
"kl": 0.0,
"learning_rate": 2.8571428571428573e-06,
"loss": -0.0633,
"step": 2,
"step_time": 30.109230951999052
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0005208333604969084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005208333604969084,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10009.0,
"completions/max_terminated_length": 10009.0,
"completions/mean_length": 7330.1875,
"completions/mean_terminated_length": 7330.1875,
"completions/min_length": 346.0,
"completions/min_terminated_length": 346.0,
"entropy": 0.5131296459585428,
"epoch": 3e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.102152943611145,
"kl": 0.0009028113518070313,
"learning_rate": 5.7142857142857145e-06,
"loss": -0.2347,
"num_tokens": 569740.0,
"reward": -0.48799318075180054,
"reward_std": 0.5598607063293457,
"rewards/rollout_eval_reward_func/mean": 0.22929370403289795,
"rewards/rollout_eval_reward_func/std": 0.26715749502182007,
"rewards/rollout_reward_func/mean": -0.48799318075180054,
"rewards/rollout_reward_func/std": 0.5559459924697876,
"sampling/importance_sampling_ratio/max": 1.2627520561218262,
"sampling/importance_sampling_ratio/mean": 1.0006182193756104,
"sampling/importance_sampling_ratio/min": 0.7627776861190796,
"sampling/sampling_logp_difference/max": 0.27078866958618164,
"sampling/sampling_logp_difference/mean": 0.014230873435735703,
"step": 3,
"step_time": 68.85090976999709
},
{
"clip_ratio/high_max": 0.0020833334419876337,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0010416667209938169,
"entropy": 0.5151741988956928,
"epoch": 4e-05,
"grad_norm": 1.0848904848098755,
"kl": 0.0004950130587531021,
"learning_rate": 8.571428571428573e-06,
"loss": -0.2336,
"step": 4,
"step_time": 28.428488818004553
},
{
"clip_ratio/high_max": 0.0010416667209938169,
"clip_ratio/high_mean": 0.0005208333604969084,
"clip_ratio/low_mean": 0.0005208333604969084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0010416667209938169,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10323.0,
"completions/max_terminated_length": 10323.0,
"completions/mean_length": 8267.125,
"completions/mean_terminated_length": 8267.125,
"completions/min_length": 1640.0,
"completions/min_terminated_length": 1640.0,
"entropy": 0.5123504158109426,
"epoch": 5e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1476984024047852,
"kl": 0.0007431179510604125,
"learning_rate": 1.1428571428571429e-05,
"loss": -0.0418,
"num_tokens": 862728.0,
"reward": -0.46075016260147095,
"reward_std": 0.5065791606903076,
"rewards/rollout_eval_reward_func/mean": 0.128683939576149,
"rewards/rollout_eval_reward_func/std": 0.2396152913570404,
"rewards/rollout_reward_func/mean": -0.46075016260147095,
"rewards/rollout_reward_func/std": 0.5104123950004578,
"sampling/importance_sampling_ratio/max": 1.3248213529586792,
"sampling/importance_sampling_ratio/mean": 1.0001360177993774,
"sampling/importance_sampling_ratio/min": 0.6914317011833191,
"sampling/sampling_logp_difference/max": 0.3689908981323242,
"sampling/sampling_logp_difference/mean": 0.016226449981331825,
"step": 5,
"step_time": 75.37122915000327
},
{
"clip_ratio/high_max": 0.0026041667442768812,
"clip_ratio/high_mean": 0.0013020833721384406,
"clip_ratio/low_mean": 0.0032900729565881193,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00459215632872656,
"entropy": 0.5106779877096415,
"epoch": 6e-05,
"grad_norm": 1.0145094394683838,
"kl": 0.0013804795053147245,
"learning_rate": 1.4285714285714285e-05,
"loss": -0.045,
"step": 6,
"step_time": 29.551835642994774
},
{
"clip_ratio/high_max": 0.0024003623984754086,
"clip_ratio/high_mean": 0.0012001811992377043,
"clip_ratio/low_mean": 0.0005208333604969084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017210145597346127,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10088.0,
"completions/max_terminated_length": 10088.0,
"completions/mean_length": 8518.21875,
"completions/mean_terminated_length": 8518.21875,
"completions/min_length": 4084.0,
"completions/min_terminated_length": 4084.0,
"entropy": 0.5038529355078936,
"epoch": 7e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5022886991500854,
"kl": 0.002840353590727318,
"learning_rate": 1.7142857142857145e-05,
"loss": -0.0036,
"num_tokens": 1164601.0,
"reward": -0.41255950927734375,
"reward_std": 0.46968239545822144,
"rewards/rollout_eval_reward_func/mean": 0.11216971278190613,
"rewards/rollout_eval_reward_func/std": 0.2204883098602295,
"rewards/rollout_reward_func/mean": -0.41255950927734375,
"rewards/rollout_reward_func/std": 0.5122336149215698,
"sampling/importance_sampling_ratio/max": 1.4158059358596802,
"sampling/importance_sampling_ratio/mean": 1.0018370151519775,
"sampling/importance_sampling_ratio/min": 0.7707551121711731,
"sampling/sampling_logp_difference/max": 0.3476989269256592,
"sampling/sampling_logp_difference/mean": 0.017664402723312378,
"step": 7,
"step_time": 77.99332059699736
},
{
"clip_ratio/high_max": 0.005842391517944634,
"clip_ratio/high_mean": 0.0034420291776768863,
"clip_ratio/low_mean": 0.0051097974355798215,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008551826613256708,
"entropy": 0.5001224614679813,
"epoch": 8e-05,
"grad_norm": 1.3377231359481812,
"kl": 0.006958273006603122,
"learning_rate": 2e-05,
"loss": -0.0079,
"step": 8,
"step_time": 30.119341139003154
},
{
"clip_ratio/high_max": 0.0020833334419876337,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.00046641789958812296,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015080846205819398,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9987.0,
"completions/max_terminated_length": 9987.0,
"completions/mean_length": 8235.9375,
"completions/mean_terminated_length": 8235.9375,
"completions/min_length": 2028.0,
"completions/min_terminated_length": 2028.0,
"entropy": 0.5665333420038223,
"epoch": 9e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.413293719291687,
"kl": 0.012357485480606556,
"learning_rate": 2.2857142857142858e-05,
"loss": -0.0089,
"num_tokens": 1456974.0,
"reward": -0.2786320447921753,
"reward_std": 0.4699662923812866,
"rewards/rollout_eval_reward_func/mean": 0.12322154641151428,
"rewards/rollout_eval_reward_func/std": 0.23254993557929993,
"rewards/rollout_reward_func/mean": -0.2786320447921753,
"rewards/rollout_reward_func/std": 0.510530948638916,
"sampling/importance_sampling_ratio/max": 1.6322839260101318,
"sampling/importance_sampling_ratio/mean": 0.9981738328933716,
"sampling/importance_sampling_ratio/min": 0.6440463662147522,
"sampling/sampling_logp_difference/max": 0.48998022079467773,
"sampling/sampling_logp_difference/mean": 0.02640429511666298,
"step": 9,
"step_time": 80.34681812299641
},
{
"clip_ratio/high_max": 0.028179825632832944,
"clip_ratio/high_mean": 0.01559113833354786,
"clip_ratio/low_mean": 0.01464278216008097,
"clip_ratio/low_min": 0.006223290809430182,
"clip_ratio/region_mean": 0.03023392061004415,
"entropy": 0.5607042815536261,
"epoch": 0.0001,
"grad_norm": 1.2342119216918945,
"kl": 0.03045007959008217,
"learning_rate": 2.5714285714285714e-05,
"loss": -0.0159,
"step": 10,
"step_time": 28.650263912999435
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0005208333604969084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005208333604969084,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10076.0,
"completions/max_terminated_length": 10076.0,
"completions/mean_length": 8311.21875,
"completions/mean_terminated_length": 8311.21875,
"completions/min_length": 1530.0,
"completions/min_terminated_length": 1530.0,
"entropy": 0.4887528121471405,
"epoch": 0.00011,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4407812356948853,
"kl": 0.04280303395353258,
"learning_rate": 2.857142857142857e-05,
"loss": -0.0508,
"num_tokens": 1751757.0,
"reward": -0.26280224323272705,
"reward_std": 0.4824950098991394,
"rewards/rollout_eval_reward_func/mean": 0.1091209352016449,
"rewards/rollout_eval_reward_func/std": 0.22141531109809875,
"rewards/rollout_reward_func/mean": -0.26280224323272705,
"rewards/rollout_reward_func/std": 0.4825066328048706,
"sampling/importance_sampling_ratio/max": 2.2060391902923584,
"sampling/importance_sampling_ratio/mean": 1.003042221069336,
"sampling/importance_sampling_ratio/min": 0.505047619342804,
"sampling/sampling_logp_difference/max": 0.79119873046875,
"sampling/sampling_logp_difference/mean": 0.03998423367738724,
"step": 11,
"step_time": 81.20211481799561
},
{
"clip_ratio/high_max": 0.031166458851657808,
"clip_ratio/high_mean": 0.01714572956552729,
"clip_ratio/low_mean": 0.018567851395346224,
"clip_ratio/low_min": 0.005885701393708587,
"clip_ratio/region_mean": 0.0357135811354965,
"entropy": 0.47410433553159237,
"epoch": 0.00012,
"grad_norm": 1.048365831375122,
"kl": 0.08051084214821458,
"learning_rate": 3.142857142857143e-05,
"loss": -0.0558,
"step": 12,
"step_time": 29.28374841400546
},
{
"clip_ratio/high_max": 0.001953125,
"clip_ratio/high_mean": 0.0009765625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10209.0,
"completions/max_terminated_length": 10209.0,
"completions/mean_length": 8161.71875,
"completions/mean_terminated_length": 8161.71875,
"completions/min_length": 1827.0,
"completions/min_terminated_length": 1827.0,
"entropy": 0.43679925985634327,
"epoch": 0.00013,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0560696125030518,
"kl": 0.09263498219661415,
"learning_rate": 3.428571428571429e-05,
"loss": 0.1042,
"num_tokens": 2042327.0,
"reward": -0.02590048871934414,
"reward_std": 0.6161512732505798,
"rewards/rollout_eval_reward_func/mean": 0.16006097197532654,
"rewards/rollout_eval_reward_func/std": 0.2864827811717987,
"rewards/rollout_reward_func/mean": -0.02590048871934414,
"rewards/rollout_reward_func/std": 0.6041470170021057,
"sampling/importance_sampling_ratio/max": 2.7582640647888184,
"sampling/importance_sampling_ratio/mean": 0.9981331825256348,
"sampling/importance_sampling_ratio/min": 0.361401230096817,
"sampling/sampling_logp_difference/max": 1.0177664756774902,
"sampling/sampling_logp_difference/mean": 0.06089622899889946,
"step": 13,
"step_time": 85.01218143400365
},
{
"clip_ratio/high_max": 0.012486383900977671,
"clip_ratio/high_mean": 0.007805692031979561,
"clip_ratio/low_mean": 0.030729168094694614,
"clip_ratio/low_min": 0.015625000465661287,
"clip_ratio/region_mean": 0.038534860184881836,
"entropy": 0.41658624820411205,
"epoch": 0.00014,
"grad_norm": 1.044942855834961,
"kl": 0.16313170175999403,
"learning_rate": 3.7142857142857143e-05,
"loss": 0.1002,
"step": 14,
"step_time": 28.990433916003894
},
{
"clip_ratio/high_max": 0.00596590933855623,
"clip_ratio/high_mean": 0.0035037880297750235,
"clip_ratio/low_mean": 0.0005122950533404946,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004016083083115518,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10134.0,
"completions/max_terminated_length": 10134.0,
"completions/mean_length": 8323.34375,
"completions/mean_terminated_length": 8323.34375,
"completions/min_length": 1934.0,
"completions/min_terminated_length": 1934.0,
"entropy": 0.44160761684179306,
"epoch": 0.00015,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2544862031936646,
"kl": 0.21658248733729124,
"learning_rate": 4e-05,
"loss": 0.1199,
"num_tokens": 2337711.0,
"reward": -0.0776321142911911,
"reward_std": 0.5812347531318665,
"rewards/rollout_eval_reward_func/mean": 0.14151422679424286,
"rewards/rollout_eval_reward_func/std": 0.2538794279098511,
"rewards/rollout_reward_func/mean": -0.0776321142911911,
"rewards/rollout_reward_func/std": 0.5845968723297119,
"sampling/importance_sampling_ratio/max": 1.8725090026855469,
"sampling/importance_sampling_ratio/mean": 0.9912927150726318,
"sampling/importance_sampling_ratio/min": 0.1565917581319809,
"sampling/sampling_logp_difference/max": 1.8541131019592285,
"sampling/sampling_logp_difference/mean": 0.06762713938951492,
"step": 15,
"step_time": 87.63701662399762
},
{
"clip_ratio/high_max": 0.033285985700786114,
"clip_ratio/high_mean": 0.02006899402476847,
"clip_ratio/low_mean": 0.017902423918712884,
"clip_ratio/low_min": 0.008303140406496823,
"clip_ratio/region_mean": 0.03797141805989668,
"entropy": 0.43832515366375446,
"epoch": 0.00016,
"grad_norm": 1.1862040758132935,
"kl": 0.2433762801811099,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.1137,
"step": 16,
"step_time": 30.26940473900322
},
{
"clip_ratio/high_max": 0.005208333604969084,
"clip_ratio/high_mean": 0.002604166802484542,
"clip_ratio/low_mean": 0.0005208333604969084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250001629814506,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10099.0,
"completions/max_terminated_length": 10099.0,
"completions/mean_length": 8931.625,
"completions/mean_terminated_length": 8931.625,
"completions/min_length": 2013.0,
"completions/min_terminated_length": 2013.0,
"entropy": 0.4058182891458273,
"epoch": 0.00017,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.206827998161316,
"kl": 0.1886005294509232,
"learning_rate": 4.5714285714285716e-05,
"loss": -0.0944,
"num_tokens": 2652765.0,
"reward": -0.0752231553196907,
"reward_std": 0.48041343688964844,
"rewards/rollout_eval_reward_func/mean": 0.10861280560493469,
"rewards/rollout_eval_reward_func/std": 0.2368263602256775,
"rewards/rollout_reward_func/mean": -0.0752231553196907,
"rewards/rollout_reward_func/std": 0.5091694593429565,
"sampling/importance_sampling_ratio/max": 2.2689177989959717,
"sampling/importance_sampling_ratio/mean": 1.0046234130859375,
"sampling/importance_sampling_ratio/min": 0.1846628040075302,
"sampling/sampling_logp_difference/max": 1.6892237663269043,
"sampling/sampling_logp_difference/mean": 0.06120520830154419,
"step": 17,
"step_time": 96.5394253049999
},
{
"clip_ratio/high_max": 0.0221070961561054,
"clip_ratio/high_mean": 0.013136881520040333,
"clip_ratio/low_mean": 0.005389189289417118,
"clip_ratio/low_min": 0.002066256827674806,
"clip_ratio/region_mean": 0.01852607080945745,
"entropy": 0.40752917528152466,
"epoch": 0.00018,
"grad_norm": 1.039859652519226,
"kl": 0.20007089478895068,
"learning_rate": 4.8571428571428576e-05,
"loss": -0.1064,
"step": 18,
"step_time": 29.607819763001316
},
{
"clip_ratio/high_max": 0.00424107164144516,
"clip_ratio/high_mean": 0.00212053582072258,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00212053582072258,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9868.0,
"completions/max_terminated_length": 9868.0,
"completions/mean_length": 7739.625,
"completions/mean_terminated_length": 7739.625,
"completions/min_length": 1494.0,
"completions/min_terminated_length": 1494.0,
"entropy": 0.3824189379811287,
"epoch": 0.00019,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1822205781936646,
"kl": 0.1706448094919324,
"learning_rate": 5.142857142857143e-05,
"loss": -0.1187,
"num_tokens": 2929452.0,
"reward": 0.1796756088733673,
"reward_std": 0.6716787815093994,
"rewards/rollout_eval_reward_func/mean": 0.25978150963783264,
"rewards/rollout_eval_reward_func/std": 0.31619328260421753,
"rewards/rollout_reward_func/mean": 0.1796756088733673,
"rewards/rollout_reward_func/std": 0.6625394821166992,
"sampling/importance_sampling_ratio/max": 1.8655627965927124,
"sampling/importance_sampling_ratio/mean": 1.0000479221343994,
"sampling/importance_sampling_ratio/min": 0.33482789993286133,
"sampling/sampling_logp_difference/max": 1.0941386222839355,
"sampling/sampling_logp_difference/mean": 0.04819408059120178,
"step": 19,
"step_time": 92.65558583299753
},
{
"clip_ratio/high_max": 0.030015080701559782,
"clip_ratio/high_mean": 0.018132540630176663,
"clip_ratio/low_mean": 0.03180725604761392,
"clip_ratio/low_min": 0.0052083334885537624,
"clip_ratio/region_mean": 0.049939796910621226,
"entropy": 0.3580914381891489,
"epoch": 0.0002,
"grad_norm": 1.152976155281067,
"kl": 0.2634436935186386,
"learning_rate": 5.428571428571428e-05,
"loss": -0.1272,
"step": 20,
"step_time": 28.27301450500272
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0020833335001952946,
"clip_ratio/low_min": 0.0010416667209938169,
"clip_ratio/region_mean": 0.0020833335001952946,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10426.0,
"completions/max_terminated_length": 10426.0,
"completions/mean_length": 7911.40625,
"completions/mean_terminated_length": 7911.40625,
"completions/min_length": 1040.0,
"completions/min_terminated_length": 1040.0,
"entropy": 0.3455618601292372,
"epoch": 0.00021,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9142677187919617,
"kl": 0.2354841867927462,
"learning_rate": 5.714285714285714e-05,
"loss": -0.0904,
"num_tokens": 3211621.0,
"reward": 0.09562171995639801,
"reward_std": 0.6017146706581116,
"rewards/rollout_eval_reward_func/mean": 0.1835619956254959,
"rewards/rollout_eval_reward_func/std": 0.2800058424472809,
"rewards/rollout_reward_func/mean": 0.09562171995639801,
"rewards/rollout_reward_func/std": 0.5979344248771667,
"sampling/importance_sampling_ratio/max": 1.7227435111999512,
"sampling/importance_sampling_ratio/mean": 0.9981924295425415,
"sampling/importance_sampling_ratio/min": 0.38243889808654785,
"sampling/sampling_logp_difference/max": 0.961186408996582,
"sampling/sampling_logp_difference/mean": 0.04361895099282265,
"step": 21,
"step_time": 94.40408171299714
},
{
"clip_ratio/high_max": 0.03222161578014493,
"clip_ratio/high_mean": 0.0181941413320601,
"clip_ratio/low_mean": 0.02708333428017795,
"clip_ratio/low_min": 0.0062500000931322575,
"clip_ratio/region_mean": 0.04527747584506869,
"entropy": 0.3229655371978879,
"epoch": 0.00022,
"grad_norm": 0.8647798895835876,
"kl": 0.21354854525998235,
"learning_rate": 6e-05,
"loss": -0.1008,
"step": 22,
"step_time": 30.11174104199381
},
{
"clip_ratio/high_max": 0.0011160714784637094,
"clip_ratio/high_mean": 0.0005580357392318547,
"clip_ratio/low_mean": 0.0010995370685122907,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016575728077441454,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10470.0,
"completions/max_terminated_length": 10470.0,
"completions/mean_length": 7568.375,
"completions/mean_terminated_length": 7568.375,
"completions/min_length": 2202.0,
"completions/min_terminated_length": 2202.0,
"entropy": 0.28525836300104856,
"epoch": 0.00023,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0814907550811768,
"kl": 0.35280791157856584,
"learning_rate": 6.285714285714286e-05,
"loss": 0.016,
"num_tokens": 3482436.0,
"reward": 0.2800288200378418,
"reward_std": 0.7106037139892578,
"rewards/rollout_eval_reward_func/mean": 0.33079269528388977,
"rewards/rollout_eval_reward_func/std": 0.3271085023880005,
"rewards/rollout_reward_func/mean": 0.2800288200378418,
"rewards/rollout_reward_func/std": 0.6996307373046875,
"sampling/importance_sampling_ratio/max": 1.6482936143875122,
"sampling/importance_sampling_ratio/mean": 1.0002542734146118,
"sampling/importance_sampling_ratio/min": 0.2758394777774811,
"sampling/sampling_logp_difference/max": 1.2879362106323242,
"sampling/sampling_logp_difference/mean": 0.0332026481628418,
"step": 23,
"step_time": 93.99063302500326
},
{
"clip_ratio/high_max": 0.01396139187272638,
"clip_ratio/high_mean": 0.007690923230256885,
"clip_ratio/low_mean": 0.01880787085974589,
"clip_ratio/low_min": 0.0031250001629814506,
"clip_ratio/region_mean": 0.02649879432283342,
"entropy": 0.2676102966070175,
"epoch": 0.00024,
"grad_norm": 0.8727543354034424,
"kl": 0.3772396189160645,
"learning_rate": 6.571428571428571e-05,
"loss": 0.0057,
"step": 24,
"step_time": 29.4178187339985
},
{
"clip_ratio/high_max": 0.004613095428794622,
"clip_ratio/high_mean": 0.002985895553138107,
"clip_ratio/low_mean": 0.0005208333604969084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0035067289136350155,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9569.0,
"completions/max_terminated_length": 9569.0,
"completions/mean_length": 7533.28125,
"completions/mean_terminated_length": 7533.28125,
"completions/min_length": 2449.0,
"completions/min_terminated_length": 2449.0,
"entropy": 0.2505391649901867,
"epoch": 0.00025,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.017386555671692,
"kl": 0.21523633878678083,
"learning_rate": 6.857142857142858e-05,
"loss": 0.0242,
"num_tokens": 3751699.0,
"reward": 0.3911706805229187,
"reward_std": 0.638326108455658,
"rewards/rollout_eval_reward_func/mean": 0.36318597197532654,
"rewards/rollout_eval_reward_func/std": 0.3184514343738556,
"rewards/rollout_reward_func/mean": 0.3911706805229187,
"rewards/rollout_reward_func/std": 0.6562069654464722,
"sampling/importance_sampling_ratio/max": 1.5404945611953735,
"sampling/importance_sampling_ratio/mean": 0.9984301328659058,
"sampling/importance_sampling_ratio/min": 0.4790920615196228,
"sampling/sampling_logp_difference/max": 0.7358624935150146,
"sampling/sampling_logp_difference/mean": 0.025531694293022156,
"step": 25,
"step_time": 92.37763964700025
},
{
"clip_ratio/high_max": 0.02074831852223724,
"clip_ratio/high_mean": 0.014075796061661094,
"clip_ratio/low_mean": 0.024038826406467706,
"clip_ratio/low_min": 0.004687500186264515,
"clip_ratio/region_mean": 0.0381146224681288,
"entropy": 0.24146342556923628,
"epoch": 0.00026,
"grad_norm": 1.08539617061615,
"kl": 0.242179695982486,
"learning_rate": 7.142857142857143e-05,
"loss": 0.0152,
"step": 26,
"step_time": 27.09601488199405
},
{
"clip_ratio/high_max": 0.004924242617562413,
"clip_ratio/high_mean": 0.0024621213087812066,
"clip_ratio/low_mean": 0.0015625000814907253,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004024621390271932,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9714.0,
"completions/max_terminated_length": 9714.0,
"completions/mean_length": 7341.125,
"completions/mean_terminated_length": 7341.125,
"completions/min_length": 834.0,
"completions/min_terminated_length": 834.0,
"entropy": 0.24662253353744745,
"epoch": 0.00027,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0926475524902344,
"kl": 0.201888975687325,
"learning_rate": 7.428571428571429e-05,
"loss": -0.0651,
"num_tokens": 4014835.0,
"reward": 0.26619410514831543,
"reward_std": 0.6366387009620667,
"rewards/rollout_eval_reward_func/mean": 0.31529471278190613,
"rewards/rollout_eval_reward_func/std": 0.3177616000175476,
"rewards/rollout_reward_func/mean": 0.26619410514831543,
"rewards/rollout_reward_func/std": 0.6645346879959106,
"sampling/importance_sampling_ratio/max": 1.7210402488708496,
"sampling/importance_sampling_ratio/mean": 0.9990845918655396,
"sampling/importance_sampling_ratio/min": 0.46208029985427856,
"sampling/sampling_logp_difference/max": 0.7720166444778442,
"sampling/sampling_logp_difference/mean": 0.024712545797228813,
"step": 27,
"step_time": 90.07543276499928
},
{
"clip_ratio/high_max": 0.033208509092219174,
"clip_ratio/high_mean": 0.018557379313278943,
"clip_ratio/low_mean": 0.035281969350762665,
"clip_ratio/low_min": 0.011458333698101342,
"clip_ratio/region_mean": 0.05383934878045693,
"entropy": 0.24193121027201414,
"epoch": 0.00028,
"grad_norm": 0.9876235127449036,
"kl": 0.26401366433128715,
"learning_rate": 7.714285714285715e-05,
"loss": -0.073,
"step": 28,
"step_time": 27.219164144002207
},
{
"clip_ratio/high_max": 0.0010775862028822303,
"clip_ratio/high_mean": 0.0005387931014411151,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005387931014411151,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9855.0,
"completions/max_terminated_length": 9855.0,
"completions/mean_length": 7361.875,
"completions/mean_terminated_length": 7361.875,
"completions/min_length": 842.0,
"completions/min_terminated_length": 842.0,
"entropy": 0.21860306337475777,
"epoch": 0.00029,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9292377233505249,
"kl": 0.20060417288914323,
"learning_rate": 8e-05,
"loss": -0.0977,
"num_tokens": 4278506.0,
"reward": 0.30670806765556335,
"reward_std": 0.652392566204071,
"rewards/rollout_eval_reward_func/mean": 0.3278709352016449,
"rewards/rollout_eval_reward_func/std": 0.31351709365844727,
"rewards/rollout_reward_func/mean": 0.30670806765556335,
"rewards/rollout_reward_func/std": 0.6815608143806458,
"sampling/importance_sampling_ratio/max": 1.4481010437011719,
"sampling/importance_sampling_ratio/mean": 1.0026426315307617,
"sampling/importance_sampling_ratio/min": 0.5693169832229614,
"sampling/sampling_logp_difference/max": 0.5633178949356079,
"sampling/sampling_logp_difference/mean": 0.01894025132060051,
"step": 29,
"step_time": 88.37378997200358
},
{
"clip_ratio/high_max": 0.02580322092399001,
"clip_ratio/high_mean": 0.015042814193293452,
"clip_ratio/low_mean": 0.015608090267051011,
"clip_ratio/low_min": 0.0020833334419876337,
"clip_ratio/region_mean": 0.030650904460344464,
"entropy": 0.2232473948970437,
"epoch": 0.0003,
"grad_norm": 0.6086679697036743,
"kl": 0.19415233470499516,
"learning_rate": 8.285714285714287e-05,
"loss": -0.1081,
"step": 30,
"step_time": 28.619991764000588
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0005208333604969084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005208333604969084,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10726.0,
"completions/max_terminated_length": 10726.0,
"completions/mean_length": 7164.65625,
"completions/mean_terminated_length": 7164.65625,
"completions/min_length": 470.0,
"completions/min_terminated_length": 470.0,
"entropy": 0.23761425912380219,
"epoch": 0.00031,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.050552487373352,
"kl": 0.25638002483174205,
"learning_rate": 8.571428571428571e-05,
"loss": 0.012,
"num_tokens": 4536097.0,
"reward": 0.3345087766647339,
"reward_std": 0.5485296249389648,
"rewards/rollout_eval_reward_func/mean": 0.3090701103210449,
"rewards/rollout_eval_reward_func/std": 0.32714226841926575,
"rewards/rollout_reward_func/mean": 0.3345087766647339,
"rewards/rollout_reward_func/std": 0.6012357473373413,
"sampling/importance_sampling_ratio/max": 1.438549518585205,
"sampling/importance_sampling_ratio/mean": 1.0011037588119507,
"sampling/importance_sampling_ratio/min": 0.6349728107452393,
"sampling/sampling_logp_difference/max": 0.45417308807373047,
"sampling/sampling_logp_difference/mean": 0.015337169170379639,
"step": 31,
"step_time": 92.49027231299806
},
{
"clip_ratio/high_max": 0.03391559107694775,
"clip_ratio/high_mean": 0.018867517996113747,
"clip_ratio/low_mean": 0.044338769221212715,
"clip_ratio/low_min": 0.008333333535119891,
"clip_ratio/region_mean": 0.06320628756657243,
"entropy": 0.22916866652667522,
"epoch": 0.00032,
"grad_norm": 1.028586745262146,
"kl": 0.3105860697105527,
"learning_rate": 8.857142857142857e-05,
"loss": 0.0055,
"step": 32,
"step_time": 29.399824877003994
},
{
"clip_ratio/high_max": 0.0024519230937585235,
"clip_ratio/high_mean": 0.0012259615468792617,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0012259615468792617,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10128.0,
"completions/max_terminated_length": 10128.0,
"completions/mean_length": 7357.46875,
"completions/mean_terminated_length": 7357.46875,
"completions/min_length": 1917.0,
"completions/min_terminated_length": 1917.0,
"entropy": 0.2557551637291908,
"epoch": 0.00033,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9717881083488464,
"kl": 0.2046954189427197,
"learning_rate": 9.142857142857143e-05,
"loss": 0.0245,
"num_tokens": 4799621.0,
"reward": 0.35216301679611206,
"reward_std": 0.6164546608924866,
"rewards/rollout_eval_reward_func/mean": 0.3365091383457184,
"rewards/rollout_eval_reward_func/std": 0.3354848027229309,
"rewards/rollout_reward_func/mean": 0.35216301679611206,
"rewards/rollout_reward_func/std": 0.6309141516685486,
"sampling/importance_sampling_ratio/max": 1.333243727684021,
"sampling/importance_sampling_ratio/mean": 1.0005223751068115,
"sampling/importance_sampling_ratio/min": 0.7339702248573303,
"sampling/sampling_logp_difference/max": 0.30928683280944824,
"sampling/sampling_logp_difference/mean": 0.014704002998769283,
"step": 33,
"step_time": 89.53553034700417
},
{
"clip_ratio/high_max": 0.01991061063017696,
"clip_ratio/high_mean": 0.011966300604399294,
"clip_ratio/low_mean": 0.02272569522028789,
"clip_ratio/low_min": 0.009722222457639873,
"clip_ratio/region_mean": 0.03469199570827186,
"entropy": 0.2428069869056344,
"epoch": 0.00034,
"grad_norm": 0.685612142086029,
"kl": 0.2513351505622268,
"learning_rate": 9.428571428571429e-05,
"loss": 0.0129,
"step": 34,
"step_time": 28.25809028400181
},
{
"clip_ratio/high_max": 0.0021990741370245814,
"clip_ratio/high_mean": 0.0010995370685122907,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0021412037895061076,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9665.0,
"completions/max_terminated_length": 9665.0,
"completions/mean_length": 8000.09375,
"completions/mean_terminated_length": 8000.09375,
"completions/min_length": 4295.0,
"completions/min_terminated_length": 4295.0,
"entropy": 0.2354184165596962,
"epoch": 0.00035,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.040405035018921,
"kl": 0.1770010399632156,
"learning_rate": 9.714285714285715e-05,
"loss": 0.1517,
"num_tokens": 5084103.0,
"reward": 0.3385156989097595,
"reward_std": 0.5189785957336426,
"rewards/rollout_eval_reward_func/mean": 0.23996442556381226,
"rewards/rollout_eval_reward_func/std": 0.31991085410118103,
"rewards/rollout_reward_func/mean": 0.3385156989097595,
"rewards/rollout_reward_func/std": 0.5693588852882385,
"sampling/importance_sampling_ratio/max": 1.4071576595306396,
"sampling/importance_sampling_ratio/mean": 0.9996304512023926,
"sampling/importance_sampling_ratio/min": 0.5387703776359558,
"sampling/sampling_logp_difference/max": 0.6184659004211426,
"sampling/sampling_logp_difference/mean": 0.015029089525341988,
"step": 35,
"step_time": 95.05921310200392
},
{
"clip_ratio/high_max": 0.026263557723723352,
"clip_ratio/high_mean": 0.014173445466440171,
"clip_ratio/low_mean": 0.02787990286014974,
"clip_ratio/low_min": 0.007291667046956718,
"clip_ratio/region_mean": 0.04205334832658991,
"entropy": 0.21858789399266243,
"epoch": 0.00036,
"grad_norm": 1.0455042123794556,
"kl": 0.2051441869698465,
"learning_rate": 0.0001,
"loss": 0.1403,
"step": 36,
"step_time": 27.85193802100366
},
{
"clip_ratio/high_max": 0.007164939888752997,
"clip_ratio/high_mean": 0.0035824699443764985,
"clip_ratio/low_mean": 0.0015625000232830644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005144969967659563,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9514.0,
"completions/max_terminated_length": 9514.0,
"completions/mean_length": 6029.25,
"completions/mean_terminated_length": 6029.25,
"completions/min_length": 1061.0,
"completions/min_terminated_length": 1061.0,
"entropy": 0.21716525312513113,
"epoch": 0.00037,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8604521751403809,
"kl": 0.23097522975876927,
"learning_rate": 9.999736485702831e-05,
"loss": -0.0709,
"num_tokens": 5305345.0,
"reward": 0.41453179717063904,
"reward_std": 0.7797224521636963,
"rewards/rollout_eval_reward_func/mean": 0.4568089246749878,
"rewards/rollout_eval_reward_func/std": 0.28734299540519714,
"rewards/rollout_reward_func/mean": 0.41453179717063904,
"rewards/rollout_reward_func/std": 0.755694568157196,
"sampling/importance_sampling_ratio/max": 1.4738141298294067,
"sampling/importance_sampling_ratio/mean": 1.000828742980957,
"sampling/importance_sampling_ratio/min": 0.7324953079223633,
"sampling/sampling_logp_difference/max": 0.3878536820411682,
"sampling/sampling_logp_difference/mean": 0.013184964656829834,
"step": 37,
"step_time": 76.87407001600332
},
{
"clip_ratio/high_max": 0.04774210066534579,
"clip_ratio/high_mean": 0.02752261853311211,
"clip_ratio/low_mean": 0.03158482233993709,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.059107440523803234,
"entropy": 0.21499714627861977,
"epoch": 0.00038,
"grad_norm": 1.026845932006836,
"kl": 0.3676267918199301,
"learning_rate": 9.998945979845876e-05,
"loss": -0.0694,
"step": 38,
"step_time": 27.58343887600313
},
{
"clip_ratio/high_max": 0.006285919691435993,
"clip_ratio/high_mean": 0.0031429598457179964,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004184626566711813,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9353.0,
"completions/max_terminated_length": 9353.0,
"completions/mean_length": 6221.78125,
"completions/mean_terminated_length": 6221.78125,
"completions/min_length": 1175.0,
"completions/min_terminated_length": 1175.0,
"entropy": 0.21314978785812855,
"epoch": 0.00039,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1063776016235352,
"kl": 0.28443425707519054,
"learning_rate": 9.997628593527586e-05,
"loss": 0.1657,
"num_tokens": 5533203.0,
"reward": 0.5931290984153748,
"reward_std": 0.5068180561065674,
"rewards/rollout_eval_reward_func/mean": 0.4369918704032898,
"rewards/rollout_eval_reward_func/std": 0.2919425666332245,
"rewards/rollout_reward_func/mean": 0.5931290984153748,
"rewards/rollout_reward_func/std": 0.6152276396751404,
"sampling/importance_sampling_ratio/max": 1.4768017530441284,
"sampling/importance_sampling_ratio/mean": 0.9989122152328491,
"sampling/importance_sampling_ratio/min": 0.7442160248756409,
"sampling/sampling_logp_difference/max": 0.3898787498474121,
"sampling/sampling_logp_difference/mean": 0.011076296679675579,
"step": 39,
"step_time": 80.26773473300273
},
{
"clip_ratio/high_max": 0.03581550612580031,
"clip_ratio/high_mean": 0.021467003040015697,
"clip_ratio/low_mean": 0.019476010755170137,
"clip_ratio/low_min": 0.0031250000465661287,
"clip_ratio/region_mean": 0.04094301396980882,
"entropy": 0.2001811731606722,
"epoch": 0.0004,
"grad_norm": 0.8571550250053406,
"kl": 0.39517259504646063,
"learning_rate": 9.995784511894694e-05,
"loss": 0.1561,
"step": 40,
"step_time": 26.113719172002675
},
{
"clip_ratio/high_max": 0.0027173913549631834,
"clip_ratio/high_mean": 0.0013586956774815917,
"clip_ratio/low_mean": 0.003238224715460092,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0045969203929416835,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9876.0,
"completions/max_terminated_length": 9876.0,
"completions/mean_length": 7216.5625,
"completions/mean_terminated_length": 7216.5625,
"completions/min_length": 1879.0,
"completions/min_terminated_length": 1879.0,
"entropy": 0.2681358586996794,
"epoch": 0.00041,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3309671878814697,
"kl": 0.2410402470268309,
"learning_rate": 9.993413994116206e-05,
"loss": 0.1903,
"num_tokens": 5792478.0,
"reward": 0.471214234828949,
"reward_std": 0.5625734329223633,
"rewards/rollout_eval_reward_func/mean": 0.3643292784690857,
"rewards/rollout_eval_reward_func/std": 0.34053289890289307,
"rewards/rollout_reward_func/mean": 0.471214234828949,
"rewards/rollout_reward_func/std": 0.6072424650192261,
"sampling/importance_sampling_ratio/max": 1.8356192111968994,
"sampling/importance_sampling_ratio/mean": 1.0007987022399902,
"sampling/importance_sampling_ratio/min": 0.4829617738723755,
"sampling/sampling_logp_difference/max": 0.7278177738189697,
"sampling/sampling_logp_difference/mean": 0.014709306880831718,
"step": 41,
"step_time": 87.47009326799707
},
{
"clip_ratio/high_max": 0.034506134572438896,
"clip_ratio/high_mean": 0.01836913888109848,
"clip_ratio/low_mean": 0.03956068912521005,
"clip_ratio/low_min": 0.012500000651925802,
"clip_ratio/region_mean": 0.05792982783168554,
"entropy": 0.27205855678766966,
"epoch": 0.00042,
"grad_norm": 1.0188957452774048,
"kl": 0.30527770798653364,
"learning_rate": 9.990517373346957e-05,
"loss": 0.1841,
"step": 42,
"step_time": 27.952364619004584
},
{
"clip_ratio/high_max": 0.005300949211232364,
"clip_ratio/high_mean": 0.002650474605616182,
"clip_ratio/low_mean": 0.0015625000814907253,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004212974687106907,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10094.0,
"completions/max_terminated_length": 10094.0,
"completions/mean_length": 6369.84375,
"completions/mean_terminated_length": 6369.84375,
"completions/min_length": 701.0,
"completions/min_terminated_length": 701.0,
"entropy": 0.24548510648310184,
"epoch": 0.00043,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8371849656105042,
"kl": 0.22607734380289912,
"learning_rate": 9.98709505668081e-05,
"loss": -0.1383,
"num_tokens": 6024570.0,
"reward": 0.5083565711975098,
"reward_std": 0.7129669785499573,
"rewards/rollout_eval_reward_func/mean": 0.4181910753250122,
"rewards/rollout_eval_reward_func/std": 0.3106958866119385,
"rewards/rollout_reward_func/mean": 0.5083565711975098,
"rewards/rollout_reward_func/std": 0.679851770401001,
"sampling/importance_sampling_ratio/max": 1.6035348176956177,
"sampling/importance_sampling_ratio/mean": 1.0009479522705078,
"sampling/importance_sampling_ratio/min": 0.7113155722618103,
"sampling/sampling_logp_difference/max": 0.4722104072570801,
"sampling/sampling_logp_difference/mean": 0.010827964171767235,
"step": 43,
"step_time": 81.8608712560017
},
{
"clip_ratio/high_max": 0.022805775748565793,
"clip_ratio/high_mean": 0.01218413794413209,
"clip_ratio/low_mean": 0.026488096278626472,
"clip_ratio/low_min": 0.0020833334419876337,
"clip_ratio/region_mean": 0.03867223463021219,
"entropy": 0.2484031356871128,
"epoch": 0.00044,
"grad_norm": 0.6352972388267517,
"kl": 0.24903920874930918,
"learning_rate": 9.983147525093428e-05,
"loss": -0.1456,
"step": 44,
"step_time": 28.312056484001005
},
{
"clip_ratio/high_max": 0.0020833334419876337,
"clip_ratio/high_mean": 0.0010416667209938169,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020833334419876337,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10030.0,
"completions/max_terminated_length": 10030.0,
"completions/mean_length": 7470.40625,
"completions/mean_terminated_length": 7470.40625,
"completions/min_length": 3212.0,
"completions/min_terminated_length": 3212.0,
"entropy": 0.26859680097550154,
"epoch": 0.00045,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9950742125511169,
"kl": 0.2715269709005952,
"learning_rate": 9.978675333374685e-05,
"loss": 0.1354,
"num_tokens": 6292193.0,
"reward": 0.31536591053009033,
"reward_std": 0.626213550567627,
"rewards/rollout_eval_reward_func/mean": 0.2950965166091919,
"rewards/rollout_eval_reward_func/std": 0.3288768529891968,
"rewards/rollout_reward_func/mean": 0.31536591053009033,
"rewards/rollout_reward_func/std": 0.6272794604301453,
"sampling/importance_sampling_ratio/max": 1.2761257886886597,
"sampling/importance_sampling_ratio/mean": 0.9995177388191223,
"sampling/importance_sampling_ratio/min": 0.6398259401321411,
"sampling/sampling_logp_difference/max": 0.44655919075012207,
"sampling/sampling_logp_difference/mean": 0.01289924792945385,
"step": 45,
"step_time": 89.98842330299703
},
{
"clip_ratio/high_max": 0.029475471819750965,
"clip_ratio/high_mean": 0.017039196158293635,
"clip_ratio/low_mean": 0.035884891636669636,
"clip_ratio/low_min": 0.014583333861082792,
"clip_ratio/region_mean": 0.05292408773675561,
"entropy": 0.25596251618117094,
"epoch": 0.00046,
"grad_norm": 1.0492225885391235,
"kl": 0.4555607410147786,
"learning_rate": 9.973679110050689e-05,
"loss": 0.1236,
"step": 46,
"step_time": 28.10059149600238
},
{
"clip_ratio/high_max": 0.005558473523706198,
"clip_ratio/high_mean": 0.002779236761853099,
"clip_ratio/low_mean": 0.0031250001629814506,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005904236924834549,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10171.0,
"completions/max_terminated_length": 10171.0,
"completions/mean_length": 7720.34375,
"completions/mean_terminated_length": 7720.34375,
"completions/min_length": 2255.0,
"completions/min_terminated_length": 2255.0,
"entropy": 0.21848125476390123,
"epoch": 0.00047,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9580699801445007,
"kl": 0.2126072864048183,
"learning_rate": 9.968159557295458e-05,
"loss": 0.2391,
"num_tokens": 6567972.0,
"reward": 0.585047721862793,
"reward_std": 0.4849390387535095,
"rewards/rollout_eval_reward_func/mean": 0.35200709104537964,
"rewards/rollout_eval_reward_func/std": 0.33855971693992615,
"rewards/rollout_reward_func/mean": 0.585047721862793,
"rewards/rollout_reward_func/std": 0.4694308936595917,
"sampling/importance_sampling_ratio/max": 1.3900582790374756,
"sampling/importance_sampling_ratio/mean": 1.0005149841308594,
"sampling/importance_sampling_ratio/min": 0.5463369488716125,
"sampling/sampling_logp_difference/max": 0.6045193672180176,
"sampling/sampling_logp_difference/mean": 0.012745920568704605,
"step": 47,
"step_time": 91.15270540599704
},
{
"clip_ratio/high_max": 0.03133936191443354,
"clip_ratio/high_mean": 0.017232181096915156,
"clip_ratio/low_mean": 0.04218750132713467,
"clip_ratio/low_min": 0.01145833358168602,
"clip_ratio/region_mean": 0.059419682365842164,
"entropy": 0.23045554850250483,
"epoch": 0.00048,
"grad_norm": 1.2474925518035889,
"kl": 0.18294932693243027,
"learning_rate": 9.962117450832225e-05,
"loss": 0.238,
"step": 48,
"step_time": 29.046616760999314
},
{
"clip_ratio/high_max": 0.006842764443717897,
"clip_ratio/high_mean": 0.0034213822218589485,
"clip_ratio/low_mean": 0.0015625000232830644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004983882245142013,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10017.0,
"completions/max_terminated_length": 10017.0,
"completions/mean_length": 7918.40625,
"completions/mean_terminated_length": 7918.40625,
"completions/min_length": 1876.0,
"completions/min_terminated_length": 1876.0,
"entropy": 0.24847039952874184,
"epoch": 0.00049,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1504110097885132,
"kl": 0.3213502997532487,
"learning_rate": 9.955553639824423e-05,
"loss": 0.1906,
"num_tokens": 6849638.0,
"reward": 0.39189645648002625,
"reward_std": 0.5209037065505981,
"rewards/rollout_eval_reward_func/mean": 0.2815040647983551,
"rewards/rollout_eval_reward_func/std": 0.332853227853775,
"rewards/rollout_reward_func/mean": 0.39189645648002625,
"rewards/rollout_reward_func/std": 0.5881980061531067,
"sampling/importance_sampling_ratio/max": 1.4030216932296753,
"sampling/importance_sampling_ratio/mean": 0.9992052316665649,
"sampling/importance_sampling_ratio/min": 0.6490213871002197,
"sampling/sampling_logp_difference/max": 0.43228960037231445,
"sampling/sampling_logp_difference/mean": 0.011766092851758003,
"step": 49,
"step_time": 91.98302743600289
},
{
"clip_ratio/high_max": 0.030021664802916348,
"clip_ratio/high_mean": 0.01896916568512097,
"clip_ratio/low_mean": 0.02840909146470949,
"clip_ratio/low_min": 0.0031250001629814506,
"clip_ratio/region_mean": 0.04737825732445344,
"entropy": 0.22083801217377186,
"epoch": 0.0005,
"grad_norm": 1.493245005607605,
"kl": 0.6161252139136195,
"learning_rate": 9.948469046756344e-05,
"loss": 0.1882,
"step": 50,
"step_time": 29.706524382998396
},
{
"clip_ratio/high_max": 0.007615459966473281,
"clip_ratio/high_mean": 0.0038077299832366407,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0048493967042304575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10057.0,
"completions/max_terminated_length": 10057.0,
"completions/mean_length": 7061.03125,
"completions/mean_terminated_length": 7061.03125,
"completions/min_length": 2525.0,
"completions/min_terminated_length": 2525.0,
"entropy": 0.24380221962928772,
"epoch": 0.00051,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1951247453689575,
"kl": 0.28687382210046053,
"learning_rate": 9.940864667303489e-05,
"loss": 0.1425,
"num_tokens": 7103728.0,
"reward": 0.406146377325058,
"reward_std": 0.6755715608596802,
"rewards/rollout_eval_reward_func/mean": 0.3859247863292694,
"rewards/rollout_eval_reward_func/std": 0.33643871545791626,
"rewards/rollout_reward_func/mean": 0.406146377325058,
"rewards/rollout_reward_func/std": 0.6774359345436096,
"sampling/importance_sampling_ratio/max": 1.367674469947815,
"sampling/importance_sampling_ratio/mean": 0.9991032481193542,
"sampling/importance_sampling_ratio/min": 0.6542518734931946,
"sampling/sampling_logp_difference/max": 0.4242628812789917,
"sampling/sampling_logp_difference/mean": 0.012621527537703514,
"step": 51,
"step_time": 85.52853098199739
},
{
"clip_ratio/high_max": 0.023708798456937075,
"clip_ratio/high_mean": 0.015155438333749771,
"clip_ratio/low_mean": 0.02644535672152415,
"clip_ratio/low_min": 0.009695513173937798,
"clip_ratio/region_mean": 0.04160079546272755,
"entropy": 0.24589570611715317,
"epoch": 0.00052,
"grad_norm": 0.6901561617851257,
"kl": 0.2809536149725318,
"learning_rate": 9.932741570192633e-05,
"loss": 0.1278,
"step": 52,
"step_time": 28.923457664002854
},
{
"clip_ratio/high_max": 0.0011160714784637094,
"clip_ratio/high_mean": 0.0005580357392318547,
"clip_ratio/low_mean": 0.0005208333604969084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001078869099728763,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10169.0,
"completions/max_terminated_length": 10169.0,
"completions/mean_length": 7814.28125,
"completions/mean_terminated_length": 7814.28125,
"completions/min_length": 1989.0,
"completions/min_terminated_length": 1989.0,
"entropy": 0.21275948453694582,
"epoch": 0.00053,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7513790130615234,
"kl": 0.23512054793536663,
"learning_rate": 9.924100897051629e-05,
"loss": 0.1945,
"num_tokens": 7382261.0,
"reward": 0.42753756046295166,
"reward_std": 0.49785035848617554,
"rewards/rollout_eval_reward_func/mean": 0.26969003677368164,
"rewards/rollout_eval_reward_func/std": 0.3341839611530304,
"rewards/rollout_reward_func/mean": 0.42753756046295166,
"rewards/rollout_reward_func/std": 0.49307680130004883,
"sampling/importance_sampling_ratio/max": 1.3325144052505493,
"sampling/importance_sampling_ratio/mean": 0.9995752573013306,
"sampling/importance_sampling_ratio/min": 0.6147154569625854,
"sampling/sampling_logp_difference/max": 0.48659586906433105,
"sampling/sampling_logp_difference/mean": 0.010477245785295963,
"step": 53,
"step_time": 89.77598898800352
},
{
"clip_ratio/high_max": 0.014756215270608664,
"clip_ratio/high_mean": 0.007378107635304332,
"clip_ratio/low_mean": 0.026041667733807117,
"clip_ratio/low_min": 0.008333333651535213,
"clip_ratio/region_mean": 0.03341977560194209,
"entropy": 0.20128578413277864,
"epoch": 0.00054,
"grad_norm": 0.570249080657959,
"kl": 0.24723996873944998,
"learning_rate": 9.914943862248966e-05,
"loss": 0.1836,
"step": 54,
"step_time": 28.66781206799169
},
{
"clip_ratio/high_max": 0.005409664008766413,
"clip_ratio/high_mean": 0.0027048320043832064,
"clip_ratio/low_mean": 0.0005208333604969084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003225665364880115,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9612.0,
"completions/max_terminated_length": 9612.0,
"completions/mean_length": 7466.40625,
"completions/mean_terminated_length": 7466.40625,
"completions/min_length": 897.0,
"completions/min_terminated_length": 897.0,
"entropy": 0.2242852784693241,
"epoch": 0.00055,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7776551246643066,
"kl": 0.22325131320394576,
"learning_rate": 9.905271752723088e-05,
"loss": 0.0206,
"num_tokens": 7648812.0,
"reward": 0.40199464559555054,
"reward_std": 0.5598407983779907,
"rewards/rollout_eval_reward_func/mean": 0.3375253975391388,
"rewards/rollout_eval_reward_func/std": 0.351279616355896,
"rewards/rollout_reward_func/mean": 0.40199464559555054,
"rewards/rollout_reward_func/std": 0.5975609421730042,
"sampling/importance_sampling_ratio/max": 1.317135214805603,
"sampling/importance_sampling_ratio/mean": 0.9976714849472046,
"sampling/importance_sampling_ratio/min": 0.6417545676231384,
"sampling/sampling_logp_difference/max": 0.4435492753982544,
"sampling/sampling_logp_difference/mean": 0.012365585193037987,
"step": 55,
"step_time": 90.48158546899867
},
{
"clip_ratio/high_max": 0.02967093954794109,
"clip_ratio/high_mean": 0.01639796979725361,
"clip_ratio/low_mean": 0.017361111822538078,
"clip_ratio/low_min": 0.0031250001629814506,
"clip_ratio/region_mean": 0.03375908185262233,
"entropy": 0.2281673550605774,
"epoch": 0.00056,
"grad_norm": 0.48372626304626465,
"kl": 0.23605143558233976,
"learning_rate": 9.895085927801542e-05,
"loss": 0.0086,
"step": 56,
"step_time": 27.291444884000157
},
{
"clip_ratio/high_max": 0.003557769814506173,
"clip_ratio/high_mean": 0.0017788849072530866,
"clip_ratio/low_mean": 0.0015625000814907253,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003341384930536151,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10322.0,
"completions/max_terminated_length": 10322.0,
"completions/mean_length": 6645.3125,
"completions/mean_terminated_length": 6645.3125,
"completions/min_length": 1995.0,
"completions/min_terminated_length": 1995.0,
"entropy": 0.22339679207652807,
"epoch": 0.00057,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7947802543640137,
"kl": 0.33472199738025665,
"learning_rate": 9.884387819009922e-05,
"loss": 0.0286,
"num_tokens": 7889241.0,
"reward": 0.3272937536239624,
"reward_std": 0.7356898784637451,
"rewards/rollout_eval_reward_func/mean": 0.38528963923454285,
"rewards/rollout_eval_reward_func/std": 0.3158987760543823,
"rewards/rollout_reward_func/mean": 0.3272937536239624,
"rewards/rollout_reward_func/std": 0.7287615537643433,
"sampling/importance_sampling_ratio/max": 1.519856333732605,
"sampling/importance_sampling_ratio/mean": 1.0008394718170166,
"sampling/importance_sampling_ratio/min": 0.6888355612754822,
"sampling/sampling_logp_difference/max": 0.41861581802368164,
"sampling/sampling_logp_difference/mean": 0.01188460923731327,
"step": 57,
"step_time": 83.6079965079989
},
{
"clip_ratio/high_max": 0.02337649872060865,
"clip_ratio/high_mean": 0.012729916197713464,
"clip_ratio/low_mean": 0.03550771565642208,
"clip_ratio/low_min": 0.013886852888390422,
"clip_ratio/region_mean": 0.048237632028758526,
"entropy": 0.23247116059064865,
"epoch": 0.00058,
"grad_norm": 0.6895915269851685,
"kl": 0.30278117302805185,
"learning_rate": 9.873178929870695e-05,
"loss": 0.0178,
"step": 58,
"step_time": 29.01562165299947
},
{
"clip_ratio/high_max": 0.006458333344198763,
"clip_ratio/high_mean": 0.00375000003259629,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004791666753590107,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10121.0,
"completions/max_terminated_length": 10121.0,
"completions/mean_length": 7354.9375,
"completions/mean_terminated_length": 7354.9375,
"completions/min_length": 1114.0,
"completions/min_terminated_length": 1114.0,
"entropy": 0.2855970785021782,
"epoch": 0.00059,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1581138372421265,
"kl": 0.30998079385608435,
"learning_rate": 9.86146083569188e-05,
"loss": -0.077,
"num_tokens": 8152533.0,
"reward": 0.12930195033550262,
"reward_std": 0.6660091876983643,
"rewards/rollout_eval_reward_func/mean": 0.33841466903686523,
"rewards/rollout_eval_reward_func/std": 0.3268774151802063,
"rewards/rollout_reward_func/mean": 0.12930195033550262,
"rewards/rollout_reward_func/std": 0.7711123824119568,
"sampling/importance_sampling_ratio/max": 1.4381568431854248,
"sampling/importance_sampling_ratio/mean": 0.9980136156082153,
"sampling/importance_sampling_ratio/min": 0.7020198106765747,
"sampling/sampling_logp_difference/max": 0.36336231231689453,
"sampling/sampling_logp_difference/mean": 0.016959059983491898,
"step": 59,
"step_time": 87.87077508199764
},
{
"clip_ratio/high_max": 0.048061754438094795,
"clip_ratio/high_mean": 0.031483913655392826,
"clip_ratio/low_mean": 0.04418836906552315,
"clip_ratio/low_min": 0.007291666814126074,
"clip_ratio/region_mean": 0.07567228260450065,
"entropy": 0.26963882334530354,
"epoch": 0.0006,
"grad_norm": 1.0022964477539062,
"kl": 0.30027929320931435,
"learning_rate": 9.84923518334567e-05,
"loss": -0.0828,
"step": 60,
"step_time": 28.71259851099967
},
{
"clip_ratio/high_max": 0.01005121401976794,
"clip_ratio/high_mean": 0.005546440428588539,
"clip_ratio/low_mean": 0.0020026409183628857,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007549081346951425,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10379.0,
"completions/max_terminated_length": 10379.0,
"completions/mean_length": 6955.875,
"completions/mean_terminated_length": 6955.875,
"completions/min_length": 2081.0,
"completions/min_terminated_length": 2081.0,
"entropy": 0.23451983137056231,
"epoch": 0.00061,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1702102422714233,
"kl": 0.28609952982515097,
"learning_rate": 9.83650369103696e-05,
"loss": 0.0631,
"num_tokens": 8403186.0,
"reward": 0.3940112888813019,
"reward_std": 0.67255699634552,
"rewards/rollout_eval_reward_func/mean": 0.3715701103210449,
"rewards/rollout_eval_reward_func/std": 0.3261474072933197,
"rewards/rollout_reward_func/mean": 0.3940112888813019,
"rewards/rollout_reward_func/std": 0.6762000322341919,
"sampling/importance_sampling_ratio/max": 1.3093942403793335,
"sampling/importance_sampling_ratio/mean": 1.0008020401000977,
"sampling/importance_sampling_ratio/min": 0.5961512923240662,
"sampling/sampling_logp_difference/max": 0.5172607898712158,
"sampling/sampling_logp_difference/mean": 0.014042183756828308,
"step": 61,
"step_time": 86.51144317899707
},
{
"clip_ratio/high_max": 0.051156656933017075,
"clip_ratio/high_mean": 0.03779221937293187,
"clip_ratio/low_mean": 0.05211732583120465,
"clip_ratio/low_min": 0.024354460649192333,
"clip_ratio/region_mean": 0.08990954549517483,
"entropy": 0.21548824943602085,
"epoch": 0.00062,
"grad_norm": 1.1680642366409302,
"kl": 0.5453370595350862,
"learning_rate": 9.823268148061883e-05,
"loss": 0.0666,
"step": 62,
"step_time": 28.28093677799916
},
{
"clip_ratio/high_max": 0.009642903693020344,
"clip_ratio/high_mean": 0.004821451846510172,
"clip_ratio/low_mean": 0.0010416667209938169,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005863118567503989,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9611.0,
"completions/max_terminated_length": 9611.0,
"completions/mean_length": 5474.5,
"completions/mean_terminated_length": 5474.5,
"completions/min_length": 1264.0,
"completions/min_terminated_length": 1264.0,
"entropy": 0.20994199626147747,
"epoch": 0.00063,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5162609815597534,
"kl": 0.3376044826582074,
"learning_rate": 9.809530414556335e-05,
"loss": 0.1386,
"num_tokens": 8606212.0,
"reward": 0.6158473491668701,
"reward_std": 0.705132782459259,
"rewards/rollout_eval_reward_func/mean": 0.5119410753250122,
"rewards/rollout_eval_reward_func/std": 0.2654803693294525,
"rewards/rollout_reward_func/mean": 0.6158473491668701,
"rewards/rollout_reward_func/std": 0.6767383813858032,
"sampling/importance_sampling_ratio/max": 1.9123412370681763,
"sampling/importance_sampling_ratio/mean": 0.9994137287139893,
"sampling/importance_sampling_ratio/min": 0.6006231904029846,
"sampling/sampling_logp_difference/max": 0.6483283042907715,
"sampling/sampling_logp_difference/mean": 0.015111252665519714,
"step": 63,
"step_time": 74.64896667399807
},
{
"clip_ratio/high_max": 0.05132549628615379,
"clip_ratio/high_mean": 0.030718339723534882,
"clip_ratio/low_mean": 0.028882576967589557,
"clip_ratio/low_min": 0.0031250000465661287,
"clip_ratio/region_mean": 0.05960091657470912,
"entropy": 0.20065013086423278,
"epoch": 0.00064,
"grad_norm": 1.244667649269104,
"kl": 0.453593029640615,
"learning_rate": 9.79529242123455e-05,
"loss": 0.1234,
"step": 64,
"step_time": 24.8986287849948
},
{
"clip_ratio/high_max": 0.0077537596225738525,
"clip_ratio/high_mean": 0.0038768798112869263,
"clip_ratio/low_mean": 0.0005208333604969084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004397713171783835,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10299.0,
"completions/max_terminated_length": 10299.0,
"completions/mean_length": 6500.03125,
"completions/mean_terminated_length": 6500.03125,
"completions/min_length": 1712.0,
"completions/min_terminated_length": 1712.0,
"entropy": 0.14482268318533897,
"epoch": 0.00065,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9947667121887207,
"kl": 0.2627185983583331,
"learning_rate": 9.780556169117757e-05,
"loss": 0.0665,
"num_tokens": 8841902.0,
"reward": 0.6678237915039062,
"reward_std": 0.5866862535476685,
"rewards/rollout_eval_reward_func/mean": 0.5013973712921143,
"rewards/rollout_eval_reward_func/std": 0.27832266688346863,
"rewards/rollout_reward_func/mean": 0.6678237915039062,
"rewards/rollout_reward_func/std": 0.5921808481216431,
"sampling/importance_sampling_ratio/max": 1.4597694873809814,
"sampling/importance_sampling_ratio/mean": 0.99915611743927,
"sampling/importance_sampling_ratio/min": 0.27695003151893616,
"sampling/sampling_logp_difference/max": 1.2839181423187256,
"sampling/sampling_logp_difference/mean": 0.010844534263014793,
"step": 65,
"step_time": 80.86000475000401
},
{
"clip_ratio/high_max": 0.025044884881936014,
"clip_ratio/high_mean": 0.014345359115395695,
"clip_ratio/low_mean": 0.02013494382845238,
"clip_ratio/low_min": 0.0020833334419876337,
"clip_ratio/region_mean": 0.03448030271101743,
"entropy": 0.13131517032161355,
"epoch": 0.00066,
"grad_norm": 0.4750834107398987,
"kl": 0.34219094878062606,
"learning_rate": 9.765323729252955e-05,
"loss": 0.0561,
"step": 66,
"step_time": 28.661124781996477
},
{
"clip_ratio/high_max": 0.009476827806793153,
"clip_ratio/high_mean": 0.0062825315981172025,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0062825315981172025,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10269.0,
"completions/max_terminated_length": 10269.0,
"completions/mean_length": 6501.5,
"completions/mean_terminated_length": 6501.5,
"completions/min_length": 724.0,
"completions/min_terminated_length": 724.0,
"entropy": 0.14845013478770852,
"epoch": 0.00067,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9048762917518616,
"kl": 0.3337597157806158,
"learning_rate": 9.749597242421838e-05,
"loss": 0.0677,
"num_tokens": 9077833.0,
"reward": 0.6164548397064209,
"reward_std": 0.5683386325836182,
"rewards/rollout_eval_reward_func/mean": 0.4744664430618286,
"rewards/rollout_eval_reward_func/std": 0.29613611102104187,
"rewards/rollout_reward_func/mean": 0.6164548397064209,
"rewards/rollout_reward_func/std": 0.6236394643783569,
"sampling/importance_sampling_ratio/max": 1.9927904605865479,
"sampling/importance_sampling_ratio/mean": 1.0013047456741333,
"sampling/importance_sampling_ratio/min": 0.5228504538536072,
"sampling/sampling_logp_difference/max": 0.6895358562469482,
"sampling/sampling_logp_difference/mean": 0.011342051438987255,
"step": 67,
"step_time": 79.70687562199964
},
{
"clip_ratio/high_max": 0.0376884457655251,
"clip_ratio/high_mean": 0.026128767582122236,
"clip_ratio/low_mean": 0.026416301843710244,
"clip_ratio/low_min": 0.007291666814126074,
"clip_ratio/region_mean": 0.0525450695422478,
"entropy": 0.15412914380431175,
"epoch": 0.00068,
"grad_norm": 0.8491650223731995,
"kl": 0.3899666126817465,
"learning_rate": 9.733378918839942e-05,
"loss": 0.0638,
"step": 68,
"step_time": 27.40538086699962
},
{
"clip_ratio/high_max": 0.006514550419524312,
"clip_ratio/high_mean": 0.003257275209762156,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003257275209762156,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10202.0,
"completions/max_terminated_length": 10202.0,
"completions/mean_length": 5860.65625,
"completions/mean_terminated_length": 5860.65625,
"completions/min_length": 540.0,
"completions/min_terminated_length": 540.0,
"entropy": 0.16269859950989485,
"epoch": 0.00069,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9143983721733093,
"kl": 0.47777214366942644,
"learning_rate": 9.716671037846007e-05,
"loss": 0.1152,
"num_tokens": 9293397.0,
"reward": 0.6956244707107544,
"reward_std": 0.5506021976470947,
"rewards/rollout_eval_reward_func/mean": 0.5125762224197388,
"rewards/rollout_eval_reward_func/std": 0.2857610881328583,
"rewards/rollout_reward_func/mean": 0.6956244707107544,
"rewards/rollout_reward_func/std": 0.5588130354881287,
"sampling/importance_sampling_ratio/max": 1.4407436847686768,
"sampling/importance_sampling_ratio/mean": 1.0004699230194092,
"sampling/importance_sampling_ratio/min": 0.5672728419303894,
"sampling/sampling_logp_difference/max": 0.5669147968292236,
"sampling/sampling_logp_difference/mean": 0.010705020278692245,
"step": 69,
"step_time": 77.93740563500614
},
{
"clip_ratio/high_max": 0.04634982522111386,
"clip_ratio/high_mean": 0.029493737209122628,
"clip_ratio/low_mean": 0.01730769290588796,
"clip_ratio/low_min": 0.004166666767559946,
"clip_ratio/region_mean": 0.046801429823972285,
"entropy": 0.1784980888478458,
"epoch": 0.0007,
"grad_norm": 0.7063129544258118,
"kl": 0.3514184970408678,
"learning_rate": 9.699475947581644e-05,
"loss": 0.1049,
"step": 70,
"step_time": 27.06573885999751
},
{
"clip_ratio/high_max": 0.0018382353009656072,
"clip_ratio/high_mean": 0.0009191176504828036,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0009191176504828036,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10593.0,
"completions/max_terminated_length": 10593.0,
"completions/mean_length": 6225.28125,
"completions/mean_terminated_length": 6225.28125,
"completions/min_length": 1544.0,
"completions/min_terminated_length": 1544.0,
"entropy": 0.17604797054082155,
"epoch": 0.00071,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.7393732070922852,
"kl": 0.20170354284346104,
"learning_rate": 9.681796064661319e-05,
"loss": 0.0413,
"num_tokens": 9520372.0,
"reward": 0.8103519678115845,
"reward_std": 0.3980957269668579,
"rewards/rollout_eval_reward_func/mean": 0.5907012224197388,
"rewards/rollout_eval_reward_func/std": 0.20860876142978668,
"rewards/rollout_reward_func/mean": 0.8103519678115845,
"rewards/rollout_reward_func/std": 0.49828964471817017,
"sampling/importance_sampling_ratio/max": 1.5257266759872437,
"sampling/importance_sampling_ratio/mean": 0.9991195201873779,
"sampling/importance_sampling_ratio/min": 0.6470949649810791,
"sampling/sampling_logp_difference/max": 0.43526220321655273,
"sampling/sampling_logp_difference/mean": 0.010150602087378502,
"step": 71,
"step_time": 79.67722541299918
},
{
"clip_ratio/high_max": 0.01454339677002281,
"clip_ratio/high_mean": 0.008729609136935323,
"clip_ratio/low_mean": 0.009114583488553762,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01784419280011207,
"entropy": 0.18162205442786217,
"epoch": 0.00072,
"grad_norm": 0.44046881794929504,
"kl": 0.20182663016021252,
"learning_rate": 9.663633873832725e-05,
"loss": 0.0328,
"step": 72,
"step_time": 28.538212690000364
},
{
"clip_ratio/high_max": 0.004232634324580431,
"clip_ratio/high_mean": 0.0021163171622902155,
"clip_ratio/low_mean": 0.0005208333604969084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002637150522787124,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10142.0,
"completions/max_terminated_length": 10142.0,
"completions/mean_length": 6840.59375,
"completions/mean_terminated_length": 6840.59375,
"completions/min_length": 1013.0,
"completions/min_terminated_length": 1013.0,
"entropy": 0.20604060776531696,
"epoch": 0.00073,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6912176609039307,
"kl": 0.26183000626042485,
"learning_rate": 9.644991927627566e-05,
"loss": -0.0088,
"num_tokens": 9767000.0,
"reward": 0.7756590247154236,
"reward_std": 0.5361872911453247,
"rewards/rollout_eval_reward_func/mean": 0.5909552574157715,
"rewards/rollout_eval_reward_func/std": 0.2465948760509491,
"rewards/rollout_reward_func/mean": 0.7756590247154236,
"rewards/rollout_reward_func/std": 0.5321318507194519,
"sampling/importance_sampling_ratio/max": 1.2645851373672485,
"sampling/importance_sampling_ratio/mean": 1.0009121894836426,
"sampling/importance_sampling_ratio/min": 0.6386132836341858,
"sampling/sampling_logp_difference/max": 0.4484562873840332,
"sampling/sampling_logp_difference/mean": 0.010102368891239166,
"step": 73,
"step_time": 82.14820753000458
},
{
"clip_ratio/high_max": 0.02557993505615741,
"clip_ratio/high_mean": 0.01748599053826183,
"clip_ratio/low_mean": 0.009895833674818277,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.027381824154872447,
"entropy": 0.22169003915041685,
"epoch": 0.00074,
"grad_norm": 0.42521047592163086,
"kl": 0.23322301171720028,
"learning_rate": 9.625872846002834e-05,
"loss": -0.0155,
"step": 74,
"step_time": 28.134478513999056
},
{
"clip_ratio/high_max": 0.008986742584966123,
"clip_ratio/high_mean": 0.005014204594772309,
"clip_ratio/low_mean": 0.002018229220993817,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007032433815766126,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9857.0,
"completions/max_terminated_length": 9857.0,
"completions/mean_length": 7195.375,
"completions/mean_terminated_length": 7195.375,
"completions/min_length": 768.0,
"completions/min_terminated_length": 768.0,
"entropy": 0.27833056077361107,
"epoch": 0.00075,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8185970187187195,
"kl": 0.24367811996489763,
"learning_rate": 9.606279315972582e-05,
"loss": -0.1492,
"num_tokens": 10025024.0,
"reward": 0.2978099584579468,
"reward_std": 0.6597497463226318,
"rewards/rollout_eval_reward_func/mean": 0.32901421189308167,
"rewards/rollout_eval_reward_func/std": 0.3157320022583008,
"rewards/rollout_reward_func/mean": 0.2978099584579468,
"rewards/rollout_reward_func/std": 0.690675675868988,
"sampling/importance_sampling_ratio/max": 1.4156017303466797,
"sampling/importance_sampling_ratio/mean": 1.0000808238983154,
"sampling/importance_sampling_ratio/min": 0.6558278799057007,
"sampling/sampling_logp_difference/max": 0.4218568801879883,
"sampling/sampling_logp_difference/mean": 0.013327672146260738,
"step": 75,
"step_time": 88.96669697499601
},
{
"epoch": 0.00075,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 9194.0,
"eval_completions/max_terminated_length": 9194.0,
"eval_completions/mean_length": 7026.0375,
"eval_completions/mean_terminated_length": 7026.0375,
"eval_completions/min_length": 4333.95,
"eval_completions/min_terminated_length": 4333.95,
"eval_entropy": 0.3085056647658348,
"eval_frac_reward_zero_std": 1.0,
"eval_kl": 0.22236853390932082,
"eval_loss": 0.0002063037100015208,
"eval_num_tokens": 10025024.0,
"eval_reward": 0.35444250535219907,
"eval_reward_std": 0.0,
"eval_rewards/rollout_eval_reward_func/mean": 0.3484247986227274,
"eval_rewards/rollout_eval_reward_func/std": 0.26531881298869847,
"eval_rewards/rollout_reward_func/mean": 0.35444250535219907,
"eval_rewards/rollout_reward_func/std": 0.5791118375957012,
"eval_runtime": 161.4965,
"eval_samples_per_second": 0.062,
"eval_sampling/importance_sampling_ratio/max": 1.1964155852794647,
"eval_sampling/importance_sampling_ratio/mean": 1.0003154128789902,
"eval_sampling/importance_sampling_ratio/min": 0.7968822807073593,
"eval_sampling/sampling_logp_difference/max": 0.2617991387844086,
"eval_sampling/sampling_logp_difference/mean": 0.01210988024249673,
"eval_steps_per_second": 0.019,
"step": 75
}
],
"logging_steps": 1.0,
"max_steps": 300,
"num_input_tokens_seen": 10025024,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}