{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010471204188481676, "grad_norm": 26.562393188476562, "kl": 0.03359118103981018, "learning_rate": 0.0, "logits/chosen": 1025103936.0, "logits/rejected": 1063107072.0, "logps/chosen": -192.39359907670453, "logps/rejected": -244.60918598790323, "loss": 1.9877, "rewards/chosen": -0.0024960009437618833, "rewards/margins": -0.003168148462927353, "rewards/rejected": 0.0006721475191654697, "step": 1 }, { "epoch": 0.010471204188481676, "grad_norm": 28.173646926879883, "kl": 0.047210514545440674, "learning_rate": 4.6875e-08, "logits/chosen": 1111284224.0, "logits/rejected": 1036883072.0, "logps/chosen": -273.04239220563846, "logps/rejected": -251.92859175774134, "loss": 1.9821, "rewards/chosen": -0.003001420849789039, "rewards/margins": -0.003218886387716663, "rewards/rejected": 0.0002174655379276241, "step": 10 }, { "epoch": 0.020942408376963352, "grad_norm": 28.309864044189453, "kl": 0.06180702522397041, "learning_rate": 9.895833333333332e-08, "logits/chosen": 1098295552.0, "logits/rejected": 1058195840.0, "logps/chosen": -271.31404378742513, "logps/rejected": -259.71006944444446, "loss": 1.9823, "rewards/chosen": -0.003387315544539583, "rewards/margins": -0.0018305458493822638, "rewards/rejected": -0.001556769695157319, "step": 20 }, { "epoch": 0.031413612565445025, "grad_norm": 25.917030334472656, "kl": 0.017666548490524292, "learning_rate": 1.5104166666666664e-07, "logits/chosen": 1014291776.0, "logits/rejected": 938894528.0, "logps/chosen": -286.8935106981982, "logps/rejected": -240.47707145765472, "loss": 1.9792, "rewards/chosen": -0.009706638238809488, "rewards/margins": 0.0046446070076055414, "rewards/rejected": -0.01435124524641503, "step": 30 }, { "epoch": 0.041884816753926704, "grad_norm": 29.217172622680664, "kl": 0.0, "learning_rate": 2.03125e-07, "logits/chosen": 1001239232.0, "logits/rejected": 1008372032.0, "logps/chosen": -301.35429748822605, "logps/rejected": -276.2920392690513, "loss": 1.9909, "rewards/chosen": -0.04018252386215904, "rewards/margins": 0.004611896940786439, "rewards/rejected": -0.04479442080294548, "step": 40 }, { "epoch": 0.05235602094240838, "grad_norm": 31.018491744995117, "kl": 0.0, "learning_rate": 2.552083333333333e-07, "logits/chosen": 1010273408.0, "logits/rejected": 899531584.0, "logps/chosen": -303.4438942307692, "logps/rejected": -267.7524305555556, "loss": 1.9613, "rewards/chosen": -0.1181301762507512, "rewards/margins": 0.01815716176388174, "rewards/rejected": -0.13628733801463294, "step": 50 }, { "epoch": 0.06282722513089005, "grad_norm": 28.290821075439453, "kl": 0.0, "learning_rate": 3.0729166666666665e-07, "logits/chosen": 1054024704.0, "logits/rejected": 1066224448.0, "logps/chosen": -294.36489550473186, "logps/rejected": -273.374661377709, "loss": 1.9469, "rewards/chosen": -0.2557775455318415, "rewards/margins": 0.006258757347657351, "rewards/rejected": -0.26203630287949886, "step": 60 }, { "epoch": 0.07329842931937172, "grad_norm": 33.95917892456055, "kl": 0.0, "learning_rate": 3.59375e-07, "logits/chosen": 942386816.0, "logits/rejected": 1006898688.0, "logps/chosen": -281.92285470257235, "logps/rejected": -277.8022416413374, "loss": 1.8777, "rewards/chosen": -0.5341391609412681, "rewards/margins": 0.026281655126598524, "rewards/rejected": -0.5604208160678666, "step": 70 }, { "epoch": 0.08376963350785341, "grad_norm": 30.24815559387207, "kl": 0.0, "learning_rate": 4.114583333333333e-07, "logits/chosen": 1031609536.0, "logits/rejected": 974328320.0, "logps/chosen": -310.20034934915776, "logps/rejected": -262.0218301435407, "loss": 1.8281, "rewards/chosen": -0.771320757785581, "rewards/margins": 0.06435381405328466, "rewards/rejected": -0.8356745718388656, "step": 80 }, { "epoch": 0.09424083769633508, "grad_norm": 16.658273696899414, "kl": 0.0, "learning_rate": 4.6354166666666664e-07, "logits/chosen": 902042816.0, "logits/rejected": 935183872.0, "logps/chosen": -264.3142224409449, "logps/rejected": -268.2765988372093, "loss": 1.7369, "rewards/chosen": -1.3310041292445867, "rewards/margins": 0.07459388280725432, "rewards/rejected": -1.405598012051841, "step": 90 }, { "epoch": 0.10471204188481675, "grad_norm": 12.389727592468262, "kl": 0.0, "learning_rate": 4.999849525959245e-07, "logits/chosen": 959631872.0, "logits/rejected": 1010654720.0, "logps/chosen": -312.21802805280527, "logps/rejected": -273.6338788946588, "loss": 1.6219, "rewards/chosen": -1.7981488822710396, "rewards/margins": 0.140241108108226, "rewards/rejected": -1.9383899903792656, "step": 100 }, { "epoch": 0.11518324607329843, "grad_norm": 7.393283367156982, "kl": 0.0, "learning_rate": 4.997174935782199e-07, "logits/chosen": 880968384.0, "logits/rejected": 880545600.0, "logps/chosen": -305.7040970062208, "logps/rejected": -269.9880543563579, "loss": 1.6532, "rewards/chosen": -2.356570989866835, "rewards/margins": 0.14298288817476612, "rewards/rejected": -2.4995538780416013, "step": 110 }, { "epoch": 0.1256544502617801, "grad_norm": 4.027650356292725, "kl": 0.0, "learning_rate": 4.9911605954668e-07, "logits/chosen": 906821824.0, "logits/rejected": 1005957952.0, "logps/chosen": -298.43577188940094, "logps/rejected": -318.1883942766296, "loss": 1.6413, "rewards/chosen": -3.1073929981518815, "rewards/margins": 0.22084922039939014, "rewards/rejected": -3.3282422185512717, "step": 120 }, { "epoch": 0.13612565445026178, "grad_norm": 2.4550304412841797, "kl": 0.0, "learning_rate": 4.981814548660135e-07, "logits/chosen": 905374080.0, "logits/rejected": 1031035072.0, "logps/chosen": -317.7083851575456, "logps/rejected": -292.74079117429835, "loss": 1.518, "rewards/chosen": -3.76897818848466, "rewards/margins": 0.27336776851774003, "rewards/rejected": -4.0423459570024, "step": 130 }, { "epoch": 0.14659685863874344, "grad_norm": 1.2676219940185547, "kl": 0.0, "learning_rate": 4.969149294871417e-07, "logits/chosen": 896367616.0, "logits/rejected": 964537536.0, "logps/chosen": -305.43130990415335, "logps/rejected": -322.4831804281346, "loss": 1.5691, "rewards/chosen": -4.445092149435903, "rewards/margins": 0.32064222855052726, "rewards/rejected": -4.76573437798643, "step": 140 }, { "epoch": 0.15706806282722513, "grad_norm": 1.0475122928619385, "kl": 0.0, "learning_rate": 4.953181772754997e-07, "logits/chosen": 898162304.0, "logits/rejected": 908483776.0, "logps/chosen": -313.5234494274809, "logps/rejected": -306.161225, "loss": 1.636, "rewards/chosen": -4.89772967855439, "rewards/margins": 0.43406836832061035, "rewards/rejected": -5.331798046875, "step": 150 }, { "epoch": 0.16753926701570682, "grad_norm": 1.9686732292175293, "kl": 0.0, "learning_rate": 4.93393333745642e-07, "logits/chosen": 860213888.0, "logits/rejected": 877230592.0, "logps/chosen": -309.0058340097403, "logps/rejected": -306.6567676957831, "loss": 1.5387, "rewards/chosen": -4.939952503551137, "rewards/margins": 0.4787038111608366, "rewards/rejected": -5.418656314711973, "step": 160 }, { "epoch": 0.17801047120418848, "grad_norm": 3.1122231483459473, "kl": 0.0, "learning_rate": 4.9114297320518e-07, "logits/chosen": 974107328.0, "logits/rejected": 970027520.0, "logps/chosen": -326.03355211598745, "logps/rejected": -319.89719626168227, "loss": 1.589, "rewards/chosen": -4.719719291854428, "rewards/margins": 0.561926200746818, "rewards/rejected": -5.281645492601246, "step": 170 }, { "epoch": 0.18848167539267016, "grad_norm": 2.5136497020721436, "kl": 0.0, "learning_rate": 4.885701053118751e-07, "logits/chosen": 924908736.0, "logits/rejected": 961720640.0, "logps/chosen": -318.30974690880987, "logps/rejected": -317.9285643759874, "loss": 1.6112, "rewards/chosen": -4.51682306257245, "rewards/margins": 0.5941317162585138, "rewards/rejected": -5.110954778830964, "step": 180 }, { "epoch": 0.19895287958115182, "grad_norm": 7.213411331176758, "kl": 0.0, "learning_rate": 4.856781710484872e-07, "logits/chosen": 928431808.0, "logits/rejected": 952974528.0, "logps/chosen": -314.60199860446573, "logps/rejected": -322.4887299004594, "loss": 1.5518, "rewards/chosen": -4.191326427307616, "rewards/margins": 0.6273537327708683, "rewards/rejected": -4.818680160078484, "step": 190 }, { "epoch": 0.2094240837696335, "grad_norm": 6.813438892364502, "kl": 0.0, "learning_rate": 4.824710381207655e-07, "logits/chosen": 918448448.0, "logits/rejected": 1015918208.0, "logps/chosen": -321.83199356913184, "logps/rejected": -311.08847834346506, "loss": 1.5375, "rewards/chosen": -3.782886762711013, "rewards/margins": 0.6459696202677101, "rewards/rejected": -4.428856382978723, "step": 200 }, { "epoch": 0.2198952879581152, "grad_norm": 8.781408309936523, "kl": 0.0, "learning_rate": 4.789529957847353e-07, "logits/chosen": 1004952576.0, "logits/rejected": 891462464.0, "logps/chosen": -323.2924141221374, "logps/rejected": -297.6743, "loss": 1.5864, "rewards/chosen": -3.252991561307252, "rewards/margins": 0.762442423067748, "rewards/rejected": -4.015433984375, "step": 210 }, { "epoch": 0.23036649214659685, "grad_norm": 13.12763500213623, "kl": 0.0, "learning_rate": 4.751287491101977e-07, "logits/chosen": 946857856.0, "logits/rejected": 880448896.0, "logps/chosen": -304.32392760093165, "logps/rejected": -287.22454795597486, "loss": 1.5538, "rewards/chosen": -2.888836025451281, "rewards/margins": 0.7221485368767455, "rewards/rejected": -3.6109845623280266, "step": 220 }, { "epoch": 0.24083769633507854, "grad_norm": 11.393705368041992, "kl": 0.0, "learning_rate": 4.710034126881159e-07, "logits/chosen": 1049158464.0, "logits/rejected": 824376000.0, "logps/chosen": -318.3151382823872, "logps/rejected": -307.4688817453626, "loss": 1.6174, "rewards/chosen": -2.438205946654385, "rewards/margins": 0.8078717468795524, "rewards/rejected": -3.2460776935339375, "step": 230 }, { "epoch": 0.2513089005235602, "grad_norm": 11.578465461730957, "kl": 0.0, "learning_rate": 4.665825037903035e-07, "logits/chosen": 1024492544.0, "logits/rejected": 929939776.0, "logps/chosen": -296.2886893297381, "logps/rejected": -285.70906794770207, "loss": 1.5392, "rewards/chosen": -2.06980766684322, "rewards/margins": 0.8852973484474891, "rewards/rejected": -2.9551050152907092, "step": 240 }, { "epoch": 0.2617801047120419, "grad_norm": 13.729632377624512, "kl": 0.0, "learning_rate": 4.618719349905619e-07, "logits/chosen": 1081932800.0, "logits/rejected": 961173696.0, "logps/chosen": -308.0947280534351, "logps/rejected": -284.4129, "loss": 1.5103, "rewards/chosen": -1.88463274540792, "rewards/margins": 0.9027238952170802, "rewards/rejected": -2.787356640625, "step": 250 }, { "epoch": 0.27225130890052357, "grad_norm": 11.3008394241333, "kl": 0.0, "learning_rate": 4.568780062571374e-07, "logits/chosen": 972363776.0, "logits/rejected": 1046053376.0, "logps/chosen": -290.8611027644231, "logps/rejected": -295.7996379573171, "loss": 1.4671, "rewards/chosen": -1.9210040752704327, "rewards/margins": 0.9912621675244415, "rewards/rejected": -2.9122662427948742, "step": 260 }, { "epoch": 0.28272251308900526, "grad_norm": 11.210180282592773, "kl": 0.0, "learning_rate": 4.516073965270717e-07, "logits/chosen": 927748608.0, "logits/rejected": 893388288.0, "logps/chosen": -287.66021126760563, "logps/rejected": -304.2968262480499, "loss": 1.485, "rewards/chosen": -1.9320662473102503, "rewards/margins": 1.088795397702425, "rewards/rejected": -3.0208616450126753, "step": 270 }, { "epoch": 0.2931937172774869, "grad_norm": 12.282171249389648, "kl": 0.0, "learning_rate": 4.460671547737158e-07, "logits/chosen": 914422144.0, "logits/rejected": 952285568.0, "logps/chosen": -319.30807877813504, "logps/rejected": -283.1123195288754, "loss": 1.4245, "rewards/chosen": -1.9814747506782557, "rewards/margins": 1.1337076795079144, "rewards/rejected": -3.11518243018617, "step": 280 }, { "epoch": 0.3036649214659686, "grad_norm": 11.814573287963867, "kl": 0.0, "learning_rate": 4.40264690579353e-07, "logits/chosen": 972118656.0, "logits/rejected": 909624512.0, "logps/chosen": -312.8792158917683, "logps/rejected": -280.78568209134613, "loss": 1.5032, "rewards/chosen": -1.850838079685118, "rewards/margins": 1.1238724336987964, "rewards/rejected": -2.9747105133839145, "step": 290 }, { "epoch": 0.31413612565445026, "grad_norm": 12.68067455291748, "kl": 0.0, "learning_rate": 4.3420776422553916e-07, "logits/chosen": 953990016.0, "logits/rejected": 940757632.0, "logps/chosen": -299.58254491590213, "logps/rejected": -287.3024161341853, "loss": 1.4921, "rewards/chosen": -1.8043567529147553, "rewards/margins": 1.231686164355113, "rewards/rejected": -3.036042917269868, "step": 300 }, { "epoch": 0.32460732984293195, "grad_norm": 17.389904022216797, "kl": 0.0, "learning_rate": 4.279044763144141e-07, "logits/chosen": 894258816.0, "logits/rejected": 1021757120.0, "logps/chosen": -280.6739217252396, "logps/rejected": -316.6185254204893, "loss": 1.4539, "rewards/chosen": -1.810631115215655, "rewards/margins": 1.282302842878764, "rewards/rejected": -3.092933958094419, "step": 310 }, { "epoch": 0.33507853403141363, "grad_norm": 10.321409225463867, "kl": 0.0, "learning_rate": 4.213632569348639e-07, "logits/chosen": 1014880128.0, "logits/rejected": 907480960.0, "logps/chosen": -302.61937225475845, "logps/rejected": -298.2493718592965, "loss": 1.5305, "rewards/chosen": -1.9286555322337116, "rewards/margins": 1.3392826695172306, "rewards/rejected": -3.267938201750942, "step": 320 }, { "epoch": 0.34554973821989526, "grad_norm": 12.460442543029785, "kl": 0.0, "learning_rate": 4.145928543880249e-07, "logits/chosen": 917175104.0, "logits/rejected": 969322880.0, "logps/chosen": -300.56881911532383, "logps/rejected": -297.01260625965995, "loss": 1.4141, "rewards/chosen": -1.6344111775523302, "rewards/margins": 1.6734750551804751, "rewards/rejected": -3.3078862327328054, "step": 330 }, { "epoch": 0.35602094240837695, "grad_norm": 11.16015338897705, "kl": 0.0, "learning_rate": 4.076023234872057e-07, "logits/chosen": 866838528.0, "logits/rejected": 971704128.0, "logps/chosen": -298.1501665993538, "logps/rejected": -291.4291792738275, "loss": 1.3923, "rewards/chosen": -1.5310064509920234, "rewards/margins": 1.6836444980908056, "rewards/rejected": -3.214650949082829, "step": 340 }, { "epoch": 0.36649214659685864, "grad_norm": 27.34151268005371, "kl": 0.0, "learning_rate": 4.004010134478771e-07, "logits/chosen": 968713728.0, "logits/rejected": 972469376.0, "logps/chosen": -286.62965374228395, "logps/rejected": -288.3266663370253, "loss": 1.4522, "rewards/chosen": -1.4199869603286555, "rewards/margins": 1.5951061066956722, "rewards/rejected": -3.0150930670243277, "step": 350 }, { "epoch": 0.3769633507853403, "grad_norm": 16.52737045288086, "kl": 0.0, "learning_rate": 3.9299855538392534e-07, "logits/chosen": 922831360.0, "logits/rejected": 947811008.0, "logps/chosen": -301.2358689263323, "logps/rejected": -295.06773267133957, "loss": 1.4126, "rewards/chosen": -1.6193985415850314, "rewards/margins": 1.8232337354788317, "rewards/rejected": -3.442632277063863, "step": 360 }, { "epoch": 0.387434554973822, "grad_norm": 16.960041046142578, "kl": 0.0, "learning_rate": 3.8540484942689075e-07, "logits/chosen": 870200512.0, "logits/rejected": 973426560.0, "logps/chosen": -289.64174462579615, "logps/rejected": -309.79447852760734, "loss": 1.461, "rewards/chosen": -1.5883331298828125, "rewards/margins": 1.8990898834415741, "rewards/rejected": -3.4874230133243866, "step": 370 }, { "epoch": 0.39790575916230364, "grad_norm": 16.1551456451416, "kl": 0.0, "learning_rate": 3.77630051485419e-07, "logits/chosen": 954556544.0, "logits/rejected": 868749824.0, "logps/chosen": -303.6716906342183, "logps/rejected": -294.0799159053156, "loss": 1.4297, "rewards/chosen": -1.4528007394796276, "rewards/margins": 1.913596953432021, "rewards/rejected": -3.3663976929116486, "step": 380 }, { "epoch": 0.4083769633507853, "grad_norm": 18.04342269897461, "kl": 0.0, "learning_rate": 3.696845596626342e-07, "logits/chosen": 914389696.0, "logits/rejected": 942570880.0, "logps/chosen": -274.45240575396826, "logps/rejected": -291.5584855769231, "loss": 1.4241, "rewards/chosen": -1.014939953031994, "rewards/margins": 1.9614178444439674, "rewards/rejected": -2.9763577974759614, "step": 390 }, { "epoch": 0.418848167539267, "grad_norm": 19.24696159362793, "kl": 0.0, "learning_rate": 3.61579000349597e-07, "logits/chosen": 901277504.0, "logits/rejected": 911895168.0, "logps/chosen": -296.7239105504587, "logps/rejected": -297.9510033945687, "loss": 1.3318, "rewards/chosen": -1.093078053325688, "rewards/margins": 2.3687205688787847, "rewards/rejected": -3.4617986222044728, "step": 400 }, { "epoch": 0.4293193717277487, "grad_norm": 20.188373565673828, "kl": 0.0, "learning_rate": 3.5332421401344837e-07, "logits/chosen": 755553600.0, "logits/rejected": 901826880.0, "logps/chosen": -295.43673716329965, "logps/rejected": -291.0846164358601, "loss": 1.3484, "rewards/chosen": -1.2031269523029777, "rewards/margins": 2.3843341133174665, "rewards/rejected": -3.5874610656204444, "step": 410 }, { "epoch": 0.4397905759162304, "grad_norm": 39.75468444824219, "kl": 0.0, "learning_rate": 3.4493124069924635e-07, "logits/chosen": 849826752.0, "logits/rejected": 898505088.0, "logps/chosen": -305.4299, "logps/rejected": -290.353697519084, "loss": 1.3488, "rewards/chosen": -1.8245701171875, "rewards/margins": 2.5110345315481872, "rewards/rejected": -4.335604648735687, "step": 420 }, { "epoch": 0.450261780104712, "grad_norm": 23.038509368896484, "kl": 0.0, "learning_rate": 3.3641130526488335e-07, "logits/chosen": 801155392.0, "logits/rejected": 885279936.0, "logps/chosen": -271.98359007064363, "logps/rejected": -319.79568429237946, "loss": 1.4282, "rewards/chosen": -1.701683727126668, "rewards/margins": 2.225468583210618, "rewards/rejected": -3.927152310337286, "step": 430 }, { "epoch": 0.4607329842931937, "grad_norm": 16.816984176635742, "kl": 0.0, "learning_rate": 3.2777580236883473e-07, "logits/chosen": 781878144.0, "logits/rejected": 840188736.0, "logps/chosen": -268.33087060702877, "logps/rejected": -298.1252628058104, "loss": 1.3415, "rewards/chosen": -0.9628440198806909, "rewards/margins": 2.604564550575158, "rewards/rejected": -3.5674085704558487, "step": 440 }, { "epoch": 0.4712041884816754, "grad_norm": 23.657085418701172, "kl": 0.0, "learning_rate": 3.1903628123081196e-07, "logits/chosen": 815678464.0, "logits/rejected": 777286144.0, "logps/chosen": -294.5112335015528, "logps/rejected": -298.4295892295597, "loss": 1.3785, "rewards/chosen": -1.723951446343653, "rewards/margins": 2.6521049608890515, "rewards/rejected": -4.376056407232705, "step": 450 }, { "epoch": 0.4816753926701571, "grad_norm": 19.01488494873047, "kl": 0.0, "learning_rate": 3.1020443018570556e-07, "logits/chosen": 727440000.0, "logits/rejected": 849506112.0, "logps/chosen": -292.6208908279221, "logps/rejected": -297.79659262048193, "loss": 1.3359, "rewards/chosen": -1.9514271129261365, "rewards/margins": 2.62783696486942, "rewards/rejected": -4.579264077795557, "step": 460 }, { "epoch": 0.49214659685863876, "grad_norm": 33.731056213378906, "kl": 0.0, "learning_rate": 3.0129206105147343e-07, "logits/chosen": 776329344.0, "logits/rejected": 846505344.0, "logps/chosen": -313.49515701468187, "logps/rejected": -295.306128185907, "loss": 1.3287, "rewards/chosen": -2.3697519722420473, "rewards/margins": 2.4085628498391, "rewards/rejected": -4.778314822081147, "step": 470 }, { "epoch": 0.5026178010471204, "grad_norm": 13.92732048034668, "kl": 0.0, "learning_rate": 2.923110933318805e-07, "logits/chosen": 808196672.0, "logits/rejected": 800854272.0, "logps/chosen": -298.69172108208954, "logps/rejected": -294.38888319672134, "loss": 1.4221, "rewards/chosen": -2.5513560736357275, "rewards/margins": 2.401674872189068, "rewards/rejected": -4.953030945824795, "step": 480 }, { "epoch": 0.5130890052356021, "grad_norm": 19.206186294555664, "kl": 0.0, "learning_rate": 2.832735382752194e-07, "logits/chosen": 850527616.0, "logits/rejected": 800958400.0, "logps/chosen": -290.59834418070443, "logps/rejected": -309.14478668261563, "loss": 1.3764, "rewards/chosen": -2.0100744460662328, "rewards/margins": 2.6875198147989985, "rewards/rejected": -4.697594260865231, "step": 490 }, { "epoch": 0.5235602094240838, "grad_norm": 27.097524642944336, "kl": 0.0, "learning_rate": 2.741914828103307e-07, "logits/chosen": 786880896.0, "logits/rejected": 811829952.0, "logps/chosen": -284.3116169544741, "logps/rejected": -294.09241349144634, "loss": 1.3165, "rewards/chosen": -1.4351621714641385, "rewards/margins": 2.9411340899569347, "rewards/rejected": -4.376296261421073, "step": 500 }, { "epoch": 0.5340314136125655, "grad_norm": 24.57645034790039, "kl": 0.0, "learning_rate": 2.650770733814065e-07, "logits/chosen": 794909632.0, "logits/rejected": 800322944.0, "logps/chosen": -303.6320275119617, "logps/rejected": -303.80400555130166, "loss": 1.3389, "rewards/chosen": -2.452097623542165, "rewards/margins": 2.8188633259218467, "rewards/rejected": -5.270960949464012, "step": 510 }, { "epoch": 0.5445026178010471, "grad_norm": 19.90850067138672, "kl": 0.0, "learning_rate": 2.55942499703198e-07, "logits/chosen": 832917312.0, "logits/rejected": 859018368.0, "logps/chosen": -311.358175, "logps/rejected": -306.0612118320611, "loss": 1.3487, "rewards/chosen": -2.985473828125, "rewards/margins": 2.5981758319417936, "rewards/rejected": -5.5836496600667935, "step": 520 }, { "epoch": 0.5549738219895288, "grad_norm": 16.22838020324707, "kl": 0.0, "learning_rate": 2.467999784583527e-07, "logits/chosen": 785304256.0, "logits/rejected": 842632128.0, "logps/chosen": -284.2848192891374, "logps/rejected": -302.64994266055044, "loss": 1.299, "rewards/chosen": -1.9525462385183705, "rewards/margins": 3.111523503573564, "rewards/rejected": -5.0640697420919345, "step": 530 }, { "epoch": 0.5654450261780105, "grad_norm": 19.957088470458984, "kl": 0.0, "learning_rate": 2.3766173695868388e-07, "logits/chosen": 769985984.0, "logits/rejected": 797255360.0, "logps/chosen": -298.26605570444104, "logps/rejected": -304.75356359649123, "loss": 1.3787, "rewards/chosen": -1.5651508699332408, "rewards/margins": 2.674011544714885, "rewards/rejected": -4.239162414648126, "step": 540 }, { "epoch": 0.5759162303664922, "grad_norm": 20.141708374023438, "kl": 0.0, "learning_rate": 2.285399967922253e-07, "logits/chosen": 732652224.0, "logits/rejected": 825995904.0, "logps/chosen": -283.12699680511184, "logps/rejected": -302.75038226299694, "loss": 1.2983, "rewards/chosen": -2.2529322225064896, "rewards/margins": 2.993755080389153, "rewards/rejected": -5.2466873028956424, "step": 550 }, { "epoch": 0.5863874345549738, "grad_norm": 15.83252239227295, "kl": 0.0, "learning_rate": 2.194469574779397e-07, "logits/chosen": 891087488.0, "logits/rejected": 787483520.0, "logps/chosen": -304.4279685128983, "logps/rejected": -305.6478713768116, "loss": 1.3612, "rewards/chosen": -2.3803220062950494, "rewards/margins": 2.7306727350495557, "rewards/rejected": -5.110994741344605, "step": 560 }, { "epoch": 0.5968586387434555, "grad_norm": 15.746390342712402, "kl": 0.0, "learning_rate": 2.1039478014994441e-07, "logits/chosen": 725274944.0, "logits/rejected": 853497664.0, "logps/chosen": -282.81245139968894, "logps/rejected": -308.07447017268447, "loss": 1.331, "rewards/chosen": -1.969779339400272, "rewards/margins": 2.9806961407557324, "rewards/rejected": -4.950475480156005, "step": 570 }, { "epoch": 0.6073298429319371, "grad_norm": 29.05588150024414, "kl": 0.0, "learning_rate": 2.0139557129307149e-07, "logits/chosen": 805204992.0, "logits/rejected": 881275392.0, "logps/chosen": -307.06715266719743, "logps/rejected": -325.5736915260736, "loss": 1.2908, "rewards/chosen": -1.7291349180185112, "rewards/margins": 3.099390742209633, "rewards/rejected": -4.828525660228144, "step": 580 }, { "epoch": 0.6178010471204188, "grad_norm": 17.282276153564453, "kl": 0.0, "learning_rate": 1.9246136655151808e-07, "logits/chosen": 853613056.0, "logits/rejected": 822100032.0, "logps/chosen": -309.79278100775196, "logps/rejected": -327.5703248031496, "loss": 1.296, "rewards/chosen": -2.4475913350896317, "rewards/margins": 3.021821112621983, "rewards/rejected": -5.469412447711615, "step": 590 }, { "epoch": 0.6282722513089005, "grad_norm": 25.881441116333008, "kl": 0.0, "learning_rate": 1.8360411463223873e-07, "logits/chosen": 790009792.0, "logits/rejected": 845724032.0, "logps/chosen": -297.8154809220986, "logps/rejected": -315.4985839093702, "loss": 1.3103, "rewards/chosen": -2.263746592122913, "rewards/margins": 3.04722330995082, "rewards/rejected": -5.310969902073733, "step": 600 }, { "epoch": 0.6387434554973822, "grad_norm": 22.745113372802734, "kl": 0.0, "learning_rate": 1.7483566132460865e-07, "logits/chosen": 793115456.0, "logits/rejected": 847319680.0, "logps/chosen": -307.8780409414557, "logps/rejected": -297.18407600308643, "loss": 1.3389, "rewards/chosen": -2.189051567753659, "rewards/margins": 2.7282397673640113, "rewards/rejected": -4.91729133511767, "step": 610 }, { "epoch": 0.6492146596858639, "grad_norm": 20.82432746887207, "kl": 0.0, "learning_rate": 1.66167733657731e-07, "logits/chosen": 836790912.0, "logits/rejected": 851649024.0, "logps/chosen": -310.7592703349282, "logps/rejected": -311.3528426493109, "loss": 1.3303, "rewards/chosen": -2.0225349195075757, "rewards/margins": 2.746800419791812, "rewards/rejected": -4.769335339299388, "step": 620 }, { "epoch": 0.6596858638743456, "grad_norm": 18.315628051757812, "kl": 0.0, "learning_rate": 1.5761192421657456e-07, "logits/chosen": 792098688.0, "logits/rejected": 826501376.0, "logps/chosen": -301.0860373402556, "logps/rejected": -318.67146884556576, "loss": 1.2914, "rewards/chosen": -1.6394025418705072, "rewards/margins": 3.416822596102543, "rewards/rejected": -5.05622513797305, "step": 630 }, { "epoch": 0.6701570680628273, "grad_norm": 16.926488876342773, "kl": 0.0, "learning_rate": 1.491796756379185e-07, "logits/chosen": 887689536.0, "logits/rejected": 823059328.0, "logps/chosen": -321.0725746268657, "logps/rejected": -304.9668545081967, "loss": 1.3763, "rewards/chosen": -1.9418799556902986, "rewards/margins": 2.9874629458209716, "rewards/rejected": -4.92934290151127, "step": 640 }, { "epoch": 0.680628272251309, "grad_norm": 17.46025276184082, "kl": 0.0, "learning_rate": 1.4088226530684071e-07, "logits/chosen": 877960000.0, "logits/rejected": 800502720.0, "logps/chosen": -305.0225134408602, "logps/rejected": -312.8841166534181, "loss": 1.2991, "rewards/chosen": -1.6010805941580262, "rewards/margins": 3.246691709970551, "rewards/rejected": -4.847772304128577, "step": 650 }, { "epoch": 0.6910994764397905, "grad_norm": 26.456403732299805, "kl": 0.0, "learning_rate": 1.327307902742142e-07, "logits/chosen": 874241024.0, "logits/rejected": 855177088.0, "logps/chosen": -293.12502403846156, "logps/rejected": -322.4185267857143, "loss": 1.2745, "rewards/chosen": -1.854320537860577, "rewards/margins": 3.6010327297485505, "rewards/rejected": -5.455353267609127, "step": 660 }, { "epoch": 0.7015706806282722, "grad_norm": 27.566020965576172, "kl": 0.0, "learning_rate": 1.2473615241538523e-07, "logits/chosen": 803011584.0, "logits/rejected": 778086784.0, "logps/chosen": -281.4911978390462, "logps/rejected": -322.3012366584565, "loss": 1.3525, "rewards/chosen": -1.6165469492245714, "rewards/margins": 3.3331648926781376, "rewards/rejected": -4.949711841902709, "step": 670 }, { "epoch": 0.7120418848167539, "grad_norm": 25.231483459472656, "kl": 0.0, "learning_rate": 1.169090438498816e-07, "logits/chosen": 864072320.0, "logits/rejected": 894087488.0, "logps/chosen": -299.5445031298905, "logps/rejected": -311.1689498829953, "loss": 1.3068, "rewards/chosen": -1.4237583978261932, "rewards/margins": 3.4037450148103123, "rewards/rejected": -4.8275034126365055, "step": 680 }, { "epoch": 0.7225130890052356, "grad_norm": 26.85382652282715, "kl": 0.0, "learning_rate": 1.0925993264165045e-07, "logits/chosen": 801657152.0, "logits/rejected": 836854656.0, "logps/chosen": -295.48494908146967, "logps/rejected": -316.611381880734, "loss": 1.2955, "rewards/chosen": -1.5278358337597344, "rewards/margins": 3.3747778148211136, "rewards/rejected": -4.902613648580848, "step": 690 }, { "epoch": 0.7329842931937173, "grad_norm": 17.67808723449707, "kl": 0.0, "learning_rate": 1.0179904879894998e-07, "logits/chosen": 839250560.0, "logits/rejected": 829687552.0, "logps/chosen": -294.1292613636364, "logps/rejected": -317.5498929127726, "loss": 1.3023, "rewards/chosen": -1.7617466845856191, "rewards/margins": 3.402093731107527, "rewards/rejected": -5.163840415693146, "step": 700 }, { "epoch": 0.743455497382199, "grad_norm": 32.00407791137695, "kl": 0.0, "learning_rate": 9.453637059262117e-08, "logits/chosen": 800788096.0, "logits/rejected": 837777152.0, "logps/chosen": -289.1109255725191, "logps/rejected": -294.386675, "loss": 1.3705, "rewards/chosen": -1.946717147244752, "rewards/margins": 2.817464102755248, "rewards/rejected": -4.76418125, "step": 710 }, { "epoch": 0.7539267015706806, "grad_norm": 18.53364372253418, "kl": 0.0, "learning_rate": 8.748161121103406e-08, "logits/chosen": 801277312.0, "logits/rejected": 835388288.0, "logps/chosen": -298.15263287401575, "logps/rejected": -325.6966812015504, "loss": 1.3015, "rewards/chosen": -1.6334436131274606, "rewards/margins": 3.3554996554045164, "rewards/rejected": -4.988943268531977, "step": 720 }, { "epoch": 0.7643979057591623, "grad_norm": 18.17774772644043, "kl": 0.0, "learning_rate": 8.064420576955965e-08, "logits/chosen": 859945600.0, "logits/rejected": 924831360.0, "logps/chosen": -302.85615234375, "logps/rejected": -323.9920654296875, "loss": 1.2968, "rewards/chosen": -2.0160938262939454, "rewards/margins": 3.482906723022461, "rewards/rejected": -5.499000549316406, "step": 730 }, { "epoch": 0.774869109947644, "grad_norm": 24.694583892822266, "kl": 0.0, "learning_rate": 7.403329869193922e-08, "logits/chosen": 838223552.0, "logits/rejected": 778052224.0, "logps/chosen": -290.2984591013825, "logps/rejected": -299.4383942766296, "loss": 1.2544, "rewards/chosen": -1.8794061569940477, "rewards/margins": 3.8346927506222084, "rewards/rejected": -5.714098907616256, "step": 740 }, { "epoch": 0.7853403141361257, "grad_norm": 29.033084869384766, "kl": 0.0, "learning_rate": 6.765773148042858e-08, "logits/chosen": 843541504.0, "logits/rejected": 823506752.0, "logps/chosen": -299.42698732718895, "logps/rejected": -302.11419415739266, "loss": 1.3466, "rewards/chosen": -2.019189603134601, "rewards/margins": 2.8317887448135313, "rewards/rejected": -4.850978347948132, "step": 750 }, { "epoch": 0.7958115183246073, "grad_norm": 19.56595230102539, "kl": 0.0, "learning_rate": 6.152603089107139e-08, "logits/chosen": 810777344.0, "logits/rejected": 779764736.0, "logps/chosen": -289.197298325723, "logps/rejected": -302.0749899678973, "loss": 1.3425, "rewards/chosen": -1.8063266237395357, "rewards/margins": 3.0717198437062905, "rewards/rejected": -4.878046467445826, "step": 760 }, { "epoch": 0.806282722513089, "grad_norm": 14.122668266296387, "kl": 0.0, "learning_rate": 5.5646397529920175e-08, "logits/chosen": 837819072.0, "logits/rejected": 859991232.0, "logps/chosen": -314.30802861685214, "logps/rejected": -308.1335925499232, "loss": 1.1956, "rewards/chosen": -1.5554460243504074, "rewards/margins": 3.727686596137304, "rewards/rejected": -5.283132620487711, "step": 770 }, { "epoch": 0.8167539267015707, "grad_norm": 22.549423217773438, "kl": 0.0, "learning_rate": 5.002669488545111e-08, "logits/chosen": 790213888.0, "logits/rejected": 885866496.0, "logps/chosen": -293.48640188834156, "logps/rejected": -321.5396562965723, "loss": 1.3008, "rewards/chosen": -1.921995266317734, "rewards/margins": 2.910402012696797, "rewards/rejected": -4.832397279014531, "step": 780 }, { "epoch": 0.8272251308900523, "grad_norm": 21.376644134521484, "kl": 0.0, "learning_rate": 4.467443881184646e-08, "logits/chosen": 782350336.0, "logits/rejected": 819218560.0, "logps/chosen": -300.98388364779873, "logps/rejected": -297.211810947205, "loss": 1.3223, "rewards/chosen": -2.0677747426542847, "rewards/margins": 2.908827941982749, "rewards/rejected": -4.976602684637034, "step": 790 }, { "epoch": 0.837696335078534, "grad_norm": 27.827619552612305, "kl": 0.0, "learning_rate": 3.959678747720488e-08, "logits/chosen": 906698944.0, "logits/rejected": 851121024.0, "logps/chosen": -293.82601851851854, "logps/rejected": -310.759375, "loss": 1.3648, "rewards/chosen": -2.1147281901041666, "rewards/margins": 3.1817343938317837, "rewards/rejected": -5.29646258393595, "step": 800 }, { "epoch": 0.8481675392670157, "grad_norm": 14.492793083190918, "kl": 0.0, "learning_rate": 3.480053179012654e-08, "logits/chosen": 731670336.0, "logits/rejected": 864448640.0, "logps/chosen": -282.0983207472178, "logps/rejected": -311.6561539938556, "loss": 1.3694, "rewards/chosen": -2.1266589141991257, "rewards/margins": 2.7596034315910045, "rewards/rejected": -4.88626234579013, "step": 810 }, { "epoch": 0.8586387434554974, "grad_norm": 22.540645599365234, "kl": 0.0, "learning_rate": 3.029208631747446e-08, "logits/chosen": 822572672.0, "logits/rejected": 801164608.0, "logps/chosen": -287.1926763803681, "logps/rejected": -311.2378085191083, "loss": 1.2763, "rewards/chosen": -1.883757983248658, "rewards/margins": 3.543778513442226, "rewards/rejected": -5.427536496690884, "step": 820 }, { "epoch": 0.8691099476439791, "grad_norm": 17.336292266845703, "kl": 0.0, "learning_rate": 2.607748070546037e-08, "logits/chosen": 845615360.0, "logits/rejected": 904532160.0, "logps/chosen": -289.08657827287067, "logps/rejected": -321.46013931888547, "loss": 1.3094, "rewards/chosen": -1.9600795420938486, "rewards/margins": 3.3928427380435355, "rewards/rejected": -5.352922280137384, "step": 830 }, { "epoch": 0.8795811518324608, "grad_norm": 26.244890213012695, "kl": 0.0, "learning_rate": 2.2162351615526544e-08, "logits/chosen": 830761088.0, "logits/rejected": 874489984.0, "logps/chosen": -318.6343998015873, "logps/rejected": -321.2511057692308, "loss": 1.3163, "rewards/chosen": -2.282879929315476, "rewards/margins": 2.976137874170101, "rewards/rejected": -5.259017803485577, "step": 840 }, { "epoch": 0.8900523560209425, "grad_norm": 18.969242095947266, "kl": 0.0, "learning_rate": 1.8551935185811717e-08, "logits/chosen": 761686912.0, "logits/rejected": 807224448.0, "logps/chosen": -299.0131200396825, "logps/rejected": -326.48896634615386, "loss": 1.3118, "rewards/chosen": -2.2417308020213293, "rewards/margins": 3.3625240056709784, "rewards/rejected": -5.604254807692308, "step": 850 }, { "epoch": 0.900523560209424, "grad_norm": 13.49955940246582, "kl": 0.0, "learning_rate": 1.5251060028279612e-08, "logits/chosen": 844396160.0, "logits/rejected": 831924288.0, "logps/chosen": -285.6634949768161, "logps/rejected": -329.8748518957346, "loss": 1.3469, "rewards/chosen": -2.111665280561727, "rewards/margins": 3.1700718436237, "rewards/rejected": -5.281737124185427, "step": 860 }, { "epoch": 0.9109947643979057, "grad_norm": 20.407133102416992, "kl": 0.0, "learning_rate": 1.2264140770878839e-08, "logits/chosen": 813423744.0, "logits/rejected": 872637824.0, "logps/chosen": -313.7548781695721, "logps/rejected": -317.90100154083206, "loss": 1.3324, "rewards/chosen": -2.2133790997053286, "rewards/margins": 3.005966768938739, "rewards/rejected": -5.2193458686440675, "step": 870 }, { "epoch": 0.9214659685863874, "grad_norm": 21.028409957885742, "kl": 0.0, "learning_rate": 9.59517215336922e-09, "logits/chosen": 710001472.0, "logits/rejected": 798854528.0, "logps/chosen": -295.04663047730827, "logps/rejected": -312.5884847893916, "loss": 1.3053, "rewards/chosen": -2.2264215173855635, "rewards/margins": 3.319438070904218, "rewards/rejected": -5.545859588289781, "step": 880 }, { "epoch": 0.9319371727748691, "grad_norm": 35.28827667236328, "kl": 0.0, "learning_rate": 7.247723684711382e-09, "logits/chosen": 821099200.0, "logits/rejected": 816860864.0, "logps/chosen": -284.8080221036585, "logps/rejected": -317.58358373397436, "loss": 1.3087, "rewards/chosen": -1.8735839099418827, "rewards/margins": 3.3290528088081173, "rewards/rejected": -5.20263671875, "step": 890 }, { "epoch": 0.9424083769633508, "grad_norm": 20.1422176361084, "kl": 0.0, "learning_rate": 5.224934869164976e-09, "logits/chosen": 833211072.0, "logits/rejected": 892030464.0, "logps/chosen": -308.6479436790924, "logps/rejected": -323.03117929864254, "loss": 1.3294, "rewards/chosen": -2.350066396753444, "rewards/margins": 2.840480389869859, "rewards/rejected": -5.190546786623303, "step": 900 }, { "epoch": 0.9528795811518325, "grad_norm": 15.786822319030762, "kl": 0.0, "learning_rate": 3.529511007479946e-09, "logits/chosen": 850338304.0, "logits/rejected": 878503936.0, "logps/chosen": -307.224846390169, "logps/rejected": -304.68362480127183, "loss": 1.3362, "rewards/chosen": -2.205735917098694, "rewards/margins": 3.0281383811425617, "rewards/rejected": -5.233874298241256, "step": 910 }, { "epoch": 0.9633507853403142, "grad_norm": 23.114709854125977, "kl": 0.0, "learning_rate": 2.1637195787966857e-09, "logits/chosen": 768496128.0, "logits/rejected": 914081792.0, "logps/chosen": -307.8621338282504, "logps/rejected": -306.88308599695586, "loss": 1.2942, "rewards/chosen": -1.9231529174608748, "rewards/margins": 3.125122639640533, "rewards/rejected": -5.048275557101408, "step": 920 }, { "epoch": 0.9738219895287958, "grad_norm": 22.419374465942383, "kl": 0.0, "learning_rate": 1.1293872080934963e-09, "logits/chosen": 777531392.0, "logits/rejected": 896280960.0, "logps/chosen": -298.2679078733766, "logps/rejected": -321.1836643448795, "loss": 1.2511, "rewards/chosen": -2.0406327681107954, "rewards/margins": 3.466080363713376, "rewards/rejected": -5.5067131318241715, "step": 930 }, { "epoch": 0.9842931937172775, "grad_norm": 22.132793426513672, "kl": 0.0, "learning_rate": 4.2789722323760546e-10, "logits/chosen": 845816704.0, "logits/rejected": 823620416.0, "logps/chosen": -302.37687125748505, "logps/rejected": -313.2297794117647, "loss": 1.3619, "rewards/chosen": -1.8776080651197604, "rewards/margins": 3.33271708870479, "rewards/rejected": -5.2103251538245505, "step": 940 }, { "epoch": 0.9947643979057592, "grad_norm": 22.317419052124023, "kl": 0.0, "learning_rate": 6.018780490690822e-11, "logits/chosen": 842274304.0, "logits/rejected": 781750656.0, "logps/chosen": -298.93238636363634, "logps/rejected": -305.31222278225806, "loss": 1.3093, "rewards/chosen": -2.095315459280303, "rewards/margins": 3.475892485528165, "rewards/rejected": -5.571207944808468, "step": 950 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, "train_loss": 1.4430672781629712, "train_runtime": 11696.6719, "train_samples_per_second": 10.453, "train_steps_per_second": 0.082 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }