{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010471204188481676, "grad_norm": 4.605347633361816, "kl": 0.03192205727100372, "learning_rate": 0.0, "logits/chosen": -333891584.0, "logits/rejected": -265467136.0, "logps/chosen": -199.38503196022728, "logps/rejected": -248.56980846774192, "loss": 2.0, "rewards/chosen": -0.0006534994551629732, "rewards/margins": 0.0002265003958638695, "rewards/rejected": -0.0008799998510268427, "step": 1 }, { "epoch": 0.010471204188481676, "grad_norm": 4.578050136566162, "kl": 0.056504733860492706, "learning_rate": 4.6875e-08, "logits/chosen": -294903168.0, "logits/rejected": -293959072.0, "logps/chosen": -280.9227300995025, "logps/rejected": -255.27308173952642, "loss": 2.0001, "rewards/chosen": 1.1030201542239086e-05, "rewards/margins": -7.682456811541358e-05, "rewards/rejected": 8.785476965765266e-05, "step": 10 }, { "epoch": 0.020942408376963352, "grad_norm": 4.967247009277344, "kl": 0.05700124055147171, "learning_rate": 9.895833333333332e-08, "logits/chosen": -323860896.0, "logits/rejected": -310657184.0, "logps/chosen": -279.0876918038922, "logps/rejected": -261.8295292075163, "loss": 1.9999, "rewards/chosen": 0.0003772152219703811, "rewards/margins": 0.00016149118441195642, "rewards/rejected": 0.0002157240375584247, "step": 20 }, { "epoch": 0.031413612565445025, "grad_norm": 4.6223273277282715, "kl": 0.08535922318696976, "learning_rate": 1.5104166666666664e-07, "logits/chosen": -308568672.0, "logits/rejected": -295664416.0, "logps/chosen": -294.7120636261261, "logps/rejected": -242.7735901872964, "loss": 2.0, "rewards/chosen": 0.0009878192756985042, "rewards/margins": 9.039931823623067e-05, "rewards/rejected": 0.0008974199574622735, "step": 30 }, { "epoch": 0.041884816753926704, "grad_norm": 5.402420520782471, "kl": 0.20308740437030792, "learning_rate": 2.03125e-07, "logits/chosen": -302993152.0, "logits/rejected": -312362304.0, "logps/chosen": -306.06473214285717, "logps/rejected": -278.4209029937792, "loss": 1.9996, "rewards/chosen": 0.003333506067656273, "rewards/margins": 0.0008349405391781992, "rewards/rejected": 0.0024985655284780737, "step": 40 }, { "epoch": 0.05235602094240838, "grad_norm": 5.194827556610107, "kl": 0.34073737263679504, "learning_rate": 2.552083333333333e-07, "logits/chosen": -299651232.0, "logits/rejected": -298563392.0, "logps/chosen": -311.1816826923077, "logps/rejected": -268.8357142857143, "loss": 1.9988, "rewards/chosen": 0.007691890276395358, "rewards/margins": 0.0022930555786114188, "rewards/rejected": 0.005398834697783939, "step": 50 }, { "epoch": 0.06282722513089005, "grad_norm": 5.1792426109313965, "kl": 0.5095587372779846, "learning_rate": 3.0729166666666665e-07, "logits/chosen": -302699456.0, "logits/rejected": -313209856.0, "logps/chosen": -299.3851291403785, "logps/rejected": -272.6368034055728, "loss": 1.9976, "rewards/chosen": 0.012803548518039451, "rewards/margins": 0.004993875586639943, "rewards/rejected": 0.007809672931399508, "step": 60 }, { "epoch": 0.07329842931937172, "grad_norm": 5.62812614440918, "kl": 0.324247807264328, "learning_rate": 3.59375e-07, "logits/chosen": -282744192.0, "logits/rejected": -310504768.0, "logps/chosen": -281.65147709003213, "logps/rejected": -274.1980433130699, "loss": 1.9967, "rewards/chosen": 0.017537592308314285, "rewards/margins": 0.007583660111415985, "rewards/rejected": 0.0099539321968983, "step": 70 }, { "epoch": 0.08376963350785341, "grad_norm": 5.519657135009766, "kl": 0.19997477531433105, "learning_rate": 4.114583333333333e-07, "logits/chosen": -320332992.0, "logits/rejected": -294599264.0, "logps/chosen": -309.67646918070443, "logps/rejected": -254.8375697767145, "loss": 1.9917, "rewards/chosen": 0.025304861492256293, "rewards/margins": 0.01606653321257549, "rewards/rejected": 0.009238328279680803, "step": 80 }, { "epoch": 0.09424083769633508, "grad_norm": 4.8804121017456055, "kl": 0.039679840207099915, "learning_rate": 4.6354166666666664e-07, "logits/chosen": -292024928.0, "logits/rejected": -305713184.0, "logps/chosen": -255.28255413385827, "logps/rejected": -255.59004360465116, "loss": 1.9908, "rewards/chosen": 0.0300817714901421, "rewards/margins": 0.018853704668711096, "rewards/rejected": 0.011228066821431005, "step": 90 }, { "epoch": 0.10471204188481675, "grad_norm": 5.486176490783691, "kl": 0.0, "learning_rate": 4.999849525959245e-07, "logits/chosen": -303109792.0, "logits/rejected": -348087872.0, "logps/chosen": -298.6987417491749, "logps/rejected": -256.3508902077151, "loss": 1.9828, "rewards/chosen": 0.040776573785460825, "rewards/margins": 0.03677532848489619, "rewards/rejected": 0.004001245300564639, "step": 100 }, { "epoch": 0.11518324607329843, "grad_norm": 5.567543029785156, "kl": 0.0, "learning_rate": 4.997174935782199e-07, "logits/chosen": -289720032.0, "logits/rejected": -312817568.0, "logps/chosen": -288.18920101088645, "logps/rejected": -248.39857240973313, "loss": 1.9794, "rewards/chosen": 0.026099390412563483, "rewards/margins": 0.04113652131036093, "rewards/rejected": -0.015037130897797445, "step": 110 }, { "epoch": 0.1256544502617801, "grad_norm": 5.6301703453063965, "kl": 0.0, "learning_rate": 4.9911605954668e-07, "logits/chosen": -322413632.0, "logits/rejected": -313895360.0, "logps/chosen": -272.9092261904762, "logps/rejected": -290.2653765898251, "loss": 1.971, "rewards/chosen": 0.01783415798767371, "rewards/margins": 0.058556688091881776, "rewards/rejected": -0.040722530104208066, "step": 120 }, { "epoch": 0.13612565445026178, "grad_norm": 5.450737953186035, "kl": 0.0, "learning_rate": 4.981814548660135e-07, "logits/chosen": -298956864.0, "logits/rejected": -361433408.0, "logps/chosen": -287.07413453565505, "logps/rejected": -262.082371676514, "loss": 1.9574, "rewards/chosen": 0.0099076030661613, "rewards/margins": 0.08200543773355305, "rewards/rejected": -0.07209783466739175, "step": 130 }, { "epoch": 0.14659685863874344, "grad_norm": 5.516458511352539, "kl": 0.0, "learning_rate": 4.969149294871417e-07, "logits/chosen": -338851456.0, "logits/rejected": -332391360.0, "logps/chosen": -274.5335463258786, "logps/rejected": -291.4233084862385, "loss": 1.9518, "rewards/chosen": -0.05588988526560628, "rewards/margins": 0.09285041070450553, "rewards/rejected": -0.14874029597011182, "step": 140 }, { "epoch": 0.15706806282722513, "grad_norm": 7.548930644989014, "kl": 0.0, "learning_rate": 4.953181772754997e-07, "logits/chosen": -356664576.0, "logits/rejected": -329555744.0, "logps/chosen": -280.82564408396945, "logps/rejected": -277.920425, "loss": 1.9325, "rewards/chosen": -0.08151665025084984, "rewards/margins": 0.14491142592102516, "rewards/rejected": -0.226428076171875, "step": 150 }, { "epoch": 0.16753926701570682, "grad_norm": 8.331445693969727, "kl": 0.0, "learning_rate": 4.93393333745642e-07, "logits/chosen": -344808288.0, "logits/rejected": -352486720.0, "logps/chosen": -282.0112621753247, "logps/rejected": -285.5078125, "loss": 1.9096, "rewards/chosen": -0.13890903027026685, "rewards/margins": 0.1691989662911742, "rewards/rejected": -0.30810799656144106, "step": 160 }, { "epoch": 0.17801047120418848, "grad_norm": 27.522336959838867, "kl": 0.0, "learning_rate": 4.9114297320518e-07, "logits/chosen": -395774560.0, "logits/rejected": -387744128.0, "logps/chosen": -317.2337382445141, "logps/rejected": -320.66834598909657, "loss": 1.9024, "rewards/chosen": -0.29974554175493484, "rewards/margins": 0.20640403544826313, "rewards/rejected": -0.506149577203198, "step": 170 }, { "epoch": 0.18848167539267016, "grad_norm": 14.592561721801758, "kl": 0.0, "learning_rate": 4.885701053118751e-07, "logits/chosen": -390902016.0, "logits/rejected": -382734656.0, "logps/chosen": -309.92156105100463, "logps/rejected": -319.86998913902056, "loss": 1.8962, "rewards/chosen": -0.27517016261954214, "rewards/margins": 0.23024947963151926, "rewards/rejected": -0.5054196422510614, "step": 180 }, { "epoch": 0.19895287958115182, "grad_norm": 16.364885330200195, "kl": 0.0, "learning_rate": 4.856781710484872e-07, "logits/chosen": -377519712.0, "logits/rejected": -384991200.0, "logps/chosen": -317.04276315789474, "logps/rejected": -343.468415007657, "loss": 1.8553, "rewards/chosen": -0.35698912892805523, "rewards/margins": 0.303288443633679, "rewards/rejected": -0.6602775725617342, "step": 190 }, { "epoch": 0.2094240837696335, "grad_norm": 13.272473335266113, "kl": 0.0, "learning_rate": 4.824710381207655e-07, "logits/chosen": -397011136.0, "logits/rejected": -412777728.0, "logps/chosen": -346.1264821141479, "logps/rejected": -359.5524316109422, "loss": 1.8447, "rewards/chosen": -0.5472822771961666, "rewards/margins": 0.3435558820577278, "rewards/rejected": -0.8908381592538944, "step": 200 }, { "epoch": 0.2094240837696335, "eval_kl": 0.0, "eval_logits/chosen": -401673280.0, "eval_logits/rejected": -397073248.0, "eval_logps/chosen": -350.8658125, "eval_logps/rejected": -366.78053125, "eval_loss": 0.464598149061203, "eval_rewards/chosen": -0.6301004028320313, "eval_rewards/margins": 0.3681937866210937, "eval_rewards/rejected": -0.998294189453125, "eval_runtime": 173.013, "eval_samples_per_second": 23.12, "eval_steps_per_second": 0.722, "step": 200 }, { "epoch": 0.2198952879581152, "grad_norm": 12.835283279418945, "kl": 0.0, "learning_rate": 4.789529957847353e-07, "logits/chosen": -377839680.0, "logits/rejected": -360811104.0, "logps/chosen": -342.6239265267176, "logps/rejected": -337.57145, "loss": 1.8619, "rewards/chosen": -0.4235861275942271, "rewards/margins": 0.35626909701514786, "rewards/rejected": -0.779855224609375, "step": 210 }, { "epoch": 0.23036649214659685, "grad_norm": 11.761977195739746, "kl": 0.0, "learning_rate": 4.751287491101977e-07, "logits/chosen": -363950592.0, "logits/rejected": -346935360.0, "logps/chosen": -327.6511063664596, "logps/rejected": -331.7046481918239, "loss": 1.8678, "rewards/chosen": -0.45187747552528146, "rewards/margins": 0.33270157958908075, "rewards/rejected": -0.7845790551143622, "step": 220 }, { "epoch": 0.24083769633507854, "grad_norm": 14.949240684509277, "kl": 0.0, "learning_rate": 4.710034126881159e-07, "logits/chosen": -387794592.0, "logits/rejected": -332998048.0, "logps/chosen": -345.15252001455605, "logps/rejected": -363.9196089797639, "loss": 1.8654, "rewards/chosen": -0.42814500744859896, "rewards/margins": 0.43583469882511516, "rewards/rejected": -0.8639797062737141, "step": 230 }, { "epoch": 0.2513089005235602, "grad_norm": 28.15343475341797, "kl": 0.0, "learning_rate": 4.665825037903035e-07, "logits/chosen": -384477856.0, "logits/rejected": -373712864.0, "logps/chosen": -335.26793624807397, "logps/rejected": -360.7674574088748, "loss": 1.8256, "rewards/chosen": -0.5404140196155143, "rewards/margins": 0.46967980234986806, "rewards/rejected": -1.0100938219653823, "step": 240 }, { "epoch": 0.2617801047120419, "grad_norm": 13.149576187133789, "kl": 0.0, "learning_rate": 4.618719349905619e-07, "logits/chosen": -401351104.0, "logits/rejected": -374455584.0, "logps/chosen": -363.6633110687023, "logps/rejected": -375.2053, "loss": 1.8287, "rewards/chosen": -0.6616745810472329, "rewards/margins": 0.5001745400465172, "rewards/rejected": -1.16184912109375, "step": 250 }, { "epoch": 0.27225130890052357, "grad_norm": 16.712739944458008, "kl": 0.0, "learning_rate": 4.568780062571374e-07, "logits/chosen": -386476864.0, "logits/rejected": -400174592.0, "logps/chosen": -339.71411758814105, "logps/rejected": -382.5932736280488, "loss": 1.7953, "rewards/chosen": -0.6005113063714443, "rewards/margins": 0.5215838481814806, "rewards/rejected": -1.122095154552925, "step": 260 }, { "epoch": 0.28272251308900526, "grad_norm": 22.0365047454834, "kl": 0.0, "learning_rate": 4.516073965270717e-07, "logits/chosen": -384655008.0, "logits/rejected": -365928352.0, "logps/chosen": -338.5719385758998, "logps/rejected": -392.81059867394697, "loss": 1.8065, "rewards/chosen": -0.6256347083149941, "rewards/margins": 0.5414030889195515, "rewards/rejected": -1.1670377972345456, "step": 270 }, { "epoch": 0.2931937172774869, "grad_norm": 43.19585037231445, "kl": 0.0, "learning_rate": 4.460671547737158e-07, "logits/chosen": -361381056.0, "logits/rejected": -369068928.0, "logps/chosen": -408.50211012861735, "logps/rejected": -410.9393047112462, "loss": 1.7984, "rewards/chosen": -1.0311282240692825, "rewards/margins": 0.5366287766074842, "rewards/rejected": -1.5677570006767667, "step": 280 }, { "epoch": 0.3036649214659686, "grad_norm": 16.71083641052246, "kl": 0.0, "learning_rate": 4.40264690579353e-07, "logits/chosen": -398537024.0, "logits/rejected": -368430528.0, "logps/chosen": -387.8846227134146, "logps/rejected": -399.3923527644231, "loss": 1.8419, "rewards/chosen": -0.9159838048423209, "rewards/margins": 0.5333353988523405, "rewards/rejected": -1.4493192036946614, "step": 290 }, { "epoch": 0.31413612565445026, "grad_norm": 13.06988525390625, "kl": 0.0, "learning_rate": 4.3420776422553916e-07, "logits/chosen": -379714016.0, "logits/rejected": -362430816.0, "logps/chosen": -351.87913321865443, "logps/rejected": -380.3795926517572, "loss": 1.8073, "rewards/chosen": -0.6257981233275994, "rewards/margins": 0.5890270041247269, "rewards/rejected": -1.2148251274523263, "step": 300 }, { "epoch": 0.32460732984293195, "grad_norm": 23.772066116333008, "kl": 0.0, "learning_rate": 4.279044763144141e-07, "logits/chosen": -356793760.0, "logits/rejected": -387742400.0, "logps/chosen": -313.406973841853, "logps/rejected": -383.028311353211, "loss": 1.7819, "rewards/chosen": -0.4327624141217801, "rewards/margins": 0.5333329298850529, "rewards/rejected": -0.966095344006833, "step": 310 }, { "epoch": 0.33507853403141363, "grad_norm": 16.960844039916992, "kl": 0.0, "learning_rate": 4.213632569348639e-07, "logits/chosen": -431567776.0, "logits/rejected": -367267008.0, "logps/chosen": -342.6104959736457, "logps/rejected": -379.4291771356784, "loss": 1.8291, "rewards/chosen": -0.5189258134207998, "rewards/margins": 0.6056777136546817, "rewards/rejected": -1.1246035270754815, "step": 320 }, { "epoch": 0.34554973821989526, "grad_norm": 38.159210205078125, "kl": 0.0, "learning_rate": 4.145928543880249e-07, "logits/chosen": -397418144.0, "logits/rejected": -396005696.0, "logps/chosen": -347.52253653238546, "logps/rejected": -389.09370170015455, "loss": 1.7527, "rewards/chosen": -0.5500312793123026, "rewards/margins": 0.6948817658722029, "rewards/rejected": -1.2449130451845054, "step": 330 }, { "epoch": 0.35602094240837695, "grad_norm": 17.87345314025879, "kl": 0.0, "learning_rate": 4.076023234872057e-07, "logits/chosen": -360866112.0, "logits/rejected": -396226624.0, "logps/chosen": -372.8658622778675, "logps/rejected": -422.6806448562784, "loss": 1.7247, "rewards/chosen": -0.8265112659657714, "rewards/margins": 0.7862990622537822, "rewards/rejected": -1.6128103282195536, "step": 340 }, { "epoch": 0.36649214659685864, "grad_norm": 32.15557861328125, "kl": 0.0, "learning_rate": 4.004010134478771e-07, "logits/chosen": -402362112.0, "logits/rejected": -383912448.0, "logps/chosen": -347.5367717978395, "logps/rejected": -395.23214992088606, "loss": 1.7853, "rewards/chosen": -0.6819350160198447, "rewards/margins": 0.6560503486768028, "rewards/rejected": -1.3379853646966475, "step": 350 }, { "epoch": 0.3769633507853403, "grad_norm": 17.032840728759766, "kl": 0.0, "learning_rate": 3.9299855538392534e-07, "logits/chosen": -373061568.0, "logits/rejected": -376975744.0, "logps/chosen": -340.9822198275862, "logps/rejected": -385.4313181464174, "loss": 1.7507, "rewards/chosen": -0.4902129457288401, "rewards/margins": 0.722005627060052, "rewards/rejected": -1.2122185727888921, "step": 360 }, { "epoch": 0.387434554973822, "grad_norm": 22.606733322143555, "kl": 0.0, "learning_rate": 3.8540484942689075e-07, "logits/chosen": -371107936.0, "logits/rejected": -383718688.0, "logps/chosen": -353.1767515923567, "logps/rejected": -418.5740030674847, "loss": 1.7464, "rewards/chosen": -0.7054660578442228, "rewards/margins": 0.7065043030345636, "rewards/rejected": -1.4119703608787864, "step": 370 }, { "epoch": 0.39790575916230364, "grad_norm": 33.263973236083984, "kl": 0.0, "learning_rate": 3.77630051485419e-07, "logits/chosen": -406904672.0, "logits/rejected": -344286496.0, "logps/chosen": -403.9385601032448, "logps/rejected": -432.968853820598, "loss": 1.8567, "rewards/chosen": -1.0629785588357301, "rewards/margins": 0.6323324203547287, "rewards/rejected": -1.6953109791904588, "step": 380 }, { "epoch": 0.4083769633507853, "grad_norm": 18.568082809448242, "kl": 0.0, "learning_rate": 3.696845596626342e-07, "logits/chosen": -359421728.0, "logits/rejected": -367630624.0, "logps/chosen": -348.77261904761906, "logps/rejected": -418.41769230769233, "loss": 1.7356, "rewards/chosen": -0.7873753138950893, "rewards/margins": 0.7420602517599587, "rewards/rejected": -1.529435565655048, "step": 390 }, { "epoch": 0.418848167539267, "grad_norm": 23.56498146057129, "kl": 0.0, "learning_rate": 3.61579000349597e-07, "logits/chosen": -379061824.0, "logits/rejected": -363287360.0, "logps/chosen": -362.0563360091743, "logps/rejected": -416.0301767172524, "loss": 1.7296, "rewards/chosen": -0.6648301990754014, "rewards/margins": 0.84632178197367, "rewards/rejected": -1.5111519810490714, "step": 400 }, { "epoch": 0.418848167539267, "eval_kl": 0.0, "eval_logits/chosen": -377831392.0, "eval_logits/rejected": -377408832.0, "eval_logps/chosen": -356.89978125, "eval_logps/rejected": -406.783625, "eval_loss": 0.44080978631973267, "eval_rewards/chosen": -0.690440185546875, "eval_rewards/margins": 0.7078846435546874, "eval_rewards/rejected": -1.3983248291015624, "eval_runtime": 172.5358, "eval_samples_per_second": 23.184, "eval_steps_per_second": 0.724, "step": 400 }, { "epoch": 0.4293193717277487, "grad_norm": 25.634418487548828, "kl": 0.0, "learning_rate": 3.5332421401344837e-07, "logits/chosen": -325346176.0, "logits/rejected": -386428736.0, "logps/chosen": -357.732086489899, "logps/rejected": -401.4339467930029, "loss": 1.6995, "rewards/chosen": -0.6876018793895992, "rewards/margins": 0.7492819858333873, "rewards/rejected": -1.4368838652229865, "step": 410 }, { "epoch": 0.4397905759162304, "grad_norm": 34.874794006347656, "kl": 0.0, "learning_rate": 3.4493124069924635e-07, "logits/chosen": -378771648.0, "logits/rejected": -384461664.0, "logps/chosen": -364.9864, "logps/rejected": -393.6061545801527, "loss": 1.7407, "rewards/chosen": -0.69082021484375, "rewards/margins": 0.7534859971374045, "rewards/rejected": -1.4443062119811545, "step": 420 }, { "epoch": 0.450261780104712, "grad_norm": 35.7071533203125, "kl": 0.0, "learning_rate": 3.3641130526488335e-07, "logits/chosen": -346615360.0, "logits/rejected": -370164512.0, "logps/chosen": -328.9187352825746, "logps/rejected": -424.24766718507, "loss": 1.7463, "rewards/chosen": -0.6537484209428964, "rewards/margins": 0.7605128467354668, "rewards/rejected": -1.4142612676783632, "step": 430 }, { "epoch": 0.4607329842931937, "grad_norm": 34.8936653137207, "kl": 0.0, "learning_rate": 3.2777580236883473e-07, "logits/chosen": -361869248.0, "logits/rejected": -375545024.0, "logps/chosen": -328.751697284345, "logps/rejected": -397.5517010703364, "loss": 1.7409, "rewards/chosen": -0.610927008973143, "rewards/margins": 0.7168888955566736, "rewards/rejected": -1.3278159045298166, "step": 440 }, { "epoch": 0.4712041884816754, "grad_norm": 26.618633270263672, "kl": 0.0, "learning_rate": 3.1903628123081196e-07, "logits/chosen": -384088768.0, "logits/rejected": -362557504.0, "logps/chosen": -352.25322690217394, "logps/rejected": -407.9415290880503, "loss": 1.7293, "rewards/chosen": -0.6755538845654601, "rewards/margins": 0.846232595264532, "rewards/rejected": -1.521786479829992, "step": 450 }, { "epoch": 0.4816753926701571, "grad_norm": 19.443235397338867, "kl": 0.0, "learning_rate": 3.1020443018570556e-07, "logits/chosen": -358506400.0, "logits/rejected": -400381632.0, "logps/chosen": -348.9179941152597, "logps/rejected": -395.03962725903614, "loss": 1.7259, "rewards/chosen": -0.6845747216955408, "rewards/margins": 0.7222973956802122, "rewards/rejected": -1.406872117375753, "step": 460 }, { "epoch": 0.49214659685863876, "grad_norm": 36.631107330322266, "kl": 0.0, "learning_rate": 3.0129206105147343e-07, "logits/chosen": -353789856.0, "logits/rejected": -394140160.0, "logps/chosen": -369.1834216965742, "logps/rejected": -395.943871814093, "loss": 1.7266, "rewards/chosen": -0.7301152837607056, "rewards/margins": 0.7226990500391913, "rewards/rejected": -1.4528143337998969, "step": 470 }, { "epoch": 0.5026178010471204, "grad_norm": 21.622982025146484, "kl": 0.0, "learning_rate": 2.923110933318805e-07, "logits/chosen": -380953024.0, "logits/rejected": -351669664.0, "logps/chosen": -346.1928404850746, "logps/rejected": -385.3263575819672, "loss": 1.7825, "rewards/chosen": -0.6672405185984142, "rewards/margins": 0.7526233229743317, "rewards/rejected": -1.4198638415727458, "step": 480 }, { "epoch": 0.5130890052356021, "grad_norm": 27.391277313232422, "kl": 0.0, "learning_rate": 2.832735382752194e-07, "logits/chosen": -384934912.0, "logits/rejected": -371643968.0, "logps/chosen": -372.62961810872895, "logps/rejected": -431.1251993620415, "loss": 1.7894, "rewards/chosen": -0.93115365231384, "rewards/margins": 0.7237101584915826, "rewards/rejected": -1.6548638108054226, "step": 490 }, { "epoch": 0.5235602094240838, "grad_norm": 30.544750213623047, "kl": 0.0, "learning_rate": 2.741914828103307e-07, "logits/chosen": -364308672.0, "logits/rejected": -375439488.0, "logps/chosen": -370.71887264521195, "logps/rejected": -424.57008164852255, "loss": 1.7381, "rewards/chosen": -0.924701876318436, "rewards/margins": 0.7957396423197833, "rewards/rejected": -1.7204415186382194, "step": 500 }, { "epoch": 0.5340314136125655, "grad_norm": 24.350994110107422, "kl": 0.0, "learning_rate": 2.650770733814065e-07, "logits/chosen": -367684672.0, "logits/rejected": -364714048.0, "logps/chosen": -355.05851275917064, "logps/rejected": -403.8284360643185, "loss": 1.7188, "rewards/chosen": -0.6844002512273226, "rewards/margins": 0.8209749551518888, "rewards/rejected": -1.5053752063792114, "step": 510 }, { "epoch": 0.5445026178010471, "grad_norm": 28.53436279296875, "kl": 0.0, "learning_rate": 2.55942499703198e-07, "logits/chosen": -379056736.0, "logits/rejected": -379016544.0, "logps/chosen": -345.9064, "logps/rejected": -384.8177719465649, "loss": 1.7248, "rewards/chosen": -0.563288232421875, "rewards/margins": 0.7492281678643845, "rewards/rejected": -1.3125164002862595, "step": 520 }, { "epoch": 0.5549738219895288, "grad_norm": 12.310104370117188, "kl": 0.0, "learning_rate": 2.467999784583527e-07, "logits/chosen": -348551552.0, "logits/rejected": -371971776.0, "logps/chosen": -327.1228284744409, "logps/rejected": -392.4502102446483, "loss": 1.7112, "rewards/chosen": -0.5498965327351238, "rewards/margins": 0.8324248862370666, "rewards/rejected": -1.3823214189721904, "step": 530 }, { "epoch": 0.5654450261780105, "grad_norm": 26.28302574157715, "kl": 0.0, "learning_rate": 2.3766173695868388e-07, "logits/chosen": -378826880.0, "logits/rejected": -363562816.0, "logps/chosen": -364.27648832312406, "logps/rejected": -418.0172448165869, "loss": 1.7646, "rewards/chosen": -0.7452207042466261, "rewards/margins": 0.772901577888043, "rewards/rejected": -1.5181222821346692, "step": 540 }, { "epoch": 0.5759162303664922, "grad_norm": 17.37626075744629, "kl": 0.0, "learning_rate": 2.285399967922253e-07, "logits/chosen": -378287168.0, "logits/rejected": -397669600.0, "logps/chosen": -360.6157647763578, "logps/rejected": -439.8442756116208, "loss": 1.6957, "rewards/chosen": -0.9246350370656949, "rewards/margins": 0.944558783712688, "rewards/rejected": -1.869193820778383, "step": 550 }, { "epoch": 0.5863874345549738, "grad_norm": 24.048419952392578, "kl": 0.0, "learning_rate": 2.194469574779397e-07, "logits/chosen": -419567904.0, "logits/rejected": -379702528.0, "logps/chosen": -370.91926688163886, "logps/rejected": -425.7548309178744, "loss": 1.7624, "rewards/chosen": -0.8229443285280729, "rewards/margins": 0.8691681994635736, "rewards/rejected": -1.6921125279916465, "step": 560 }, { "epoch": 0.5968586387434555, "grad_norm": 14.945625305175781, "kl": 0.0, "learning_rate": 2.1039478014994441e-07, "logits/chosen": -369516832.0, "logits/rejected": -357956992.0, "logps/chosen": -322.2456745723173, "logps/rejected": -398.4457908163265, "loss": 1.7312, "rewards/chosen": -0.5156455507174621, "rewards/margins": 0.8550450031483149, "rewards/rejected": -1.370690553865777, "step": 570 }, { "epoch": 0.6073298429319371, "grad_norm": 13.910249710083008, "kl": 0.0, "learning_rate": 2.0139557129307149e-07, "logits/chosen": -369174880.0, "logits/rejected": -375438400.0, "logps/chosen": -355.16543093152865, "logps/rejected": -419.0519555214724, "loss": 1.7166, "rewards/chosen": -0.5668097817973726, "rewards/margins": 0.8209082784679168, "rewards/rejected": -1.3877180602652894, "step": 580 }, { "epoch": 0.6178010471204188, "grad_norm": 31.937427520751953, "kl": 0.0, "learning_rate": 1.9246136655151808e-07, "logits/chosen": -388834208.0, "logits/rejected": -366008416.0, "logps/chosen": -362.90542635658915, "logps/rejected": -438.9683070866142, "loss": 1.7186, "rewards/chosen": -0.7052005738250969, "rewards/margins": 0.9188863017900605, "rewards/rejected": -1.6240868756151574, "step": 590 }, { "epoch": 0.6282722513089005, "grad_norm": 51.06322479248047, "kl": 0.0, "learning_rate": 1.8360411463223873e-07, "logits/chosen": -373022624.0, "logits/rejected": -388438080.0, "logps/chosen": -361.36163553259144, "logps/rejected": -437.6943644393241, "loss": 1.685, "rewards/chosen": -0.7795385412267737, "rewards/margins": 0.9562707380671012, "rewards/rejected": -1.7358092792938749, "step": 600 }, { "epoch": 0.6282722513089005, "eval_kl": 0.0, "eval_logits/chosen": -388254240.0, "eval_logits/rejected": -387494368.0, "eval_logps/chosen": -383.711, "eval_logps/rejected": -454.13559375, "eval_loss": 0.43252766132354736, "eval_rewards/chosen": -0.9585521240234375, "eval_rewards/margins": 0.9132923583984375, "eval_rewards/rejected": -1.871844482421875, "eval_runtime": 172.7536, "eval_samples_per_second": 23.154, "eval_steps_per_second": 0.724, "step": 600 }, { "epoch": 0.6387434554973822, "grad_norm": 17.530250549316406, "kl": 0.0, "learning_rate": 1.7483566132460865e-07, "logits/chosen": -372182848.0, "logits/rejected": -386128032.0, "logps/chosen": -404.1522943037975, "logps/rejected": -436.9659047067901, "loss": 1.7515, "rewards/chosen": -1.0911100363429589, "rewards/margins": 0.7743034881434416, "rewards/rejected": -1.8654135244864005, "step": 610 }, { "epoch": 0.6492146596858639, "grad_norm": 71.81634521484375, "kl": 0.0, "learning_rate": 1.66167733657731e-07, "logits/chosen": -379878784.0, "logits/rejected": -385352000.0, "logps/chosen": -417.7991178229665, "logps/rejected": -460.5967649310873, "loss": 1.7452, "rewards/chosen": -1.1937847715435606, "rewards/margins": 0.7742432750921593, "rewards/rejected": -1.9680280466357198, "step": 620 }, { "epoch": 0.6596858638743456, "grad_norm": 35.158390045166016, "kl": 0.0, "learning_rate": 1.5761192421657456e-07, "logits/chosen": -363958816.0, "logits/rejected": -387630624.0, "logps/chosen": -395.60520666932905, "logps/rejected": -463.6549120795107, "loss": 1.693, "rewards/chosen": -1.0146022223816893, "rewards/margins": 0.9162212408016057, "rewards/rejected": -1.930823463183295, "step": 630 }, { "epoch": 0.6701570680628273, "grad_norm": 73.86211395263672, "kl": 0.0, "learning_rate": 1.491796756379185e-07, "logits/chosen": -397256448.0, "logits/rejected": -357917472.0, "logps/chosen": -384.6437266791045, "logps/rejected": -425.7797643442623, "loss": 1.7584, "rewards/chosen": -0.7558601493266092, "rewards/margins": 0.921012729451567, "rewards/rejected": -1.6768728787781761, "step": 640 }, { "epoch": 0.680628272251309, "grad_norm": 19.859880447387695, "kl": 0.0, "learning_rate": 1.4088226530684071e-07, "logits/chosen": -384827904.0, "logits/rejected": -371061408.0, "logps/chosen": -354.42405913978496, "logps/rejected": -410.13796701112875, "loss": 1.7188, "rewards/chosen": -0.5760806504116264, "rewards/margins": 0.9016887737114857, "rewards/rejected": -1.477769424123112, "step": 650 }, { "epoch": 0.6910994764397905, "grad_norm": 36.21221923828125, "kl": 0.0, "learning_rate": 1.327307902742142e-07, "logits/chosen": -416035360.0, "logits/rejected": -385663392.0, "logps/chosen": -344.91139423076925, "logps/rejected": -437.5813492063492, "loss": 1.686, "rewards/chosen": -0.6315081317608173, "rewards/margins": 1.0616693003448374, "rewards/rejected": -1.6931774321056547, "step": 660 }, { "epoch": 0.7015706806282722, "grad_norm": 44.68547821044922, "kl": 0.0, "learning_rate": 1.2473615241538523e-07, "logits/chosen": -376409280.0, "logits/rejected": -334625152.0, "logps/chosen": -340.43985190014905, "logps/rejected": -424.7399938423645, "loss": 1.776, "rewards/chosen": -0.6765481917584528, "rewards/margins": 0.8097976725234128, "rewards/rejected": -1.4863458642818657, "step": 670 }, { "epoch": 0.7120418848167539, "grad_norm": 29.446016311645508, "kl": 0.0, "learning_rate": 1.169090438498816e-07, "logits/chosen": -381945856.0, "logits/rejected": -390224416.0, "logps/chosen": -359.84859154929575, "logps/rejected": -424.1903276131045, "loss": 1.6951, "rewards/chosen": -0.6581172555078736, "rewards/margins": 0.9331115450113348, "rewards/rejected": -1.5912288005192083, "step": 680 }, { "epoch": 0.7225130890052356, "grad_norm": 30.748411178588867, "kl": 0.0, "learning_rate": 1.0925993264165045e-07, "logits/chosen": -370050240.0, "logits/rejected": -376552000.0, "logps/chosen": -363.6959115415335, "logps/rejected": -440.83008409785936, "loss": 1.6934, "rewards/chosen": -0.7725032526083266, "rewards/margins": 0.9274107755477323, "rewards/rejected": -1.699914028156059, "step": 690 }, { "epoch": 0.7329842931937173, "grad_norm": 29.660114288330078, "kl": 0.0, "learning_rate": 1.0179904879894998e-07, "logits/chosen": -384338432.0, "logits/rejected": -357984064.0, "logps/chosen": -360.97984423981194, "logps/rejected": -440.06215926791276, "loss": 1.7067, "rewards/chosen": -0.7834205747024393, "rewards/margins": 0.9409068216784992, "rewards/rejected": -1.7243273963809385, "step": 700 }, { "epoch": 0.743455497382199, "grad_norm": 42.167049407958984, "kl": 0.0, "learning_rate": 9.453637059262117e-08, "logits/chosen": -355324928.0, "logits/rejected": -358938272.0, "logps/chosen": -350.62712309160304, "logps/rejected": -409.406275, "loss": 1.7458, "rewards/chosen": -0.7520553472387882, "rewards/margins": 0.8503729730737118, "rewards/rejected": -1.6024283203125, "step": 710 }, { "epoch": 0.7539267015706806, "grad_norm": 76.56432342529297, "kl": 0.0, "learning_rate": 8.748161121103406e-08, "logits/chosen": -380407232.0, "logits/rejected": -380544576.0, "logps/chosen": -358.77977362204723, "logps/rejected": -443.77843992248063, "loss": 1.683, "rewards/chosen": -0.6690234086644931, "rewards/margins": 0.979472193019131, "rewards/rejected": -1.648495601683624, "step": 720 }, { "epoch": 0.7643979057591623, "grad_norm": 20.125774383544922, "kl": 0.0, "learning_rate": 8.064420576955965e-08, "logits/chosen": -379323904.0, "logits/rejected": -390768192.0, "logps/chosen": -373.331298828125, "logps/rejected": -438.97998046875, "loss": 1.7388, "rewards/chosen": -0.8371871948242188, "rewards/margins": 0.8446130752563475, "rewards/rejected": -1.6818002700805663, "step": 730 }, { "epoch": 0.774869109947644, "grad_norm": 66.10313415527344, "kl": 0.0, "learning_rate": 7.403329869193922e-08, "logits/chosen": -371630016.0, "logits/rejected": -358382208.0, "logps/chosen": -354.93711597542244, "logps/rejected": -428.1827305246423, "loss": 1.6854, "rewards/chosen": -0.7562084285894297, "rewards/margins": 1.0796500847377761, "rewards/rejected": -1.8358585133272058, "step": 740 }, { "epoch": 0.7853403141361257, "grad_norm": 57.931419372558594, "kl": 0.0, "learning_rate": 6.765773148042858e-08, "logits/chosen": -389279648.0, "logits/rejected": -367319104.0, "logps/chosen": -360.12411194316434, "logps/rejected": -412.91921701112875, "loss": 1.7553, "rewards/chosen": -0.7227982434625816, "rewards/margins": 0.8275232536434399, "rewards/rejected": -1.5503214971060215, "step": 750 }, { "epoch": 0.7958115183246073, "grad_norm": 35.246177673339844, "kl": 0.0, "learning_rate": 6.152603089107139e-08, "logits/chosen": -364429152.0, "logits/rejected": -349219520.0, "logps/chosen": -340.38173040334857, "logps/rejected": -400.4701043338684, "loss": 1.7462, "rewards/chosen": -0.6163304066186264, "rewards/margins": 0.8346214283156634, "rewards/rejected": -1.4509518349342898, "step": 760 }, { "epoch": 0.806282722513089, "grad_norm": 16.510046005249023, "kl": 0.0, "learning_rate": 5.5646397529920175e-08, "logits/chosen": -363931648.0, "logits/rejected": -370810304.0, "logps/chosen": -365.2676868044515, "logps/rejected": -408.1744911674347, "loss": 1.6917, "rewards/chosen": -0.5827012297079939, "rewards/margins": 0.925817065462897, "rewards/rejected": -1.5085182951708909, "step": 770 }, { "epoch": 0.8167539267015707, "grad_norm": 15.439537048339844, "kl": 0.0, "learning_rate": 5.002669488545111e-08, "logits/chosen": -354813536.0, "logits/rejected": -418654816.0, "logps/chosen": -342.4249794745484, "logps/rejected": -417.48695976154994, "loss": 1.6949, "rewards/chosen": -0.5906535432060755, "rewards/margins": 0.8275922754763947, "rewards/rejected": -1.4182458186824702, "step": 780 }, { "epoch": 0.8272251308900523, "grad_norm": 53.54264450073242, "kl": 0.0, "learning_rate": 4.467443881184646e-08, "logits/chosen": -380888512.0, "logits/rejected": -383134880.0, "logps/chosen": -345.58564268867923, "logps/rejected": -391.497573757764, "loss": 1.7238, "rewards/chosen": -0.5928520826423693, "rewards/margins": 0.8389644449197424, "rewards/rejected": -1.4318165275621118, "step": 790 }, { "epoch": 0.837696335078534, "grad_norm": 39.76443862915039, "kl": 0.0, "learning_rate": 3.959678747720488e-08, "logits/chosen": -406014528.0, "logits/rejected": -366457024.0, "logps/chosen": -337.72622685185183, "logps/rejected": -407.47355371900824, "loss": 1.7464, "rewards/chosen": -0.5733218948929398, "rewards/margins": 0.919490181391926, "rewards/rejected": -1.4928120762848658, "step": 800 }, { "epoch": 0.837696335078534, "eval_kl": 0.0, "eval_logits/chosen": -377414720.0, "eval_logits/rejected": -376930848.0, "eval_logps/chosen": -345.01959375, "eval_logps/rejected": -411.8449375, "eval_loss": 0.43189236521720886, "eval_rewards/chosen": -0.5716385498046875, "eval_rewards/margins": 0.87729931640625, "eval_rewards/rejected": -1.4489378662109376, "eval_runtime": 172.2229, "eval_samples_per_second": 23.226, "eval_steps_per_second": 0.726, "step": 800 }, { "epoch": 0.8481675392670157, "grad_norm": 59.97751998901367, "kl": 0.0, "learning_rate": 3.480053179012654e-08, "logits/chosen": -355148352.0, "logits/rejected": -381496640.0, "logps/chosen": -333.25054650238474, "logps/rejected": -400.6155193932412, "loss": 1.7686, "rewards/chosen": -0.6660681695741008, "rewards/margins": 0.6893864232698413, "rewards/rejected": -1.355454592843942, "step": 810 }, { "epoch": 0.8586387434554974, "grad_norm": 36.40221405029297, "kl": 0.0, "learning_rate": 3.029208631747446e-08, "logits/chosen": -377712128.0, "logits/rejected": -356546464.0, "logps/chosen": -329.12806748466255, "logps/rejected": -420.65381170382165, "loss": 1.6857, "rewards/chosen": -0.5312460214813794, "rewards/margins": 1.0716216395270302, "rewards/rejected": -1.6028676610084096, "step": 820 }, { "epoch": 0.8691099476439791, "grad_norm": 49.2255859375, "kl": 0.0, "learning_rate": 2.607748070546037e-08, "logits/chosen": -374321088.0, "logits/rejected": -385330848.0, "logps/chosen": -340.50354889589903, "logps/rejected": -425.66645704334366, "loss": 1.7034, "rewards/chosen": -0.6285861523745564, "rewards/margins": 0.9371930228080578, "rewards/rejected": -1.5657791751826142, "step": 830 }, { "epoch": 0.8795811518324608, "grad_norm": 68.4431381225586, "kl": 0.0, "learning_rate": 2.2162351615526544e-08, "logits/chosen": -379910592.0, "logits/rejected": -405686880.0, "logps/chosen": -363.5833333333333, "logps/rejected": -413.6287980769231, "loss": 1.7074, "rewards/chosen": -0.6024742005363343, "rewards/margins": 0.8925232641571753, "rewards/rejected": -1.4949974646935096, "step": 840 }, { "epoch": 0.8900523560209425, "grad_norm": 47.56254577636719, "kl": 0.0, "learning_rate": 1.8551935185811717e-08, "logits/chosen": -358018880.0, "logits/rejected": -372178848.0, "logps/chosen": -346.9014880952381, "logps/rejected": -434.3848557692308, "loss": 1.6874, "rewards/chosen": -0.6287514338417659, "rewards/margins": 0.9804288170596764, "rewards/rejected": -1.6091802509014423, "step": 850 }, { "epoch": 0.900523560209424, "grad_norm": 39.030643463134766, "kl": 0.0, "learning_rate": 1.5251060028279612e-08, "logits/chosen": -388987392.0, "logits/rejected": -356903168.0, "logps/chosen": -339.5614374034003, "logps/rejected": -434.5506516587678, "loss": 1.7297, "rewards/chosen": -0.6679287516904946, "rewards/margins": 0.8691605675235654, "rewards/rejected": -1.53708931921406, "step": 860 }, { "epoch": 0.9109947643979057, "grad_norm": 14.048391342163086, "kl": 0.0, "learning_rate": 1.2264140770878839e-08, "logits/chosen": -378864544.0, "logits/rejected": -394341504.0, "logps/chosen": -366.41154912836765, "logps/rejected": -426.361999229584, "loss": 1.7101, "rewards/chosen": -0.6661284578205477, "rewards/margins": 0.9078862146924531, "rewards/rejected": -1.5740146725130009, "step": 870 }, { "epoch": 0.9214659685863874, "grad_norm": 54.477298736572266, "kl": 0.0, "learning_rate": 9.59517215336922e-09, "logits/chosen": -356389440.0, "logits/rejected": -364706528.0, "logps/chosen": -348.5319102112676, "logps/rejected": -433.04197542901716, "loss": 1.6794, "rewards/chosen": -0.686365930500538, "rewards/margins": 1.0420925633079645, "rewards/rejected": -1.7284584938085024, "step": 880 }, { "epoch": 0.9319371727748691, "grad_norm": 38.72111129760742, "kl": 0.0, "learning_rate": 7.247723684711382e-09, "logits/chosen": -372304384.0, "logits/rejected": -349962176.0, "logps/chosen": -332.2313262195122, "logps/rejected": -413.34995993589746, "loss": 1.7364, "rewards/chosen": -0.5879382156744236, "rewards/margins": 0.8648360278026398, "rewards/rejected": -1.4527742434770634, "step": 890 }, { "epoch": 0.9424083769633508, "grad_norm": 65.32742309570312, "kl": 0.0, "learning_rate": 5.224934869164976e-09, "logits/chosen": -384202432.0, "logits/rejected": -398582304.0, "logps/chosen": -358.9657617504052, "logps/rejected": -431.56499811463044, "loss": 1.7016, "rewards/chosen": -0.6722601545881027, "rewards/margins": 0.8951942798599554, "rewards/rejected": -1.567454434448058, "step": 900 }, { "epoch": 0.9528795811518325, "grad_norm": 28.859342575073242, "kl": 0.0, "learning_rate": 3.529511007479946e-09, "logits/chosen": -394682496.0, "logits/rejected": -370364992.0, "logps/chosen": -354.0907018049155, "logps/rejected": -399.780231518283, "loss": 1.7362, "rewards/chosen": -0.6060863676525298, "rewards/margins": 0.8388409995411804, "rewards/rejected": -1.4449273671937102, "step": 910 }, { "epoch": 0.9633507853403142, "grad_norm": 65.4843521118164, "kl": 0.0, "learning_rate": 2.1637195787966857e-09, "logits/chosen": -364587168.0, "logits/rejected": -388965792.0, "logps/chosen": -356.6702698635634, "logps/rejected": -412.88955479452056, "loss": 1.6953, "rewards/chosen": -0.5981720117657755, "rewards/margins": 0.9335980412093957, "rewards/rejected": -1.5317700529751712, "step": 920 }, { "epoch": 0.9738219895287958, "grad_norm": 41.27497482299805, "kl": 0.0, "learning_rate": 1.1293872080934963e-09, "logits/chosen": -348726528.0, "logits/rejected": -397161920.0, "logps/chosen": -344.23691152597405, "logps/rejected": -425.8096291415663, "loss": 1.6647, "rewards/chosen": -0.600016011820211, "rewards/margins": 0.9757914906506097, "rewards/rejected": -1.5758075024708207, "step": 930 }, { "epoch": 0.9842931937172775, "grad_norm": 26.048463821411133, "kl": 0.0, "learning_rate": 4.2789722323760546e-10, "logits/chosen": -391641184.0, "logits/rejected": -366561632.0, "logps/chosen": -350.878531998503, "logps/rejected": -422.56597222222223, "loss": 1.7305, "rewards/chosen": -0.594213657036513, "rewards/margins": 0.995801031293246, "rewards/rejected": -1.590014688329759, "step": 940 }, { "epoch": 0.9947643979057592, "grad_norm": 47.700260162353516, "kl": 0.0, "learning_rate": 6.018780490690822e-11, "logits/chosen": -406117312.0, "logits/rejected": -365830848.0, "logps/chosen": -347.9480587121212, "logps/rejected": -418.8353578629032, "loss": 1.7033, "rewards/chosen": -0.6121012832179215, "rewards/margins": 1.0529334477548606, "rewards/rejected": -1.6650347309727822, "step": 950 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, "train_loss": 1.7875109602643557, "train_runtime": 10843.0021, "train_samples_per_second": 11.276, "train_steps_per_second": 0.088 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }