{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998691270776077, "eval_steps": 1000, "global_step": 1910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005234916895694281, "grad_norm": 3.790904594664249, "learning_rate": 5.2356020942408376e-08, "logits/chosen": -1.074317216873169, "logits/rejected": -1.2653461694717407, "logps/chosen": -0.4452144503593445, "logps/rejected": -0.6091843247413635, "loss": 1.4151, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 4.849554898100905e-05, "rewards/rejected": -3.19069076795131e-05, "step": 10 }, { "epoch": 0.010469833791388562, "grad_norm": 4.012740047235236, "learning_rate": 1.0471204188481675e-07, "logits/chosen": -0.9944978952407837, "logits/rejected": -1.1899915933609009, "logps/chosen": -0.4916958212852478, "logps/rejected": -0.6457526683807373, "loss": 2.9024, "rewards/accuracies": 0.5625, "rewards/chosen": 5.545483509195037e-05, "rewards/rejected": 4.941503357258625e-05, "step": 20 }, { "epoch": 0.015704750687082842, "grad_norm": 7.854381323221543, "learning_rate": 1.5706806282722514e-07, "logits/chosen": -1.0470011234283447, "logits/rejected": -1.308021068572998, "logps/chosen": -0.4594300389289856, "logps/rejected": -0.6046071648597717, "loss": 1.2899, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -3.4633787436177954e-05, "rewards/rejected": -0.00032249835203401744, "step": 30 }, { "epoch": 0.020939667582777124, "grad_norm": 3.6452610558813654, "learning_rate": 2.094240837696335e-07, "logits/chosen": -1.1655104160308838, "logits/rejected": -1.3452240228652954, "logps/chosen": -0.39312028884887695, "logps/rejected": -0.5556824207305908, "loss": 2.2173, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.240896568167955e-05, "rewards/rejected": -0.001136856502853334, "step": 40 }, { "epoch": 0.026174584478471406, "grad_norm": 8.500475399649632, "learning_rate": 2.6178010471204185e-07, "logits/chosen": -1.1437081098556519, "logits/rejected": -1.4434831142425537, "logps/chosen": -0.43456870317459106, "logps/rejected": -0.5780390501022339, "loss": 1.2709, "rewards/accuracies": 0.6875, "rewards/chosen": -0.00010843189375009388, "rewards/rejected": -0.001327984849922359, "step": 50 }, { "epoch": 0.031409501374165684, "grad_norm": 20.273102817060455, "learning_rate": 3.1413612565445027e-07, "logits/chosen": -1.0886688232421875, "logits/rejected": -1.2838691473007202, "logps/chosen": -0.44190168380737305, "logps/rejected": -0.6152902841567993, "loss": 1.8135, "rewards/accuracies": 0.75, "rewards/chosen": -0.0007149001467041671, "rewards/rejected": -0.005795066244900227, "step": 60 }, { "epoch": 0.036644418269859966, "grad_norm": 43.16254505746086, "learning_rate": 3.6649214659685864e-07, "logits/chosen": -1.094167709350586, "logits/rejected": -1.3319257497787476, "logps/chosen": -0.4520147740840912, "logps/rejected": -0.6239620447158813, "loss": 1.7838, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.000759888265747577, "rewards/rejected": -0.0066768391989171505, "step": 70 }, { "epoch": 0.04187933516555425, "grad_norm": 11.998623689285427, "learning_rate": 4.18848167539267e-07, "logits/chosen": -1.2943776845932007, "logits/rejected": -1.503999948501587, "logps/chosen": -0.3685997724533081, "logps/rejected": -0.5420706272125244, "loss": 2.3502, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.0021785215940326452, "rewards/rejected": -0.016134750097990036, "step": 80 }, { "epoch": 0.04711425206124853, "grad_norm": 24.994033392121455, "learning_rate": 4.712041884816754e-07, "logits/chosen": -1.2916871309280396, "logits/rejected": -1.3448667526245117, "logps/chosen": -0.4025228023529053, "logps/rejected": -0.5868708491325378, "loss": 1.8316, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.007973430678248405, "rewards/rejected": -0.02014215663075447, "step": 90 }, { "epoch": 0.05234916895694281, "grad_norm": 22.171843701204743, "learning_rate": 5.235602094240837e-07, "logits/chosen": -1.4637973308563232, "logits/rejected": -1.6466014385223389, "logps/chosen": -0.32837918400764465, "logps/rejected": -0.515870213508606, "loss": 1.8409, "rewards/accuracies": 0.75, "rewards/chosen": -0.01690755970776081, "rewards/rejected": -0.050148021429777145, "step": 100 }, { "epoch": 0.057584085852637086, "grad_norm": 41.913524204299165, "learning_rate": 5.759162303664922e-07, "logits/chosen": -1.4786913394927979, "logits/rejected": -1.5899611711502075, "logps/chosen": -0.42642760276794434, "logps/rejected": -0.6284693479537964, "loss": 2.2794, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.009459340013563633, "rewards/rejected": -0.022919194772839546, "step": 110 }, { "epoch": 0.06281900274833137, "grad_norm": 9.042363746985718, "learning_rate": 6.282722513089005e-07, "logits/chosen": -1.6049375534057617, "logits/rejected": -1.756801962852478, "logps/chosen": -0.47461098432540894, "logps/rejected": -0.6772693991661072, "loss": 0.8005, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.014842224307358265, "rewards/rejected": -0.04914752393960953, "step": 120 }, { "epoch": 0.06805391964402566, "grad_norm": 7.811632000475067, "learning_rate": 6.806282722513089e-07, "logits/chosen": -1.5858232975006104, "logits/rejected": -1.7477552890777588, "logps/chosen": -0.46297797560691833, "logps/rejected": -0.6604179739952087, "loss": 0.9989, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0641578882932663, "rewards/rejected": -0.09264906495809555, "step": 130 }, { "epoch": 0.07328883653971993, "grad_norm": 15.65125219657381, "learning_rate": 7.329842931937173e-07, "logits/chosen": -1.5584498643875122, "logits/rejected": -1.709670066833496, "logps/chosen": -0.5157219171524048, "logps/rejected": -0.7746927738189697, "loss": 0.9063, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09854600578546524, "rewards/rejected": -0.13169622421264648, "step": 140 }, { "epoch": 0.0785237534354142, "grad_norm": 44.24701426992428, "learning_rate": 7.853403141361256e-07, "logits/chosen": -1.7312242984771729, "logits/rejected": -1.827368140220642, "logps/chosen": -0.4762954115867615, "logps/rejected": -0.7440527081489563, "loss": 1.3761, "rewards/accuracies": 0.75, "rewards/chosen": -0.1354602426290512, "rewards/rejected": -0.22626717388629913, "step": 150 }, { "epoch": 0.0837586703311085, "grad_norm": 44.94371714749407, "learning_rate": 8.37696335078534e-07, "logits/chosen": -1.8007497787475586, "logits/rejected": -1.8725961446762085, "logps/chosen": -0.4830542504787445, "logps/rejected": -0.7854688763618469, "loss": 1.1893, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.07177692651748657, "rewards/rejected": -0.15431641042232513, "step": 160 }, { "epoch": 0.08899358722680277, "grad_norm": 31.115198767499926, "learning_rate": 8.900523560209424e-07, "logits/chosen": -1.9976160526275635, "logits/rejected": -2.054600954055786, "logps/chosen": -0.539434015750885, "logps/rejected": -0.8588225245475769, "loss": 1.3249, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12421885877847672, "rewards/rejected": -0.3051489591598511, "step": 170 }, { "epoch": 0.09422850412249706, "grad_norm": 12.347508697538823, "learning_rate": 9.424083769633508e-07, "logits/chosen": -2.005187749862671, "logits/rejected": -1.9666427373886108, "logps/chosen": -0.5240647196769714, "logps/rejected": -0.9816449880599976, "loss": 1.2205, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0668339729309082, "rewards/rejected": -0.3174983561038971, "step": 180 }, { "epoch": 0.09946342101819133, "grad_norm": 2.227548071196379, "learning_rate": 9.947643979057591e-07, "logits/chosen": -2.117922782897949, "logits/rejected": -2.1063754558563232, "logps/chosen": -0.6270676851272583, "logps/rejected": -1.067392110824585, "loss": 0.5393, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.05558066442608833, "rewards/rejected": -0.2207319438457489, "step": 190 }, { "epoch": 0.10469833791388562, "grad_norm": 7.953337477610359, "learning_rate": 9.999323662872996e-07, "logits/chosen": -2.4052085876464844, "logits/rejected": -2.478701591491699, "logps/chosen": -0.6346315145492554, "logps/rejected": -1.130063772201538, "loss": 0.4773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.12082117795944214, "rewards/rejected": -0.4421643316745758, "step": 200 }, { "epoch": 0.1099332548095799, "grad_norm": 21.087545681911077, "learning_rate": 9.996985942280678e-07, "logits/chosen": -2.4521114826202393, "logits/rejected": -2.5669069290161133, "logps/chosen": -0.6473835110664368, "logps/rejected": -1.1176555156707764, "loss": 1.0686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11380704492330551, "rewards/rejected": -0.4065336287021637, "step": 210 }, { "epoch": 0.11516817170527417, "grad_norm": 15.632346870535166, "learning_rate": 9.99297926897573e-07, "logits/chosen": -2.7503812313079834, "logits/rejected": -2.812741994857788, "logps/chosen": -0.585160493850708, "logps/rejected": -1.0489227771759033, "loss": 0.2239, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1262103021144867, "rewards/rejected": -0.5222524404525757, "step": 220 }, { "epoch": 0.12040308860096846, "grad_norm": 12.519053406194372, "learning_rate": 9.987304981154493e-07, "logits/chosen": -2.837965965270996, "logits/rejected": -2.9414877891540527, "logps/chosen": -0.722270131111145, "logps/rejected": -1.3173713684082031, "loss": 0.654, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14661520719528198, "rewards/rejected": -0.5090414881706238, "step": 230 }, { "epoch": 0.12563800549666274, "grad_norm": 289.81551499279595, "learning_rate": 9.979964973983e-07, "logits/chosen": -2.9277539253234863, "logits/rejected": -3.028458833694458, "logps/chosen": -0.7722570300102234, "logps/rejected": -1.3954808712005615, "loss": 0.5797, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2518690526485443, "rewards/rejected": -0.5512933135032654, "step": 240 }, { "epoch": 0.130872922392357, "grad_norm": 7.898569451837497, "learning_rate": 9.970961698964024e-07, "logits/chosen": -2.903446912765503, "logits/rejected": -2.9949727058410645, "logps/chosen": -0.6554209589958191, "logps/rejected": -1.3086931705474854, "loss": 0.2079, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.12546400725841522, "rewards/rejected": -0.540800929069519, "step": 250 }, { "epoch": 0.1361078392880513, "grad_norm": 35.37908090460439, "learning_rate": 9.960298163118284e-07, "logits/chosen": -2.862393617630005, "logits/rejected": -3.070669174194336, "logps/chosen": -0.6949301958084106, "logps/rejected": -1.335038185119629, "loss": 0.2834, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1361745148897171, "rewards/rejected": -0.5548567771911621, "step": 260 }, { "epoch": 0.1413427561837456, "grad_norm": 2.8575991741134716, "learning_rate": 9.94797792798013e-07, "logits/chosen": -3.01228666305542, "logits/rejected": -3.385458469390869, "logps/chosen": -0.6274330615997314, "logps/rejected": -1.239761471748352, "loss": 0.6421, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.14010125398635864, "rewards/rejected": -0.600482702255249, "step": 270 }, { "epoch": 0.14657767307943986, "grad_norm": 59.20898505824435, "learning_rate": 9.934005108408016e-07, "logits/chosen": -3.242931842803955, "logits/rejected": -3.3713455200195312, "logps/chosen": -0.7664941549301147, "logps/rejected": -1.5041484832763672, "loss": 0.2062, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.10625378042459488, "rewards/rejected": -0.4088156819343567, "step": 280 }, { "epoch": 0.15181258997513414, "grad_norm": 67.37145724737809, "learning_rate": 9.918384371210175e-07, "logits/chosen": -3.297367572784424, "logits/rejected": -3.390179395675659, "logps/chosen": -0.6818624138832092, "logps/rejected": -1.3045790195465088, "loss": 0.2825, "rewards/accuracies": 0.8125, "rewards/chosen": -0.14947417378425598, "rewards/rejected": -0.5264440774917603, "step": 290 }, { "epoch": 0.1570475068708284, "grad_norm": 6.991708239819734, "learning_rate": 9.901120933585937e-07, "logits/chosen": -2.914552688598633, "logits/rejected": -2.9296183586120605, "logps/chosen": -0.7332116961479187, "logps/rejected": -1.3207851648330688, "loss": 0.2746, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1801702082157135, "rewards/rejected": -0.4402903616428375, "step": 300 }, { "epoch": 0.16228242376652272, "grad_norm": 5.467852048052765, "learning_rate": 9.882220561383237e-07, "logits/chosen": -2.6258440017700195, "logits/rejected": -2.7410550117492676, "logps/chosen": -0.6673339605331421, "logps/rejected": -1.259270429611206, "loss": 0.2535, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1613113135099411, "rewards/rejected": -0.5873299837112427, "step": 310 }, { "epoch": 0.167517340662217, "grad_norm": 3.8941661088693595, "learning_rate": 9.861689567172849e-07, "logits/chosen": -2.7174181938171387, "logits/rejected": -2.859903573989868, "logps/chosen": -0.7855610251426697, "logps/rejected": -1.3645145893096924, "loss": 0.571, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.11017797142267227, "rewards/rejected": -0.33003589510917664, "step": 320 }, { "epoch": 0.17275225755791127, "grad_norm": 16.181666110930603, "learning_rate": 9.839534808140065e-07, "logits/chosen": -2.7446448802948, "logits/rejected": -2.9309000968933105, "logps/chosen": -0.6837766170501709, "logps/rejected": -1.2414804697036743, "loss": 0.4195, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17641454935073853, "rewards/rejected": -0.6049523949623108, "step": 330 }, { "epoch": 0.17798717445360554, "grad_norm": 6.6623417517030274, "learning_rate": 9.815763683794431e-07, "logits/chosen": -3.110708713531494, "logits/rejected": -3.232154130935669, "logps/chosen": -0.9199361801147461, "logps/rejected": -1.6282669305801392, "loss": 0.3528, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16633550822734833, "rewards/rejected": -0.5285095572471619, "step": 340 }, { "epoch": 0.18322209134929984, "grad_norm": 42.75001280615641, "learning_rate": 9.790384133498377e-07, "logits/chosen": -3.181398868560791, "logits/rejected": -3.380305528640747, "logps/chosen": -0.6931721568107605, "logps/rejected": -1.301574468612671, "loss": 0.4167, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.238613560795784, "rewards/rejected": -0.5942808985710144, "step": 350 }, { "epoch": 0.18845700824499412, "grad_norm": 25.407488562124865, "learning_rate": 9.763404633815536e-07, "logits/chosen": -3.138686418533325, "logits/rejected": -3.3759498596191406, "logps/chosen": -0.7499098777770996, "logps/rejected": -1.2932870388031006, "loss": 0.1701, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.13701362907886505, "rewards/rejected": -0.4852725863456726, "step": 360 }, { "epoch": 0.1936919251406884, "grad_norm": 1.4892370693809598, "learning_rate": 9.73483419567964e-07, "logits/chosen": -3.4019501209259033, "logits/rejected": -3.549142837524414, "logps/chosen": -0.7944781184196472, "logps/rejected": -1.424883484840393, "loss": 0.1402, "rewards/accuracies": 0.75, "rewards/chosen": -0.12177709490060806, "rewards/rejected": -0.3804728090763092, "step": 370 }, { "epoch": 0.19892684203638267, "grad_norm": 54.39502830839909, "learning_rate": 9.70468236138494e-07, "logits/chosen": -3.2426352500915527, "logits/rejected": -3.4838757514953613, "logps/chosen": -0.6787932515144348, "logps/rejected": -1.1444861888885498, "loss": 0.5084, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1906341016292572, "rewards/rejected": -0.5342355966567993, "step": 380 }, { "epoch": 0.20416175893207694, "grad_norm": 4.102532676377055, "learning_rate": 9.672959201399155e-07, "logits/chosen": -3.1436872482299805, "logits/rejected": -3.3000025749206543, "logps/chosen": -0.6593716144561768, "logps/rejected": -1.2297086715698242, "loss": 0.2254, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.1245049387216568, "rewards/rejected": -0.63857102394104, "step": 390 }, { "epoch": 0.20939667582777124, "grad_norm": 26.264208746234342, "learning_rate": 9.639675311000027e-07, "logits/chosen": -2.8664770126342773, "logits/rejected": -3.217289686203003, "logps/chosen": -0.5279222726821899, "logps/rejected": -0.9587762951850891, "loss": 0.2976, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.08593594282865524, "rewards/rejected": -0.5586038827896118, "step": 400 }, { "epoch": 0.21463159272346552, "grad_norm": 38.41571021062487, "learning_rate": 9.60484180673657e-07, "logits/chosen": -3.0221009254455566, "logits/rejected": -3.3375930786132812, "logps/chosen": -0.6153509020805359, "logps/rejected": -1.208428144454956, "loss": 0.2007, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1186663880944252, "rewards/rejected": -0.7561261057853699, "step": 410 }, { "epoch": 0.2198665096191598, "grad_norm": 26.129325664394095, "learning_rate": 9.568470322716246e-07, "logits/chosen": -3.197547435760498, "logits/rejected": -3.473541736602783, "logps/chosen": -0.7100402116775513, "logps/rejected": -1.3731144666671753, "loss": 0.2129, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.15885800123214722, "rewards/rejected": -0.7041777968406677, "step": 420 }, { "epoch": 0.22510142651485407, "grad_norm": 3.3568794858215614, "learning_rate": 9.530573006719263e-07, "logits/chosen": -3.1833243370056152, "logits/rejected": -3.5457568168640137, "logps/chosen": -0.7644280195236206, "logps/rejected": -1.4675564765930176, "loss": 0.3047, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10174532234668732, "rewards/rejected": -0.43558469414711, "step": 430 }, { "epoch": 0.23033634341054834, "grad_norm": 6.858790048554929, "learning_rate": 9.491162516141307e-07, "logits/chosen": -3.0963964462280273, "logits/rejected": -3.347712993621826, "logps/chosen": -0.7016817927360535, "logps/rejected": -1.3338514566421509, "loss": 0.3073, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09215731918811798, "rewards/rejected": -0.45697230100631714, "step": 440 }, { "epoch": 0.23557126030624265, "grad_norm": 23.530428559730304, "learning_rate": 9.450252013766092e-07, "logits/chosen": -3.2469124794006348, "logits/rejected": -3.4776294231414795, "logps/chosen": -0.7150281667709351, "logps/rejected": -1.317091703414917, "loss": 0.2759, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11948645114898682, "rewards/rejected": -0.5289221405982971, "step": 450 }, { "epoch": 0.24080617720193692, "grad_norm": 3.0816780004971567, "learning_rate": 9.407855163369078e-07, "logits/chosen": -3.1366970539093018, "logits/rejected": -3.4390456676483154, "logps/chosen": -0.7695866823196411, "logps/rejected": -1.28377366065979, "loss": 0.1834, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10271289199590683, "rewards/rejected": -0.4681883454322815, "step": 460 }, { "epoch": 0.2460410940976312, "grad_norm": 0.8968075253326218, "learning_rate": 9.3639861251539e-07, "logits/chosen": -3.204808473587036, "logits/rejected": -3.410945415496826, "logps/chosen": -0.7953635454177856, "logps/rejected": -1.4356180429458618, "loss": 0.2817, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.11539553105831146, "rewards/rejected": -0.5434930324554443, "step": 470 }, { "epoch": 0.25127601099332547, "grad_norm": 15.250575277553056, "learning_rate": 9.318659551022955e-07, "logits/chosen": -3.5916152000427246, "logits/rejected": -3.745251417160034, "logps/chosen": -0.7927001118659973, "logps/rejected": -1.4079248905181885, "loss": 0.3906, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.14077363908290863, "rewards/rejected": -0.6647804379463196, "step": 480 }, { "epoch": 0.25651092788901975, "grad_norm": 8.302573476788423, "learning_rate": 9.271890579683804e-07, "logits/chosen": -3.6112866401672363, "logits/rejected": -3.7867329120635986, "logps/chosen": -0.7936150431632996, "logps/rejected": -1.3792264461517334, "loss": 0.155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1597827672958374, "rewards/rejected": -0.7419155240058899, "step": 490 }, { "epoch": 0.261745844784714, "grad_norm": 19.15043456191315, "learning_rate": 9.223694831592952e-07, "logits/chosen": -3.4215950965881348, "logits/rejected": -3.5204520225524902, "logps/chosen": -0.6265555024147034, "logps/rejected": -1.1579577922821045, "loss": 0.465, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.20739369094371796, "rewards/rejected": -0.7289354205131531, "step": 500 }, { "epoch": 0.2669807616804083, "grad_norm": 64.7184486122584, "learning_rate": 9.174088403738755e-07, "logits/chosen": -3.099766969680786, "logits/rejected": -3.3204002380371094, "logps/chosen": -0.690481424331665, "logps/rejected": -1.2163944244384766, "loss": 0.3773, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.13189923763275146, "rewards/rejected": -0.520399808883667, "step": 510 }, { "epoch": 0.2722156785761026, "grad_norm": 17.501316331807786, "learning_rate": 9.123087864265147e-07, "logits/chosen": -3.1762442588806152, "logits/rejected": -3.2784526348114014, "logps/chosen": -0.6847952008247375, "logps/rejected": -1.1162774562835693, "loss": 0.1267, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06572765856981277, "rewards/rejected": -0.351901113986969, "step": 520 }, { "epoch": 0.2774505954717969, "grad_norm": 14.212473293142173, "learning_rate": 9.070710246938016e-07, "logits/chosen": -3.2481791973114014, "logits/rejected": -3.489774227142334, "logps/chosen": -0.772381603717804, "logps/rejected": -1.4248247146606445, "loss": 0.2427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.09138830751180649, "rewards/rejected": -0.4058937132358551, "step": 530 }, { "epoch": 0.2826855123674912, "grad_norm": 3.7993918380305227, "learning_rate": 9.016973045456073e-07, "logits/chosen": -3.5233864784240723, "logits/rejected": -3.5879790782928467, "logps/chosen": -0.6680731773376465, "logps/rejected": -1.2521740198135376, "loss": 0.233, "rewards/accuracies": 0.8125, "rewards/chosen": -0.16955754160881042, "rewards/rejected": -0.6247085928916931, "step": 540 }, { "epoch": 0.28792042926318545, "grad_norm": 31.268425824083035, "learning_rate": 8.961894207608087e-07, "logits/chosen": -3.3877577781677246, "logits/rejected": -3.6265366077423096, "logps/chosen": -0.7721540331840515, "logps/rejected": -1.4250587224960327, "loss": 0.2769, "rewards/accuracies": 0.75, "rewards/chosen": -0.16779252886772156, "rewards/rejected": -0.6825228333473206, "step": 550 }, { "epoch": 0.2931553461588797, "grad_norm": 7.718273564892154, "learning_rate": 8.905492129278477e-07, "logits/chosen": -3.29071044921875, "logits/rejected": -3.561675548553467, "logps/chosen": -0.8162961006164551, "logps/rejected": -1.4463467597961426, "loss": 0.1915, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.204677551984787, "rewards/rejected": -0.47629499435424805, "step": 560 }, { "epoch": 0.298390263054574, "grad_norm": 12.025693422800462, "learning_rate": 8.847785648303233e-07, "logits/chosen": -3.204369306564331, "logits/rejected": -3.2629711627960205, "logps/chosen": -0.7215951085090637, "logps/rejected": -1.2496535778045654, "loss": 0.3283, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.17979201674461365, "rewards/rejected": -0.6653040647506714, "step": 570 }, { "epoch": 0.3036251799502683, "grad_norm": 16.91677605832855, "learning_rate": 8.788794038178232e-07, "logits/chosen": -3.354393482208252, "logits/rejected": -3.566246747970581, "logps/chosen": -0.6297786831855774, "logps/rejected": -1.2458020448684692, "loss": 0.4011, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1064259260892868, "rewards/rejected": -0.6175190210342407, "step": 580 }, { "epoch": 0.30886009684596255, "grad_norm": 3.0634420462864362, "learning_rate": 8.728537001622049e-07, "logits/chosen": -3.4152801036834717, "logits/rejected": -3.7001967430114746, "logps/chosen": -0.6182211637496948, "logps/rejected": -1.2031091451644897, "loss": 0.3326, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12622275948524475, "rewards/rejected": -0.6566742062568665, "step": 590 }, { "epoch": 0.3140950137416568, "grad_norm": 35.90548539348865, "learning_rate": 8.667034663995408e-07, "logits/chosen": -3.3982882499694824, "logits/rejected": -3.6578991413116455, "logps/chosen": -0.6449892520904541, "logps/rejected": -1.2093414068222046, "loss": 0.3771, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0947725772857666, "rewards/rejected": -0.46876105666160583, "step": 600 }, { "epoch": 0.31932993063735116, "grad_norm": 1.5338911609375667, "learning_rate": 8.604307566579472e-07, "logits/chosen": -3.5967631340026855, "logits/rejected": -3.748669385910034, "logps/chosen": -0.759170651435852, "logps/rejected": -1.21732497215271, "loss": 0.0886, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12660792469978333, "rewards/rejected": -0.35170167684555054, "step": 610 }, { "epoch": 0.32456484753304543, "grad_norm": 3.030605610103173, "learning_rate": 8.540376659715225e-07, "logits/chosen": -3.6591286659240723, "logits/rejected": -3.9070792198181152, "logps/chosen": -0.6764650344848633, "logps/rejected": -1.1898220777511597, "loss": 0.1434, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1459110528230667, "rewards/rejected": -0.43774351477622986, "step": 620 }, { "epoch": 0.3297997644287397, "grad_norm": 10.069918354827117, "learning_rate": 8.47526329580623e-07, "logits/chosen": -3.5482306480407715, "logits/rejected": -3.7556564807891846, "logps/chosen": -0.6558570265769958, "logps/rejected": -1.2123215198516846, "loss": 0.516, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10284946113824844, "rewards/rejected": -0.5016738772392273, "step": 630 }, { "epoch": 0.335034681324434, "grad_norm": 3.5244932761338124, "learning_rate": 8.408989222187096e-07, "logits/chosen": -3.4110941886901855, "logits/rejected": -3.6678364276885986, "logps/chosen": -0.6549906730651855, "logps/rejected": -1.246897578239441, "loss": 0.7311, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10875538736581802, "rewards/rejected": -0.5850102305412292, "step": 640 }, { "epoch": 0.34026959822012826, "grad_norm": 4.0414984437360175, "learning_rate": 8.341576573860047e-07, "logits/chosen": -3.478461503982544, "logits/rejected": -3.702072858810425, "logps/chosen": -0.7687514424324036, "logps/rejected": -1.43941330909729, "loss": 0.2588, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.16434721648693085, "rewards/rejected": -0.5769973993301392, "step": 650 }, { "epoch": 0.34550451511582253, "grad_norm": 17.089029660346316, "learning_rate": 8.27304786610201e-07, "logits/chosen": -3.6008193492889404, "logits/rejected": -3.9538185596466064, "logps/chosen": -0.6982223987579346, "logps/rejected": -1.2972527742385864, "loss": 0.2549, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0916595607995987, "rewards/rejected": -0.38794606924057007, "step": 660 }, { "epoch": 0.3507394320115168, "grad_norm": 23.354286685814746, "learning_rate": 8.203425986944696e-07, "logits/chosen": -3.7454254627227783, "logits/rejected": -3.954153537750244, "logps/chosen": -0.6409385800361633, "logps/rejected": -1.1634663343429565, "loss": 0.1437, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.09134040772914886, "rewards/rejected": -0.4564918577671051, "step": 670 }, { "epoch": 0.3559743489072111, "grad_norm": 12.016594060247717, "learning_rate": 8.132734189530182e-07, "logits/chosen": -3.7062535285949707, "logits/rejected": -3.933081865310669, "logps/chosen": -0.5595335960388184, "logps/rejected": -1.047524333000183, "loss": 0.1974, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.20238538086414337, "rewards/rejected": -0.5873401165008545, "step": 680 }, { "epoch": 0.36120926580290535, "grad_norm": 13.607141196369907, "learning_rate": 8.060996084344553e-07, "logits/chosen": -3.6081855297088623, "logits/rejected": -3.7106146812438965, "logps/chosen": -0.7058667540550232, "logps/rejected": -1.333534836769104, "loss": 0.2337, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.20908991992473602, "rewards/rejected": -0.5528732538223267, "step": 690 }, { "epoch": 0.3664441826985997, "grad_norm": 9.343448893616527, "learning_rate": 7.98823563133219e-07, "logits/chosen": -3.7106995582580566, "logits/rejected": -3.8569560050964355, "logps/chosen": -0.5677663087844849, "logps/rejected": -1.077682614326477, "loss": 0.1728, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1268927901983261, "rewards/rejected": -0.4453812539577484, "step": 700 }, { "epoch": 0.37167909959429396, "grad_norm": 1.674723781078412, "learning_rate": 7.914477131893342e-07, "logits/chosen": -3.6300597190856934, "logits/rejected": -3.8654580116271973, "logps/chosen": -0.6926698684692383, "logps/rejected": -1.3483922481536865, "loss": 0.2708, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11889272928237915, "rewards/rejected": -0.7035388350486755, "step": 710 }, { "epoch": 0.37691401648998824, "grad_norm": 26.516721992313247, "learning_rate": 7.839745220767661e-07, "logits/chosen": -3.356396436691284, "logits/rejected": -3.629464626312256, "logps/chosen": -0.6213072538375854, "logps/rejected": -1.278612494468689, "loss": 0.4189, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17910563945770264, "rewards/rejected": -0.6009346842765808, "step": 720 }, { "epoch": 0.3821489333856825, "grad_norm": 29.551216070368934, "learning_rate": 7.764064857806389e-07, "logits/chosen": -3.349151611328125, "logits/rejected": -3.5175952911376953, "logps/chosen": -0.6614469289779663, "logps/rejected": -1.1986842155456543, "loss": 0.3052, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1393895149230957, "rewards/rejected": -0.49458661675453186, "step": 730 }, { "epoch": 0.3873838502813768, "grad_norm": 25.357071934733334, "learning_rate": 7.68746131963598e-07, "logits/chosen": -3.4714951515197754, "logits/rejected": -3.677035093307495, "logps/chosen": -0.620179295539856, "logps/rejected": -1.2890572547912598, "loss": 0.228, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.13783490657806396, "rewards/rejected": -0.794097363948822, "step": 740 }, { "epoch": 0.39261876717707106, "grad_norm": 25.198350240467338, "learning_rate": 7.609960191215909e-07, "logits/chosen": -3.4828593730926514, "logits/rejected": -3.795466899871826, "logps/chosen": -0.6967580318450928, "logps/rejected": -1.377361536026001, "loss": 0.3695, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19926968216896057, "rewards/rejected": -0.9696201086044312, "step": 750 }, { "epoch": 0.39785368407276533, "grad_norm": 36.505889073660846, "learning_rate": 7.531587357293505e-07, "logits/chosen": -3.595017910003662, "logits/rejected": -3.8477072715759277, "logps/chosen": -0.7997711896896362, "logps/rejected": -1.3395113945007324, "loss": 0.1863, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1399848610162735, "rewards/rejected": -0.5361682176589966, "step": 760 }, { "epoch": 0.4030886009684596, "grad_norm": 9.852370874435213, "learning_rate": 7.452368993758645e-07, "logits/chosen": -3.4401755332946777, "logits/rejected": -3.7114880084991455, "logps/chosen": -0.6346908211708069, "logps/rejected": -1.3159992694854736, "loss": 0.2026, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16581778228282928, "rewards/rejected": -0.7039278745651245, "step": 770 }, { "epoch": 0.4083235178641539, "grad_norm": 39.35986226984939, "learning_rate": 7.372331558901237e-07, "logits/chosen": -3.411632537841797, "logits/rejected": -3.6088695526123047, "logps/chosen": -0.6711673140525818, "logps/rejected": -1.2047076225280762, "loss": 0.1863, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1048843041062355, "rewards/rejected": -0.6397637128829956, "step": 780 }, { "epoch": 0.4135584347598482, "grad_norm": 7.036281988177846, "learning_rate": 7.291501784574355e-07, "logits/chosen": -3.5028297901153564, "logits/rejected": -3.7100181579589844, "logps/chosen": -0.6011011600494385, "logps/rejected": -1.1766241788864136, "loss": 0.3423, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.12209127098321915, "rewards/rejected": -0.5233365893363953, "step": 790 }, { "epoch": 0.4187933516555425, "grad_norm": 15.877033932538007, "learning_rate": 7.209906667266017e-07, "logits/chosen": -3.6198973655700684, "logits/rejected": -3.881483793258667, "logps/chosen": -0.7075928449630737, "logps/rejected": -1.1975640058517456, "loss": 0.1767, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.13878247141838074, "rewards/rejected": -0.5438691973686218, "step": 800 }, { "epoch": 0.42402826855123676, "grad_norm": 8.32857274649763, "learning_rate": 7.12757345908258e-07, "logits/chosen": -3.478787660598755, "logits/rejected": -3.5588173866271973, "logps/chosen": -0.5804450511932373, "logps/rejected": -1.1082279682159424, "loss": 0.1955, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.10243809223175049, "rewards/rejected": -0.5699917674064636, "step": 810 }, { "epoch": 0.42926318544693104, "grad_norm": 1.971725698476674, "learning_rate": 7.044529658646761e-07, "logits/chosen": -3.325711488723755, "logits/rejected": -3.489297866821289, "logps/chosen": -0.6689733266830444, "logps/rejected": -1.2249457836151123, "loss": 0.1751, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15426844358444214, "rewards/rejected": -0.5962169170379639, "step": 820 }, { "epoch": 0.4344981023426253, "grad_norm": 6.500882333020027, "learning_rate": 6.960803001913314e-07, "logits/chosen": -3.3526389598846436, "logits/rejected": -3.616393566131592, "logps/chosen": -0.6438730955123901, "logps/rejected": -1.241818904876709, "loss": 0.241, "rewards/accuracies": 0.75, "rewards/chosen": -0.1811794489622116, "rewards/rejected": -0.5559738874435425, "step": 830 }, { "epoch": 0.4397330192383196, "grad_norm": 1.5485575267591558, "learning_rate": 6.876421452905448e-07, "logits/chosen": -3.6445419788360596, "logits/rejected": -3.844398021697998, "logps/chosen": -0.6539190411567688, "logps/rejected": -1.1970140933990479, "loss": 0.1112, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12994791567325592, "rewards/rejected": -0.5939737558364868, "step": 840 }, { "epoch": 0.44496793613401386, "grad_norm": 8.846891414177467, "learning_rate": 6.791413194375076e-07, "logits/chosen": -3.665837049484253, "logits/rejected": -4.038450717926025, "logps/chosen": -0.6240882873535156, "logps/rejected": -1.2376606464385986, "loss": 0.1749, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.13635075092315674, "rewards/rejected": -0.5054816007614136, "step": 850 }, { "epoch": 0.45020285302970814, "grad_norm": 50.539531832216845, "learning_rate": 6.705806618389997e-07, "logits/chosen": -3.5491700172424316, "logits/rejected": -3.891484498977661, "logps/chosen": -0.6607200503349304, "logps/rejected": -1.2415850162506104, "loss": 0.2544, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11792077124118805, "rewards/rejected": -0.7187885642051697, "step": 860 }, { "epoch": 0.4554377699254024, "grad_norm": 2.281104923342322, "learning_rate": 6.619630316851182e-07, "logits/chosen": -3.623032331466675, "logits/rejected": -3.9570438861846924, "logps/chosen": -0.5454970598220825, "logps/rejected": -1.008527398109436, "loss": 0.4412, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.11349859088659286, "rewards/rejected": -0.49833065271377563, "step": 870 }, { "epoch": 0.4606726868210967, "grad_norm": 21.825427920475462, "learning_rate": 6.532913071943307e-07, "logits/chosen": -3.726950168609619, "logits/rejected": -3.893709182739258, "logps/chosen": -0.7641295194625854, "logps/rejected": -1.277066946029663, "loss": 0.0858, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11578428745269775, "rewards/rejected": -0.31977829337120056, "step": 880 }, { "epoch": 0.465907603716791, "grad_norm": 10.863674979284927, "learning_rate": 6.445683846521738e-07, "logits/chosen": -3.648641586303711, "logits/rejected": -3.897275447845459, "logps/chosen": -0.7207273244857788, "logps/rejected": -1.2213026285171509, "loss": 0.1486, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11959713697433472, "rewards/rejected": -0.580722987651825, "step": 890 }, { "epoch": 0.4711425206124853, "grad_norm": 11.024990455629961, "learning_rate": 6.357971774439177e-07, "logits/chosen": -3.67216157913208, "logits/rejected": -3.8727009296417236, "logps/chosen": -0.5953903198242188, "logps/rejected": -1.187514066696167, "loss": 0.3925, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.162687286734581, "rewards/rejected": -0.5742394328117371, "step": 900 }, { "epoch": 0.47637743750817957, "grad_norm": 9.020917089934892, "learning_rate": 6.269806150815187e-07, "logits/chosen": -3.646181583404541, "logits/rejected": -3.939856767654419, "logps/chosen": -0.6697233319282532, "logps/rejected": -1.2958990335464478, "loss": 0.14, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.1477135419845581, "rewards/rejected": -0.5787355899810791, "step": 910 }, { "epoch": 0.48161235440387384, "grad_norm": 17.07185019161522, "learning_rate": 6.181216422251862e-07, "logits/chosen": -3.652355909347534, "logits/rejected": -3.9332756996154785, "logps/chosen": -0.6171292066574097, "logps/rejected": -1.1538760662078857, "loss": 0.1442, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1485549807548523, "rewards/rejected": -0.5386639833450317, "step": 920 }, { "epoch": 0.4868472712995681, "grad_norm": 11.35798051607949, "learning_rate": 6.092232176998897e-07, "logits/chosen": -3.3274683952331543, "logits/rejected": -3.683140993118286, "logps/chosen": -0.6949746012687683, "logps/rejected": -1.2979357242584229, "loss": 0.2627, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.10100005567073822, "rewards/rejected": -0.40933284163475037, "step": 930 }, { "epoch": 0.4920821881952624, "grad_norm": 7.48204999503568, "learning_rate": 6.002883135071362e-07, "logits/chosen": -3.6361114978790283, "logits/rejected": -3.845881700515747, "logps/chosen": -0.6067990064620972, "logps/rejected": -1.1521469354629517, "loss": 0.2342, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.08356883376836777, "rewards/rejected": -0.3899988830089569, "step": 940 }, { "epoch": 0.49731710509095667, "grad_norm": 9.65932504460624, "learning_rate": 5.913199138323448e-07, "logits/chosen": -3.4517874717712402, "logits/rejected": -3.6863322257995605, "logps/chosen": -0.6873298287391663, "logps/rejected": -1.2405993938446045, "loss": 0.0732, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.13713274896144867, "rewards/rejected": -0.5883907079696655, "step": 950 }, { "epoch": 0.5025520219866509, "grad_norm": 6.954926765138611, "learning_rate": 5.82321014048154e-07, "logits/chosen": -3.531513214111328, "logits/rejected": -3.8438732624053955, "logps/chosen": -0.6925119161605835, "logps/rejected": -1.3587197065353394, "loss": 0.2015, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.12155506759881973, "rewards/rejected": -0.5510644912719727, "step": 960 }, { "epoch": 0.5077869388823453, "grad_norm": 29.092706700807746, "learning_rate": 5.732946197139906e-07, "logits/chosen": -3.543038845062256, "logits/rejected": -3.7100837230682373, "logps/chosen": -0.6176570057868958, "logps/rejected": -1.1735661029815674, "loss": 0.1762, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11741694062948227, "rewards/rejected": -0.5648257732391357, "step": 970 }, { "epoch": 0.5130218557780395, "grad_norm": 15.364134485434684, "learning_rate": 5.642437455722381e-07, "logits/chosen": -3.527390718460083, "logits/rejected": -3.6467444896698, "logps/chosen": -0.5736885070800781, "logps/rejected": -1.1105291843414307, "loss": 0.1348, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.1023680791258812, "rewards/rejected": -0.5240501761436462, "step": 980 }, { "epoch": 0.5182567726737338, "grad_norm": 10.7420392782178, "learning_rate": 5.551714145413368e-07, "logits/chosen": -3.633018970489502, "logits/rejected": -3.8861050605773926, "logps/chosen": -0.6110260486602783, "logps/rejected": -1.197975516319275, "loss": 0.1833, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2393743246793747, "rewards/rejected": -0.8618858456611633, "step": 990 }, { "epoch": 0.523491689569428, "grad_norm": 29.652979248675152, "learning_rate": 5.460806567061533e-07, "logits/chosen": -3.5295844078063965, "logits/rejected": -3.756152629852295, "logps/chosen": -0.6682409048080444, "logps/rejected": -1.2239763736724854, "loss": 0.1562, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.061064548790454865, "rewards/rejected": -0.29901862144470215, "step": 1000 }, { "epoch": 0.5287266064651224, "grad_norm": 4.935111757809973, "learning_rate": 5.369745083059577e-07, "logits/chosen": -3.706066608428955, "logits/rejected": -3.871903657913208, "logps/chosen": -0.8551700711250305, "logps/rejected": -1.495203971862793, "loss": 0.09, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08738512545824051, "rewards/rejected": -0.4607165455818176, "step": 1010 }, { "epoch": 0.5339615233608166, "grad_norm": 14.950139526881932, "learning_rate": 5.278560107203437e-07, "logits/chosen": -3.6445841789245605, "logits/rejected": -4.002659320831299, "logps/chosen": -0.678175151348114, "logps/rejected": -1.2137267589569092, "loss": 0.164, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.16553157567977905, "rewards/rejected": -0.7228254079818726, "step": 1020 }, { "epoch": 0.5391964402565109, "grad_norm": 10.883180795778303, "learning_rate": 5.18728209453432e-07, "logits/chosen": -3.707653760910034, "logits/rejected": -3.845078229904175, "logps/chosen": -0.6454753875732422, "logps/rejected": -1.1513105630874634, "loss": 0.1559, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1302897185087204, "rewards/rejected": -0.5494991540908813, "step": 1030 }, { "epoch": 0.5444313571522053, "grad_norm": 33.05212121743528, "learning_rate": 5.095941531166982e-07, "logits/chosen": -3.78800630569458, "logits/rejected": -4.111274242401123, "logps/chosen": -0.650190532207489, "logps/rejected": -1.2724605798721313, "loss": 0.3122, "rewards/accuracies": 0.75, "rewards/chosen": -0.16642943024635315, "rewards/rejected": -0.7723706960678101, "step": 1040 }, { "epoch": 0.5496662740478995, "grad_norm": 25.0104518604515, "learning_rate": 5.004568924107598e-07, "logits/chosen": -3.5907185077667236, "logits/rejected": -3.9407639503479004, "logps/chosen": -0.6616253852844238, "logps/rejected": -1.3152577877044678, "loss": 0.3186, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.16016852855682373, "rewards/rejected": -0.4862311780452728, "step": 1050 }, { "epoch": 0.5549011909435938, "grad_norm": 31.981933797923496, "learning_rate": 4.913194791064675e-07, "logits/chosen": -3.687349796295166, "logits/rejected": -3.9528133869171143, "logps/chosen": -0.8306125402450562, "logps/rejected": -1.4087917804718018, "loss": 0.362, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1095658391714096, "rewards/rejected": -0.4266139566898346, "step": 1060 }, { "epoch": 0.560136107839288, "grad_norm": 43.09016084350836, "learning_rate": 4.82184965025639e-07, "logits/chosen": -3.7304611206054688, "logits/rejected": -3.975867748260498, "logps/chosen": -0.6367403268814087, "logps/rejected": -1.249473214149475, "loss": 0.2017, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.23390457034111023, "rewards/rejected": -0.6657984256744385, "step": 1070 }, { "epoch": 0.5653710247349824, "grad_norm": 14.680677683118907, "learning_rate": 4.73056401021775e-07, "logits/chosen": -3.6824183464050293, "logits/rejected": -3.9073410034179688, "logps/chosen": -0.7108127474784851, "logps/rejected": -1.4023791551589966, "loss": 0.479, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14911451935768127, "rewards/rejected": -0.5604814291000366, "step": 1080 }, { "epoch": 0.5706059416306766, "grad_norm": 15.630100484670548, "learning_rate": 4.639368359610982e-07, "logits/chosen": -3.694349765777588, "logits/rejected": -4.01293420791626, "logps/chosen": -0.7198182344436646, "logps/rejected": -1.2933294773101807, "loss": 0.3266, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.21065545082092285, "rewards/rejected": -0.7207514047622681, "step": 1090 }, { "epoch": 0.5758408585263709, "grad_norm": 13.347415996971426, "learning_rate": 4.5482931570425803e-07, "logits/chosen": -3.7276809215545654, "logits/rejected": -4.011933326721191, "logps/chosen": -0.6607510447502136, "logps/rejected": -1.281141757965088, "loss": 0.1704, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.17264564335346222, "rewards/rejected": -0.6156941652297974, "step": 1100 }, { "epoch": 0.5810757754220651, "grad_norm": 12.382613041230709, "learning_rate": 4.4573688208903686e-07, "logits/chosen": -3.6291985511779785, "logits/rejected": -3.9804370403289795, "logps/chosen": -0.6637237071990967, "logps/rejected": -1.391247034072876, "loss": 0.2221, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.15261736512184143, "rewards/rejected": -0.8548731803894043, "step": 1110 }, { "epoch": 0.5863106923177595, "grad_norm": 7.303791497022784, "learning_rate": 4.366625719144016e-07, "logits/chosen": -3.506834030151367, "logits/rejected": -3.755375623703003, "logps/chosen": -0.7788494825363159, "logps/rejected": -1.385221004486084, "loss": 0.2242, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10770156234502792, "rewards/rejected": -0.43017715215682983, "step": 1120 }, { "epoch": 0.5915456092134538, "grad_norm": 14.381226143020704, "learning_rate": 4.276094159262368e-07, "logits/chosen": -3.543980360031128, "logits/rejected": -3.700058698654175, "logps/chosen": -0.7008415460586548, "logps/rejected": -1.2009141445159912, "loss": 0.1348, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.08333039283752441, "rewards/rejected": -0.4378291666507721, "step": 1130 }, { "epoch": 0.596780526109148, "grad_norm": 8.760901934055333, "learning_rate": 4.1858043780510135e-07, "logits/chosen": -3.62018084526062, "logits/rejected": -3.911879062652588, "logps/chosen": -0.5855687260627747, "logps/rejected": -1.1562426090240479, "loss": 0.205, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10116372257471085, "rewards/rejected": -0.6775528192520142, "step": 1140 }, { "epoch": 0.6020154430048423, "grad_norm": 64.65530781329107, "learning_rate": 4.0957865315634204e-07, "logits/chosen": -3.5645194053649902, "logits/rejected": -3.8217787742614746, "logps/chosen": -0.641860842704773, "logps/rejected": -1.2221088409423828, "loss": 0.2331, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08705325424671173, "rewards/rejected": -0.46784210205078125, "step": 1150 }, { "epoch": 0.6072503599005366, "grad_norm": 3.8658549598616143, "learning_rate": 4.006070685029075e-07, "logits/chosen": -3.679039716720581, "logits/rejected": -3.946254253387451, "logps/chosen": -0.6737911105155945, "logps/rejected": -1.2060964107513428, "loss": 0.259, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1347390115261078, "rewards/rejected": -0.5623631477355957, "step": 1160 }, { "epoch": 0.6124852767962309, "grad_norm": 0.4610785350786279, "learning_rate": 3.916686802811927e-07, "logits/chosen": -3.583909511566162, "logits/rejected": -3.8461241722106934, "logps/chosen": -0.6507914662361145, "logps/rejected": -1.2234233617782593, "loss": 0.1494, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.09081225097179413, "rewards/rejected": -0.700011670589447, "step": 1170 }, { "epoch": 0.6177201936919251, "grad_norm": 1.6721740201035047, "learning_rate": 3.8276647384025467e-07, "logits/chosen": -3.608611583709717, "logits/rejected": -3.8778247833251953, "logps/chosen": -0.6140819191932678, "logps/rejected": -1.1463892459869385, "loss": 0.417, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.09192916750907898, "rewards/rejected": -0.569677472114563, "step": 1180 }, { "epoch": 0.6229551105876194, "grad_norm": 34.92800950922805, "learning_rate": 3.7390342244472883e-07, "logits/chosen": -3.686276912689209, "logits/rejected": -3.9486172199249268, "logps/chosen": -0.6567327976226807, "logps/rejected": -1.2959892749786377, "loss": 0.1887, "rewards/accuracies": 0.8125, "rewards/chosen": -0.13273802399635315, "rewards/rejected": -0.6054214239120483, "step": 1190 }, { "epoch": 0.6281900274833137, "grad_norm": 28.837778937392045, "learning_rate": 3.6508248628178446e-07, "logits/chosen": -3.635249376296997, "logits/rejected": -3.995410203933716, "logps/chosen": -0.6689791679382324, "logps/rejected": -1.2368990182876587, "loss": 0.1684, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11262966692447662, "rewards/rejected": -0.5506707429885864, "step": 1200 }, { "epoch": 0.633424944379008, "grad_norm": 6.4862027458335465, "learning_rate": 3.563066114724441e-07, "logits/chosen": -3.7043285369873047, "logits/rejected": -3.9376754760742188, "logps/chosen": -0.6666765213012695, "logps/rejected": -1.1883481740951538, "loss": 0.1755, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.24648375809192657, "rewards/rejected": -0.5887495279312134, "step": 1210 }, { "epoch": 0.6386598612747023, "grad_norm": 29.832662347559868, "learning_rate": 3.475787290876055e-07, "logits/chosen": -3.6531460285186768, "logits/rejected": -3.931300640106201, "logps/chosen": -0.7262079119682312, "logps/rejected": -1.4471489191055298, "loss": 0.4461, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13697785139083862, "rewards/rejected": -0.5294802188873291, "step": 1220 }, { "epoch": 0.6438947781703965, "grad_norm": 8.313230076052804, "learning_rate": 3.389017541690854e-07, "logits/chosen": -3.6925830841064453, "logits/rejected": -3.8890395164489746, "logps/chosen": -0.6669970154762268, "logps/rejected": -1.1209336519241333, "loss": 0.1875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11731680482625961, "rewards/rejected": -0.4473400115966797, "step": 1230 }, { "epoch": 0.6491296950660909, "grad_norm": 6.599402039349072, "learning_rate": 3.30278584756021e-07, "logits/chosen": -3.7490150928497314, "logits/rejected": -3.9958243370056152, "logps/chosen": -0.6234461069107056, "logps/rejected": -1.2914457321166992, "loss": 0.2222, "rewards/accuracies": 0.8125, "rewards/chosen": -0.19926027953624725, "rewards/rejected": -0.7430733442306519, "step": 1240 }, { "epoch": 0.6543646119617851, "grad_norm": 26.42550655780738, "learning_rate": 3.2171210091694735e-07, "logits/chosen": -3.5512046813964844, "logits/rejected": -3.8761677742004395, "logps/chosen": -0.6285715699195862, "logps/rejected": -1.1303393840789795, "loss": 0.4599, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10718154907226562, "rewards/rejected": -0.5068109035491943, "step": 1250 }, { "epoch": 0.6595995288574794, "grad_norm": 43.52112297389642, "learning_rate": 3.132051637878789e-07, "logits/chosen": -3.754105806350708, "logits/rejected": -3.9892616271972656, "logps/chosen": -0.604164183139801, "logps/rejected": -1.2354532480239868, "loss": 0.3654, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1477893888950348, "rewards/rejected": -0.6003723740577698, "step": 1260 }, { "epoch": 0.6648344457531736, "grad_norm": 2.249013697408821, "learning_rate": 3.0476061461671155e-07, "logits/chosen": -3.6410465240478516, "logits/rejected": -3.9180960655212402, "logps/chosen": -0.6587497591972351, "logps/rejected": -1.34711754322052, "loss": 0.258, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.13485677540302277, "rewards/rejected": -0.5674414038658142, "step": 1270 }, { "epoch": 0.670069362648868, "grad_norm": 4.4535505774168636, "learning_rate": 2.9638127381427127e-07, "logits/chosen": -3.6331870555877686, "logits/rejected": -3.924232006072998, "logps/chosen": -0.6530889272689819, "logps/rejected": -1.2471270561218262, "loss": 0.1464, "rewards/accuracies": 0.75, "rewards/chosen": -0.13776201009750366, "rewards/rejected": -0.5627979040145874, "step": 1280 }, { "epoch": 0.6753042795445622, "grad_norm": 13.03175390211842, "learning_rate": 2.8806994001231766e-07, "logits/chosen": -3.5974411964416504, "logits/rejected": -3.770061492919922, "logps/chosen": -0.57194584608078, "logps/rejected": -1.1356195211410522, "loss": 0.2263, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.21213237941265106, "rewards/rejected": -0.47332197427749634, "step": 1290 }, { "epoch": 0.6805391964402565, "grad_norm": 9.542565711403059, "learning_rate": 2.7982938912882544e-07, "logits/chosen": -3.5941874980926514, "logits/rejected": -3.9326794147491455, "logps/chosen": -0.6405996084213257, "logps/rejected": -1.4193586111068726, "loss": 0.1943, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1434379667043686, "rewards/rejected": -0.8874284625053406, "step": 1300 }, { "epoch": 0.6857741133359508, "grad_norm": 20.899845143040352, "learning_rate": 2.716623734408488e-07, "logits/chosen": -3.7071640491485596, "logits/rejected": -3.954035997390747, "logps/chosen": -0.7084048390388489, "logps/rejected": -1.408484697341919, "loss": 0.16, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.11447383463382721, "rewards/rejected": -0.5277458429336548, "step": 1310 }, { "epoch": 0.6910090302316451, "grad_norm": 26.585542302796576, "learning_rate": 2.635716206652843e-07, "logits/chosen": -3.568807601928711, "logits/rejected": -3.851666212081909, "logps/chosen": -0.6219191551208496, "logps/rejected": -1.185127854347229, "loss": 0.2097, "rewards/accuracies": 0.8125, "rewards/chosen": -0.16521432995796204, "rewards/rejected": -0.8127703666687012, "step": 1320 }, { "epoch": 0.6962439471273394, "grad_norm": 0.9910292983788035, "learning_rate": 2.5555983304783515e-07, "logits/chosen": -3.706960678100586, "logits/rejected": -3.9914677143096924, "logps/chosen": -0.6214891076087952, "logps/rejected": -1.2848238945007324, "loss": 0.2059, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09211207926273346, "rewards/rejected": -0.45670217275619507, "step": 1330 }, { "epoch": 0.7014788640230336, "grad_norm": 19.940007489482195, "learning_rate": 2.4762968646048356e-07, "logits/chosen": -3.6894028186798096, "logits/rejected": -4.012866497039795, "logps/chosen": -0.6447241902351379, "logps/rejected": -1.2848459482192993, "loss": 0.3272, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10874289274215698, "rewards/rejected": -0.6314564943313599, "step": 1340 }, { "epoch": 0.7067137809187279, "grad_norm": 1.6579625246627596, "learning_rate": 2.397838295077703e-07, "logits/chosen": -3.513641357421875, "logits/rejected": -3.8426365852355957, "logps/chosen": -0.6752243041992188, "logps/rejected": -1.2607519626617432, "loss": 0.1058, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1565995216369629, "rewards/rejected": -0.6347146034240723, "step": 1350 }, { "epoch": 0.7119486978144222, "grad_norm": 5.659662645549925, "learning_rate": 2.3202488264218357e-07, "logits/chosen": -3.4555447101593018, "logits/rejected": -3.8530869483947754, "logps/chosen": -0.697325587272644, "logps/rejected": -1.2927258014678955, "loss": 0.0785, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.08387357741594315, "rewards/rejected": -0.39989030361175537, "step": 1360 }, { "epoch": 0.7171836147101165, "grad_norm": 4.657928400009495, "learning_rate": 2.243554372889479e-07, "logits/chosen": -3.5664660930633545, "logits/rejected": -3.876011371612549, "logps/chosen": -0.705878734588623, "logps/rejected": -1.364867925643921, "loss": 0.1093, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.12067972123622894, "rewards/rejected": -0.6570446491241455, "step": 1370 }, { "epoch": 0.7224185316058107, "grad_norm": 1.3444622420831092, "learning_rate": 2.1677805498050998e-07, "logits/chosen": -3.3227603435516357, "logits/rejected": -3.743194580078125, "logps/chosen": -0.6672931909561157, "logps/rejected": -1.1365772485733032, "loss": 0.1243, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.09715026617050171, "rewards/rejected": -0.43300461769104004, "step": 1380 }, { "epoch": 0.727653448501505, "grad_norm": 8.614820635695539, "learning_rate": 2.0929526650100716e-07, "logits/chosen": -3.5229296684265137, "logits/rejected": -3.9142937660217285, "logps/chosen": -0.6047448515892029, "logps/rejected": -1.2415525913238525, "loss": 0.2105, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.23621472716331482, "rewards/rejected": -0.8043211698532104, "step": 1390 }, { "epoch": 0.7328883653971994, "grad_norm": 5.179700896632157, "learning_rate": 2.0190957104100692e-07, "logits/chosen": -3.4930293560028076, "logits/rejected": -3.749809741973877, "logps/chosen": -0.6624347567558289, "logps/rejected": -1.1844425201416016, "loss": 0.1966, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1073043942451477, "rewards/rejected": -0.4727447032928467, "step": 1400 }, { "epoch": 0.7381232822928936, "grad_norm": 21.802519076308094, "learning_rate": 1.9462343536279612e-07, "logits/chosen": -3.6438193321228027, "logits/rejected": -4.09710168838501, "logps/chosen": -0.6798110604286194, "logps/rejected": -1.2490794658660889, "loss": 0.18, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.11491537094116211, "rewards/rejected": -0.5919861197471619, "step": 1410 }, { "epoch": 0.7433581991885879, "grad_norm": 8.921153921786187, "learning_rate": 1.874392929765044e-07, "logits/chosen": -3.7223763465881348, "logits/rejected": -4.110812187194824, "logps/chosen": -0.6291832327842712, "logps/rejected": -1.2136573791503906, "loss": 0.1329, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10531127452850342, "rewards/rejected": -0.5478615164756775, "step": 1420 }, { "epoch": 0.7485931160842821, "grad_norm": 69.71187759138483, "learning_rate": 1.8035954332732889e-07, "logits/chosen": -3.6256279945373535, "logits/rejected": -3.9363632202148438, "logps/chosen": -0.6302305459976196, "logps/rejected": -1.1628683805465698, "loss": 0.1542, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1714845895767212, "rewards/rejected": -0.623928964138031, "step": 1430 }, { "epoch": 0.7538280329799765, "grad_norm": 20.431726665972217, "learning_rate": 1.733865509941419e-07, "logits/chosen": -3.574036121368408, "logits/rejected": -3.994558811187744, "logps/chosen": -0.6578459143638611, "logps/rejected": -1.2709509134292603, "loss": 0.1268, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12474487721920013, "rewards/rejected": -0.477055162191391, "step": 1440 }, { "epoch": 0.7590629498756707, "grad_norm": 9.763384229327023, "learning_rate": 1.6652264489973861e-07, "logits/chosen": -3.586714506149292, "logits/rejected": -3.934051990509033, "logps/chosen": -0.672654390335083, "logps/rejected": -1.4327255487442017, "loss": 0.1755, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.12246622890233994, "rewards/rejected": -0.6903547644615173, "step": 1450 }, { "epoch": 0.764297866771365, "grad_norm": 5.3595001978781704, "learning_rate": 1.5977011753299724e-07, "logits/chosen": -3.651690721511841, "logits/rejected": -3.986185073852539, "logps/chosen": -0.6952771544456482, "logps/rejected": -1.2006165981292725, "loss": 0.2141, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.23796899616718292, "rewards/rejected": -0.6073740124702454, "step": 1460 }, { "epoch": 0.7695327836670592, "grad_norm": 1.6432168817434278, "learning_rate": 1.5313122418320496e-07, "logits/chosen": -3.5539729595184326, "logits/rejected": -3.897873640060425, "logps/chosen": -0.632127583026886, "logps/rejected": -1.2387049198150635, "loss": 0.1406, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10146383196115494, "rewards/rejected": -0.5321124196052551, "step": 1470 }, { "epoch": 0.7747677005627536, "grad_norm": 1.3794858766661577, "learning_rate": 1.4660818218681125e-07, "logits/chosen": -3.5363082885742188, "logits/rejected": -3.8142707347869873, "logps/chosen": -0.7643290758132935, "logps/rejected": -1.4336775541305542, "loss": 0.1708, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.08295364677906036, "rewards/rejected": -0.3842242360115051, "step": 1480 }, { "epoch": 0.7800026174584479, "grad_norm": 14.698867702350263, "learning_rate": 1.4020317018685362e-07, "logits/chosen": -3.398146152496338, "logits/rejected": -3.777660369873047, "logps/chosen": -0.8031834363937378, "logps/rejected": -1.415450096130371, "loss": 0.1705, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.08375723659992218, "rewards/rejected": -0.3890025019645691, "step": 1490 }, { "epoch": 0.7852375343541421, "grad_norm": 12.210364521217109, "learning_rate": 1.3391832740531055e-07, "logits/chosen": -3.4719395637512207, "logits/rejected": -3.8840765953063965, "logps/chosen": -0.67162024974823, "logps/rejected": -1.2897964715957642, "loss": 0.0969, "rewards/accuracies": 0.875, "rewards/chosen": -0.15899525582790375, "rewards/rejected": -0.5975922346115112, "step": 1500 }, { "epoch": 0.7904724512498364, "grad_norm": 23.473420770496503, "learning_rate": 1.2775575292861707e-07, "logits/chosen": -3.528533458709717, "logits/rejected": -3.907036304473877, "logps/chosen": -0.5467859506607056, "logps/rejected": -1.1106278896331787, "loss": 0.1681, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11632678657770157, "rewards/rejected": -0.560685396194458, "step": 1510 }, { "epoch": 0.7957073681455307, "grad_norm": 4.542700336855168, "learning_rate": 1.21717505006588e-07, "logits/chosen": -3.7147388458251953, "logits/rejected": -3.999300003051758, "logps/chosen": -0.7240277528762817, "logps/rejected": -1.3829585313796997, "loss": 0.1492, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16688640415668488, "rewards/rejected": -0.7482727766036987, "step": 1520 }, { "epoch": 0.800942285041225, "grad_norm": 2.86910989170756, "learning_rate": 1.1580560036497877e-07, "logits/chosen": -3.5072569847106934, "logits/rejected": -3.800105571746826, "logps/chosen": -0.6719281077384949, "logps/rejected": -1.196406602859497, "loss": 0.2545, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.10692320019006729, "rewards/rejected": -0.5672041177749634, "step": 1530 }, { "epoch": 0.8061772019369192, "grad_norm": 0.7212916751703865, "learning_rate": 1.1002201353191521e-07, "logits/chosen": -3.5515499114990234, "logits/rejected": -3.918053150177002, "logps/chosen": -0.5516917109489441, "logps/rejected": -1.1447056531906128, "loss": 0.2219, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.17747844755649567, "rewards/rejected": -0.8485193252563477, "step": 1540 }, { "epoch": 0.8114121188326135, "grad_norm": 3.9789628772583874, "learning_rate": 1.0436867617841766e-07, "logits/chosen": -3.523468017578125, "logits/rejected": -4.002907752990723, "logps/chosen": -0.6106997728347778, "logps/rejected": -1.2265563011169434, "loss": 0.1674, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1898810714483261, "rewards/rejected": -0.8234789967536926, "step": 1550 }, { "epoch": 0.8166470357283078, "grad_norm": 23.019959616553912, "learning_rate": 9.884747647323854e-08, "logits/chosen": -3.5710349082946777, "logits/rejected": -3.8469862937927246, "logps/chosen": -0.6847441792488098, "logps/rejected": -1.2800266742706299, "loss": 0.1525, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.07254868000745773, "rewards/rejected": -0.3605559766292572, "step": 1560 }, { "epoch": 0.8218819526240021, "grad_norm": 1.9159672672104486, "learning_rate": 9.346025845222871e-08, "logits/chosen": -3.5710854530334473, "logits/rejected": -3.9252638816833496, "logps/chosen": -0.5996052026748657, "logps/rejected": -1.185727834701538, "loss": 0.0693, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09702242910861969, "rewards/rejected": -0.43671149015426636, "step": 1570 }, { "epoch": 0.8271168695196964, "grad_norm": 6.0432637545944985, "learning_rate": 8.82088214024454e-08, "logits/chosen": -3.4795615673065186, "logits/rejected": -3.900925397872925, "logps/chosen": -0.6539788842201233, "logps/rejected": -1.3062019348144531, "loss": 0.0777, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1087212786078453, "rewards/rejected": -0.6267635226249695, "step": 1580 }, { "epoch": 0.8323517864153906, "grad_norm": 0.8065986090798118, "learning_rate": 8.309491926120393e-08, "logits/chosen": -3.445683002471924, "logits/rejected": -3.8484814167022705, "logps/chosen": -0.6346088647842407, "logps/rejected": -1.2594006061553955, "loss": 0.1251, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11368497461080551, "rewards/rejected": -0.4606091380119324, "step": 1590 }, { "epoch": 0.837586703311085, "grad_norm": 1.631423358922648, "learning_rate": 7.812026003027771e-08, "logits/chosen": -3.4795145988464355, "logits/rejected": -3.843942165374756, "logps/chosen": -0.6449909210205078, "logps/rejected": -1.2673732042312622, "loss": 0.1368, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.11242115497589111, "rewards/rejected": -0.6334723234176636, "step": 1600 }, { "epoch": 0.8428216202067792, "grad_norm": 28.643433953668108, "learning_rate": 7.328650520543906e-08, "logits/chosen": -3.5898594856262207, "logits/rejected": -3.8868191242218018, "logps/chosen": -0.6432119011878967, "logps/rejected": -1.2170101404190063, "loss": 0.24, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14053374528884888, "rewards/rejected": -0.6827019453048706, "step": 1610 }, { "epoch": 0.8480565371024735, "grad_norm": 6.576620973204435, "learning_rate": 6.859526922153352e-08, "logits/chosen": -3.576659679412842, "logits/rejected": -3.9092178344726562, "logps/chosen": -0.5998526811599731, "logps/rejected": -1.1230926513671875, "loss": 0.1143, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0742657333612442, "rewards/rejected": -0.3632586598396301, "step": 1620 }, { "epoch": 0.8532914539981677, "grad_norm": 19.103957999815783, "learning_rate": 6.40481189132711e-08, "logits/chosen": -3.498664379119873, "logits/rejected": -3.8451638221740723, "logps/chosen": -0.6496328711509705, "logps/rejected": -1.1756832599639893, "loss": 0.148, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12481925636529922, "rewards/rejected": -0.3642726540565491, "step": 1630 }, { "epoch": 0.8585263708938621, "grad_norm": 2.479610731568999, "learning_rate": 5.964657299191711e-08, "logits/chosen": -3.6090331077575684, "logits/rejected": -3.8709425926208496, "logps/chosen": -0.7074568867683411, "logps/rejected": -1.3075045347213745, "loss": 0.1935, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.0973527580499649, "rewards/rejected": -0.4661819338798523, "step": 1640 }, { "epoch": 0.8637612877895563, "grad_norm": 24.31365663092268, "learning_rate": 5.53921015380539e-08, "logits/chosen": -3.368114948272705, "logits/rejected": -3.887836456298828, "logps/chosen": -0.6916152238845825, "logps/rejected": -1.3344032764434814, "loss": 0.2021, "rewards/accuracies": 0.75, "rewards/chosen": -0.13118405640125275, "rewards/rejected": -0.5621457695960999, "step": 1650 }, { "epoch": 0.8689962046852506, "grad_norm": 18.53418978223593, "learning_rate": 5.1286125510586805e-08, "logits/chosen": -3.587425708770752, "logits/rejected": -3.909331798553467, "logps/chosen": -0.6051632165908813, "logps/rejected": -1.1239159107208252, "loss": 0.2629, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.08930396288633347, "rewards/rejected": -0.4709382951259613, "step": 1660 }, { "epoch": 0.8742311215809448, "grad_norm": 4.50359805992076, "learning_rate": 4.733001627215466e-08, "logits/chosen": -3.5434436798095703, "logits/rejected": -3.836855411529541, "logps/chosen": -0.6643694043159485, "logps/rejected": -1.2192775011062622, "loss": 0.3901, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.17054535448551178, "rewards/rejected": -0.6519421339035034, "step": 1670 }, { "epoch": 0.8794660384766392, "grad_norm": 34.178405196863906, "learning_rate": 4.352509513110658e-08, "logits/chosen": -3.487307071685791, "logits/rejected": -3.8664581775665283, "logps/chosen": -0.6555562615394592, "logps/rejected": -1.2320266962051392, "loss": 0.2195, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17077895998954773, "rewards/rejected": -0.64589923620224, "step": 1680 }, { "epoch": 0.8847009553723335, "grad_norm": 2.86389693559937, "learning_rate": 3.9872632900194936e-08, "logits/chosen": -3.507810592651367, "logits/rejected": -3.888345241546631, "logps/chosen": -0.7281653881072998, "logps/rejected": -1.3483150005340576, "loss": 0.1462, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.13594810664653778, "rewards/rejected": -0.5338603258132935, "step": 1690 }, { "epoch": 0.8899358722680277, "grad_norm": 10.406439092961616, "learning_rate": 3.6373849472134954e-08, "logits/chosen": -3.631108045578003, "logits/rejected": -3.8256747722625732, "logps/chosen": -0.6505134701728821, "logps/rejected": -1.196921706199646, "loss": 0.0768, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.17471420764923096, "rewards/rejected": -0.6126881241798401, "step": 1700 }, { "epoch": 0.8951707891637221, "grad_norm": 15.532461573765454, "learning_rate": 3.302991341216976e-08, "logits/chosen": -3.6967296600341797, "logits/rejected": -4.029541015625, "logps/chosen": -0.6618956327438354, "logps/rejected": -1.3089344501495361, "loss": 0.1687, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14995837211608887, "rewards/rejected": -0.5820103883743286, "step": 1710 }, { "epoch": 0.9004057060594163, "grad_norm": 5.162585825584226, "learning_rate": 2.9841941567779474e-08, "logits/chosen": -3.644044876098633, "logits/rejected": -3.9487557411193848, "logps/chosen": -0.7292143106460571, "logps/rejected": -1.3466829061508179, "loss": 0.1542, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.11079509556293488, "rewards/rejected": -0.4718669056892395, "step": 1720 }, { "epoch": 0.9056406229551106, "grad_norm": 6.250911119837395, "learning_rate": 2.681099869566328e-08, "logits/chosen": -3.541680097579956, "logits/rejected": -3.892336368560791, "logps/chosen": -0.6768237948417664, "logps/rejected": -1.1760004758834839, "loss": 0.3289, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10029733180999756, "rewards/rejected": -0.7108488082885742, "step": 1730 }, { "epoch": 0.9108755398508048, "grad_norm": 12.552976379248005, "learning_rate": 2.3938097106119216e-08, "logits/chosen": -3.7278189659118652, "logits/rejected": -3.972843647003174, "logps/chosen": -0.6094867587089539, "logps/rejected": -1.1807337999343872, "loss": 0.1072, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.15854167938232422, "rewards/rejected": -0.5542012453079224, "step": 1740 }, { "epoch": 0.9161104567464992, "grad_norm": 2.1007604039088514, "learning_rate": 2.12241963249406e-08, "logits/chosen": -3.621340274810791, "logits/rejected": -4.070878505706787, "logps/chosen": -0.6373471617698669, "logps/rejected": -1.2419856786727905, "loss": 0.1475, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.14094902575016022, "rewards/rejected": -0.5264729857444763, "step": 1750 }, { "epoch": 0.9213453736421934, "grad_norm": 12.311524530866398, "learning_rate": 1.8670202772942568e-08, "logits/chosen": -3.631922483444214, "logits/rejected": -3.9891021251678467, "logps/chosen": -0.6251589059829712, "logps/rejected": -1.3054759502410889, "loss": 0.2536, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19476152956485748, "rewards/rejected": -0.747911810874939, "step": 1760 }, { "epoch": 0.9265802905378877, "grad_norm": 6.827588288274221, "learning_rate": 1.6276969463224545e-08, "logits/chosen": -3.5727906227111816, "logits/rejected": -3.9334945678710938, "logps/chosen": -0.7015948295593262, "logps/rejected": -1.320188045501709, "loss": 0.1134, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09665700793266296, "rewards/rejected": -0.5031177401542664, "step": 1770 }, { "epoch": 0.931815207433582, "grad_norm": 2.07334457771776, "learning_rate": 1.4045295716271e-08, "logits/chosen": -3.584230899810791, "logits/rejected": -4.013778209686279, "logps/chosen": -0.6425756216049194, "logps/rejected": -1.4021472930908203, "loss": 0.1508, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.12355978786945343, "rewards/rejected": -0.4425305426120758, "step": 1780 }, { "epoch": 0.9370501243292763, "grad_norm": 6.207844735521484, "learning_rate": 1.1975926892984766e-08, "logits/chosen": -3.419482707977295, "logits/rejected": -3.659214496612549, "logps/chosen": -0.5653207898139954, "logps/rejected": -1.1019750833511353, "loss": 0.1604, "rewards/accuracies": 0.75, "rewards/chosen": -0.1579764187335968, "rewards/rejected": -0.5906132459640503, "step": 1790 }, { "epoch": 0.9422850412249706, "grad_norm": 9.527007176662295, "learning_rate": 1.0069554145742787e-08, "logits/chosen": -3.579073667526245, "logits/rejected": -3.9208247661590576, "logps/chosen": -0.6493052840232849, "logps/rejected": -1.1913435459136963, "loss": 0.2047, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10687317699193954, "rewards/rejected": -0.4368254542350769, "step": 1800 }, { "epoch": 0.9475199581206648, "grad_norm": 31.675698349395965, "learning_rate": 8.326814187556485e-09, "logits/chosen": -3.670579433441162, "logits/rejected": -4.052734375, "logps/chosen": -0.6743156313896179, "logps/rejected": -1.2803795337677002, "loss": 0.2567, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.12054960429668427, "rewards/rejected": -0.5073380470275879, "step": 1810 }, { "epoch": 0.9527548750163591, "grad_norm": 42.855469273806456, "learning_rate": 6.7482890794151594e-09, "logits/chosen": -3.602036237716675, "logits/rejected": -3.961235761642456, "logps/chosen": -0.6719382405281067, "logps/rejected": -1.306571364402771, "loss": 0.1819, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09778688848018646, "rewards/rejected": -0.545978844165802, "step": 1820 }, { "epoch": 0.9579897919120534, "grad_norm": 14.021917338266821, "learning_rate": 5.334506035882036e-09, "logits/chosen": -3.5885958671569824, "logits/rejected": -3.972801685333252, "logps/chosen": -0.6078780293464661, "logps/rejected": -1.2133899927139282, "loss": 0.2865, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.18109995126724243, "rewards/rejected": -0.8068861961364746, "step": 1830 }, { "epoch": 0.9632247088077477, "grad_norm": 6.0298508108639925, "learning_rate": 4.0859372490090194e-09, "logits/chosen": -3.607355833053589, "logits/rejected": -3.94720196723938, "logps/chosen": -0.5455012917518616, "logps/rejected": -1.1150436401367188, "loss": 0.3449, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1432439386844635, "rewards/rejected": -0.6401658058166504, "step": 1840 }, { "epoch": 0.9684596257034419, "grad_norm": 4.5937644631506105, "learning_rate": 3.0029997306283416e-09, "logits/chosen": -3.550992250442505, "logits/rejected": -3.908057451248169, "logps/chosen": -0.5658137798309326, "logps/rejected": -1.1313598155975342, "loss": 0.1792, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.10125686228275299, "rewards/rejected": -0.37562742829322815, "step": 1850 }, { "epoch": 0.9736945425991362, "grad_norm": 0.9472001507121848, "learning_rate": 2.0860551730742526e-09, "logits/chosen": -3.4225573539733887, "logits/rejected": -3.754948377609253, "logps/chosen": -0.6793702244758606, "logps/rejected": -1.271066427230835, "loss": 0.075, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.12655304372310638, "rewards/rejected": -0.5372025966644287, "step": 1860 }, { "epoch": 0.9789294594948306, "grad_norm": 28.725611670468133, "learning_rate": 1.3354098283802628e-09, "logits/chosen": -3.5682125091552734, "logits/rejected": -3.978659152984619, "logps/chosen": -0.6888954043388367, "logps/rejected": -1.3216421604156494, "loss": 0.2941, "rewards/accuracies": 0.875, "rewards/chosen": -0.17312012612819672, "rewards/rejected": -0.5689764022827148, "step": 1870 }, { "epoch": 0.9841643763905248, "grad_norm": 6.5557009909155335, "learning_rate": 7.513144059937415e-10, "logits/chosen": -3.502890110015869, "logits/rejected": -3.8684380054473877, "logps/chosen": -0.6373413801193237, "logps/rejected": -1.253266453742981, "loss": 0.1433, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.14787249267101288, "rewards/rejected": -0.5477157831192017, "step": 1880 }, { "epoch": 0.9893992932862191, "grad_norm": 7.693007289411357, "learning_rate": 3.3396398904106393e-10, "logits/chosen": -3.721400737762451, "logits/rejected": -4.00911808013916, "logps/chosen": -0.5624986886978149, "logps/rejected": -1.1344749927520752, "loss": 0.3553, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2794850170612335, "rewards/rejected": -0.8449773788452148, "step": 1890 }, { "epoch": 0.9946342101819133, "grad_norm": 10.230098634327602, "learning_rate": 8.349796917112018e-11, "logits/chosen": -3.5617072582244873, "logits/rejected": -3.961862087249756, "logps/chosen": -0.6174992322921753, "logps/rejected": -1.282220482826233, "loss": 0.2374, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.14698410034179688, "rewards/rejected": -0.745864987373352, "step": 1900 }, { "epoch": 0.9998691270776077, "grad_norm": 19.58846312067376, "learning_rate": 0.0, "logits/chosen": -3.5836310386657715, "logits/rejected": -3.961308717727661, "logps/chosen": -0.7217192649841309, "logps/rejected": -1.424443006515503, "loss": 0.3428, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.1093846932053566, "rewards/rejected": -0.5250765681266785, "step": 1910 }, { "epoch": 0.9998691270776077, "step": 1910, "total_flos": 167763918716928.0, "train_loss": 0.3737170026252407, "train_runtime": 21184.4179, "train_samples_per_second": 2.886, "train_steps_per_second": 0.09 } ], "logging_steps": 10, "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 800, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 167763918716928.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }