{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5215803885773895, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.000000000000001e-07, "logits/chosen": -2.2717783451080322, "logits/rejected": -2.2640371322631836, "logps/chosen": -200.07493591308594, "logps/rejected": -200.70086669921875, "loss": 0.6789, "rewards/accuracies": 0.5625, "rewards/chosen": 0.023946668952703476, "rewards/margins": 0.029492639005184174, "rewards/rejected": -0.005545974709093571, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -2.2359213829040527, "logits/rejected": -2.2241828441619873, "logps/chosen": -188.74400329589844, "logps/rejected": -181.30078125, "loss": 0.7042, "rewards/accuracies": 0.5, "rewards/chosen": -0.022877119481563568, "rewards/margins": -0.02025613933801651, "rewards/rejected": -0.0026209834031760693, "step": 2 }, { "epoch": 0.0, "learning_rate": 1.5e-06, "logits/chosen": -2.2504844665527344, "logits/rejected": -2.2917656898498535, "logps/chosen": -182.1482391357422, "logps/rejected": -201.4050750732422, "loss": 0.6934, "rewards/accuracies": 0.5625, "rewards/chosen": 0.007386707700788975, "rewards/margins": 0.0019398471340537071, "rewards/rejected": 0.005446866154670715, "step": 3 }, { "epoch": 0.0, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -2.166776180267334, "logits/rejected": -2.0744781494140625, "logps/chosen": -173.78936767578125, "logps/rejected": -150.8326416015625, "loss": 0.6891, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0020218612626194954, "rewards/margins": 0.009007596410810947, "rewards/rejected": -0.011029457673430443, "step": 4 }, { "epoch": 0.01, "learning_rate": 2.5e-06, "logits/chosen": -2.1799259185791016, "logits/rejected": -2.3425800800323486, "logps/chosen": -137.8708953857422, "logps/rejected": -148.37060546875, "loss": 0.6906, "rewards/accuracies": 0.5, "rewards/chosen": 0.012697315774857998, "rewards/margins": 0.00599064864218235, "rewards/rejected": 0.006706667132675648, "step": 5 }, { "epoch": 0.01, "learning_rate": 3e-06, "logits/chosen": -2.1913797855377197, "logits/rejected": -2.1852920055389404, "logps/chosen": -127.57758331298828, "logps/rejected": -138.31591796875, "loss": 0.7037, "rewards/accuracies": 0.125, "rewards/chosen": -0.010897636413574219, "rewards/margins": -0.02074580080807209, "rewards/rejected": 0.009848165325820446, "step": 6 }, { "epoch": 0.01, "learning_rate": 3.5000000000000004e-06, "logits/chosen": -2.1357011795043945, "logits/rejected": -2.136214256286621, "logps/chosen": -139.83346557617188, "logps/rejected": -145.2589111328125, "loss": 0.6845, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0056040287017822266, "rewards/margins": 0.01838543452322483, "rewards/rejected": -0.01278140489012003, "step": 7 }, { "epoch": 0.01, "learning_rate": 4.000000000000001e-06, "logits/chosen": -2.105727195739746, "logits/rejected": -2.0614218711853027, "logps/chosen": -195.34869384765625, "logps/rejected": -206.2098388671875, "loss": 0.6963, "rewards/accuracies": 0.1875, "rewards/chosen": 0.030789854004979134, "rewards/margins": -0.005677317269146442, "rewards/rejected": 0.03646716848015785, "step": 8 }, { "epoch": 0.01, "learning_rate": 4.5e-06, "logits/chosen": -2.3017406463623047, "logits/rejected": -2.3441271781921387, "logps/chosen": -174.49053955078125, "logps/rejected": -197.5611572265625, "loss": 0.6875, "rewards/accuracies": 0.625, "rewards/chosen": -0.006907796021550894, "rewards/margins": 0.012369632720947266, "rewards/rejected": -0.019277429208159447, "step": 9 }, { "epoch": 0.01, "learning_rate": 5e-06, "logits/chosen": -2.257244825363159, "logits/rejected": -2.2989351749420166, "logps/chosen": -182.47164916992188, "logps/rejected": -170.6004180908203, "loss": 0.6925, "rewards/accuracies": 0.375, "rewards/chosen": 0.009080934338271618, "rewards/margins": 0.0034890654496848583, "rewards/rejected": 0.005591869354248047, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.500000000000001e-06, "logits/chosen": -2.084881544113159, "logits/rejected": -2.1664023399353027, "logps/chosen": -147.18572998046875, "logps/rejected": -154.4085693359375, "loss": 0.7014, "rewards/accuracies": 0.375, "rewards/chosen": -0.014512108638882637, "rewards/margins": -0.0151824951171875, "rewards/rejected": 0.0006703853141516447, "step": 11 }, { "epoch": 0.01, "learning_rate": 6e-06, "logits/chosen": -2.122969150543213, "logits/rejected": -2.114781379699707, "logps/chosen": -238.08493041992188, "logps/rejected": -221.2827606201172, "loss": 0.6798, "rewards/accuracies": 0.625, "rewards/chosen": 0.003618289018049836, "rewards/margins": 0.028017427772283554, "rewards/rejected": -0.02439913898706436, "step": 12 }, { "epoch": 0.01, "learning_rate": 6.5000000000000004e-06, "logits/chosen": -2.3176565170288086, "logits/rejected": -2.26943039894104, "logps/chosen": -166.89556884765625, "logps/rejected": -156.35850524902344, "loss": 0.6901, "rewards/accuracies": 0.4375, "rewards/chosen": 0.013709260150790215, "rewards/margins": 0.007927654311060905, "rewards/rejected": 0.00578160397708416, "step": 13 }, { "epoch": 0.01, "learning_rate": 7.000000000000001e-06, "logits/chosen": -2.2808492183685303, "logits/rejected": -2.295313596725464, "logps/chosen": -158.31381225585938, "logps/rejected": -165.48663330078125, "loss": 0.6953, "rewards/accuracies": 0.4375, "rewards/chosen": 0.01445994433015585, "rewards/margins": -0.002848696894943714, "rewards/rejected": 0.017308639362454414, "step": 14 }, { "epoch": 0.02, "learning_rate": 7.5e-06, "logits/chosen": -2.428576707839966, "logits/rejected": -2.4046826362609863, "logps/chosen": -198.4075164794922, "logps/rejected": -199.75180053710938, "loss": 0.6883, "rewards/accuracies": 0.4375, "rewards/chosen": 0.012678648345172405, "rewards/margins": 0.010429286397993565, "rewards/rejected": 0.0022493600845336914, "step": 15 }, { "epoch": 0.02, "learning_rate": 8.000000000000001e-06, "logits/chosen": -2.102433681488037, "logits/rejected": -2.098867893218994, "logps/chosen": -130.66616821289062, "logps/rejected": -129.43551635742188, "loss": 0.6897, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00463492888957262, "rewards/margins": 0.007227444555610418, "rewards/rejected": -0.0025925161316990852, "step": 16 }, { "epoch": 0.02, "learning_rate": 8.500000000000002e-06, "logits/chosen": -2.2135839462280273, "logits/rejected": -2.2183382511138916, "logps/chosen": -157.07391357421875, "logps/rejected": -173.192138671875, "loss": 0.7028, "rewards/accuracies": 0.375, "rewards/chosen": -0.00783538818359375, "rewards/margins": -0.018341876566410065, "rewards/rejected": 0.010506488382816315, "step": 17 }, { "epoch": 0.02, "learning_rate": 9e-06, "logits/chosen": -2.2919342517852783, "logits/rejected": -2.3105809688568115, "logps/chosen": -212.9804229736328, "logps/rejected": -213.6470947265625, "loss": 0.6944, "rewards/accuracies": 0.375, "rewards/chosen": 0.017333555966615677, "rewards/margins": -0.0019711018539965153, "rewards/rejected": 0.019304655492305756, "step": 18 }, { "epoch": 0.02, "learning_rate": 9.5e-06, "logits/chosen": -2.150813579559326, "logits/rejected": -2.1184804439544678, "logps/chosen": -164.96514892578125, "logps/rejected": -159.76754760742188, "loss": 0.688, "rewards/accuracies": 0.4375, "rewards/chosen": -0.016628170385956764, "rewards/margins": 0.01211006660014391, "rewards/rejected": -0.02873823791742325, "step": 19 }, { "epoch": 0.02, "learning_rate": 1e-05, "logits/chosen": -2.357393264770508, "logits/rejected": -2.284986734390259, "logps/chosen": -206.59085083007812, "logps/rejected": -196.2120819091797, "loss": 0.6848, "rewards/accuracies": 0.625, "rewards/chosen": 0.006941366009414196, "rewards/margins": 0.018102647736668587, "rewards/rejected": -0.011161278933286667, "step": 20 }, { "epoch": 0.02, "learning_rate": 1.05e-05, "logits/chosen": -2.1450111865997314, "logits/rejected": -2.2165582180023193, "logps/chosen": -157.14804077148438, "logps/rejected": -169.00897216796875, "loss": 0.6747, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01565234735608101, "rewards/margins": 0.03814287483692169, "rewards/rejected": -0.022490523755550385, "step": 21 }, { "epoch": 0.02, "learning_rate": 1.1000000000000001e-05, "logits/chosen": -2.194695234298706, "logits/rejected": -2.1587462425231934, "logps/chosen": -180.54441833496094, "logps/rejected": -179.88087463378906, "loss": 0.6891, "rewards/accuracies": 0.625, "rewards/chosen": 0.004329085350036621, "rewards/margins": 0.008360721170902252, "rewards/rejected": -0.0040316348895430565, "step": 22 }, { "epoch": 0.02, "learning_rate": 1.1500000000000002e-05, "logits/chosen": -2.224414825439453, "logits/rejected": -2.2185750007629395, "logps/chosen": -172.73263549804688, "logps/rejected": -164.4583282470703, "loss": 0.6916, "rewards/accuracies": 0.5, "rewards/chosen": -0.007931852713227272, "rewards/margins": 0.0039233677089214325, "rewards/rejected": -0.011855222284793854, "step": 23 }, { "epoch": 0.03, "learning_rate": 1.2e-05, "logits/chosen": -2.2102274894714355, "logits/rejected": -2.2018349170684814, "logps/chosen": -187.68124389648438, "logps/rejected": -195.62225341796875, "loss": 0.7161, "rewards/accuracies": 0.25, "rewards/chosen": -0.016138983890414238, "rewards/margins": -0.04391060024499893, "rewards/rejected": 0.027771614491939545, "step": 24 }, { "epoch": 0.03, "learning_rate": 1.25e-05, "logits/chosen": -2.124497413635254, "logits/rejected": -2.180361270904541, "logps/chosen": -173.36505126953125, "logps/rejected": -188.89918518066406, "loss": 0.7035, "rewards/accuracies": 0.3125, "rewards/chosen": -0.021524429321289062, "rewards/margins": -0.02002444490790367, "rewards/rejected": -0.0014999869745224714, "step": 25 }, { "epoch": 0.03, "learning_rate": 1.3000000000000001e-05, "logits/chosen": -2.154703140258789, "logits/rejected": -2.2054295539855957, "logps/chosen": -161.19815063476562, "logps/rejected": -172.0135040283203, "loss": 0.6995, "rewards/accuracies": 0.375, "rewards/chosen": -0.007346701342612505, "rewards/margins": -0.011799763888120651, "rewards/rejected": 0.004453063011169434, "step": 26 }, { "epoch": 0.03, "learning_rate": 1.3500000000000001e-05, "logits/chosen": -2.2279651165008545, "logits/rejected": -2.360706329345703, "logps/chosen": -134.864501953125, "logps/rejected": -170.91477966308594, "loss": 0.6831, "rewards/accuracies": 0.5625, "rewards/chosen": -0.017622999846935272, "rewards/margins": 0.021318625658750534, "rewards/rejected": -0.03894162178039551, "step": 27 }, { "epoch": 0.03, "learning_rate": 1.4000000000000001e-05, "logits/chosen": -2.119718551635742, "logits/rejected": -2.1303789615631104, "logps/chosen": -153.42706298828125, "logps/rejected": -149.59426879882812, "loss": 0.7052, "rewards/accuracies": 0.5, "rewards/chosen": -0.022306587547063828, "rewards/margins": -0.02277979999780655, "rewards/rejected": 0.00047321245074272156, "step": 28 }, { "epoch": 0.03, "learning_rate": 1.45e-05, "logits/chosen": -2.1661736965179443, "logits/rejected": -2.200699806213379, "logps/chosen": -134.8897705078125, "logps/rejected": -173.7844696044922, "loss": 0.6753, "rewards/accuracies": 0.6875, "rewards/chosen": -0.013965796679258347, "rewards/margins": 0.03791213408112526, "rewards/rejected": -0.05187792703509331, "step": 29 }, { "epoch": 0.03, "learning_rate": 1.5e-05, "logits/chosen": -2.1092920303344727, "logits/rejected": -2.1575889587402344, "logps/chosen": -156.42156982421875, "logps/rejected": -184.9061737060547, "loss": 0.6931, "rewards/accuracies": 0.4375, "rewards/chosen": -0.02043609507381916, "rewards/margins": 0.0014666561037302017, "rewards/rejected": -0.02190275304019451, "step": 30 }, { "epoch": 0.03, "learning_rate": 1.55e-05, "logits/chosen": -2.1494805812835693, "logits/rejected": -2.235766887664795, "logps/chosen": -147.58779907226562, "logps/rejected": -176.71292114257812, "loss": 0.6972, "rewards/accuracies": 0.375, "rewards/chosen": -0.020243335515260696, "rewards/margins": -0.007363701239228249, "rewards/rejected": -0.012879634276032448, "step": 31 }, { "epoch": 0.03, "learning_rate": 1.6000000000000003e-05, "logits/chosen": -2.0302445888519287, "logits/rejected": -2.072943687438965, "logps/chosen": -161.97325134277344, "logps/rejected": -169.5047149658203, "loss": 0.7081, "rewards/accuracies": 0.3125, "rewards/chosen": -0.033078648149967194, "rewards/margins": -0.026705406606197357, "rewards/rejected": -0.006373238749802113, "step": 32 }, { "epoch": 0.03, "learning_rate": 1.65e-05, "logits/chosen": -2.1231038570404053, "logits/rejected": -2.164695978164673, "logps/chosen": -177.8040313720703, "logps/rejected": -184.5164794921875, "loss": 0.7067, "rewards/accuracies": 0.3125, "rewards/chosen": -0.02300238609313965, "rewards/margins": -0.026141025125980377, "rewards/rejected": 0.003138638101518154, "step": 33 }, { "epoch": 0.04, "learning_rate": 1.7000000000000003e-05, "logits/chosen": -2.297323703765869, "logits/rejected": -2.2965850830078125, "logps/chosen": -138.19195556640625, "logps/rejected": -146.45855712890625, "loss": 0.6963, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01872837543487549, "rewards/margins": -0.003586245933547616, "rewards/rejected": -0.01514213066548109, "step": 34 }, { "epoch": 0.04, "learning_rate": 1.75e-05, "logits/chosen": -1.9681299924850464, "logits/rejected": -2.0026988983154297, "logps/chosen": -148.55194091796875, "logps/rejected": -154.24107360839844, "loss": 0.6965, "rewards/accuracies": 0.4375, "rewards/chosen": -0.027061177417635918, "rewards/margins": -0.003932238090783358, "rewards/rejected": -0.023128939792513847, "step": 35 }, { "epoch": 0.04, "learning_rate": 1.8e-05, "logits/chosen": -2.351780414581299, "logits/rejected": -2.4137086868286133, "logps/chosen": -206.77256774902344, "logps/rejected": -187.95159912109375, "loss": 0.6941, "rewards/accuracies": 0.5, "rewards/chosen": -0.015692900866270065, "rewards/margins": -0.0006634213495999575, "rewards/rejected": -0.015029479749500751, "step": 36 }, { "epoch": 0.04, "learning_rate": 1.85e-05, "logits/chosen": -2.145651340484619, "logits/rejected": -2.1530380249023438, "logps/chosen": -174.49249267578125, "logps/rejected": -170.29107666015625, "loss": 0.6981, "rewards/accuracies": 0.5, "rewards/chosen": -0.05813932418823242, "rewards/margins": -0.007088134065270424, "rewards/rejected": -0.05105118826031685, "step": 37 }, { "epoch": 0.04, "learning_rate": 1.9e-05, "logits/chosen": -2.119550943374634, "logits/rejected": -2.1285836696624756, "logps/chosen": -158.59103393554688, "logps/rejected": -161.4877471923828, "loss": 0.6626, "rewards/accuracies": 0.75, "rewards/chosen": -0.004540919791907072, "rewards/margins": 0.06354211270809174, "rewards/rejected": -0.0680830180644989, "step": 38 }, { "epoch": 0.04, "learning_rate": 1.9500000000000003e-05, "logits/chosen": -2.0205461978912354, "logits/rejected": -2.0473952293395996, "logps/chosen": -132.5583953857422, "logps/rejected": -158.61367797851562, "loss": 0.7152, "rewards/accuracies": 0.1875, "rewards/chosen": -0.05132186412811279, "rewards/margins": -0.04252650961279869, "rewards/rejected": -0.008795355446636677, "step": 39 }, { "epoch": 0.04, "learning_rate": 2e-05, "logits/chosen": -2.1021435260772705, "logits/rejected": -2.056175708770752, "logps/chosen": -164.51136779785156, "logps/rejected": -140.69076538085938, "loss": 0.704, "rewards/accuracies": 0.3125, "rewards/chosen": -0.042324475944042206, "rewards/margins": -0.01991286501288414, "rewards/rejected": -0.022411609068512917, "step": 40 }, { "epoch": 0.04, "learning_rate": 2.05e-05, "logits/chosen": -2.0000710487365723, "logits/rejected": -2.0445573329925537, "logps/chosen": -144.15602111816406, "logps/rejected": -158.8379364013672, "loss": 0.687, "rewards/accuracies": 0.5, "rewards/chosen": 0.007117677479982376, "rewards/margins": 0.014511799439787865, "rewards/rejected": -0.007394121494144201, "step": 41 }, { "epoch": 0.04, "learning_rate": 2.1e-05, "logits/chosen": -2.1619393825531006, "logits/rejected": -2.2450132369995117, "logps/chosen": -147.3644561767578, "logps/rejected": -152.8286895751953, "loss": 0.6965, "rewards/accuracies": 0.4375, "rewards/chosen": -0.050475671887397766, "rewards/margins": -0.003598665352910757, "rewards/rejected": -0.04687700420618057, "step": 42 }, { "epoch": 0.04, "learning_rate": 2.15e-05, "logits/chosen": -2.103001356124878, "logits/rejected": -2.1376430988311768, "logps/chosen": -197.96510314941406, "logps/rejected": -183.20042419433594, "loss": 0.6886, "rewards/accuracies": 0.5, "rewards/chosen": -0.02689351886510849, "rewards/margins": 0.01161129493266344, "rewards/rejected": -0.038504816591739655, "step": 43 }, { "epoch": 0.05, "learning_rate": 2.2000000000000003e-05, "logits/chosen": -2.249006748199463, "logits/rejected": -2.2560691833496094, "logps/chosen": -183.13180541992188, "logps/rejected": -191.20266723632812, "loss": 0.6994, "rewards/accuracies": 0.5, "rewards/chosen": -0.0708339661359787, "rewards/margins": -0.009988496080040932, "rewards/rejected": -0.06084546819329262, "step": 44 }, { "epoch": 0.05, "learning_rate": 2.25e-05, "logits/chosen": -2.076373815536499, "logits/rejected": -2.1558804512023926, "logps/chosen": -173.52224731445312, "logps/rejected": -189.98086547851562, "loss": 0.6965, "rewards/accuracies": 0.5, "rewards/chosen": -0.061136387288570404, "rewards/margins": -0.0023721233010292053, "rewards/rejected": -0.0587642677128315, "step": 45 }, { "epoch": 0.05, "learning_rate": 2.3000000000000003e-05, "logits/chosen": -2.2398018836975098, "logits/rejected": -2.1674294471740723, "logps/chosen": -162.8035125732422, "logps/rejected": -156.17276000976562, "loss": 0.7007, "rewards/accuracies": 0.3125, "rewards/chosen": -0.06224403530359268, "rewards/margins": -0.008054491132497787, "rewards/rejected": -0.054189540445804596, "step": 46 }, { "epoch": 0.05, "learning_rate": 2.35e-05, "logits/chosen": -2.152360200881958, "logits/rejected": -2.221158742904663, "logps/chosen": -241.5851593017578, "logps/rejected": -263.8956298828125, "loss": 0.6805, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04438929632306099, "rewards/margins": 0.027727916836738586, "rewards/rejected": -0.07211720943450928, "step": 47 }, { "epoch": 0.05, "learning_rate": 2.4e-05, "logits/chosen": -2.180600166320801, "logits/rejected": -2.184905529022217, "logps/chosen": -146.00100708007812, "logps/rejected": -172.64227294921875, "loss": 0.7003, "rewards/accuracies": 0.4375, "rewards/chosen": -0.04197859391570091, "rewards/margins": -0.011492157354950905, "rewards/rejected": -0.030486440286040306, "step": 48 }, { "epoch": 0.05, "learning_rate": 2.45e-05, "logits/chosen": -2.113537549972534, "logits/rejected": -2.1188883781433105, "logps/chosen": -145.5101318359375, "logps/rejected": -145.72979736328125, "loss": 0.6969, "rewards/accuracies": 0.5, "rewards/chosen": -0.09581132233142853, "rewards/margins": -0.0029689306393265724, "rewards/rejected": -0.09284238517284393, "step": 49 }, { "epoch": 0.05, "learning_rate": 2.5e-05, "logits/chosen": -2.2486584186553955, "logits/rejected": -2.2213432788848877, "logps/chosen": -165.0408172607422, "logps/rejected": -172.3516845703125, "loss": 0.672, "rewards/accuracies": 0.625, "rewards/chosen": -0.07465384155511856, "rewards/margins": 0.04853079840540886, "rewards/rejected": -0.12318463623523712, "step": 50 }, { "epoch": 0.05, "learning_rate": 2.5500000000000003e-05, "logits/chosen": -2.0694539546966553, "logits/rejected": -2.0400142669677734, "logps/chosen": -177.0946502685547, "logps/rejected": -164.363037109375, "loss": 0.734, "rewards/accuracies": 0.1875, "rewards/chosen": -0.11727481335401535, "rewards/margins": -0.07744203507900238, "rewards/rejected": -0.03983278200030327, "step": 51 }, { "epoch": 0.05, "learning_rate": 2.6000000000000002e-05, "logits/chosen": -2.2477283477783203, "logits/rejected": -2.322450876235962, "logps/chosen": -180.88040161132812, "logps/rejected": -194.59298706054688, "loss": 0.7052, "rewards/accuracies": 0.4375, "rewards/chosen": -0.05896530672907829, "rewards/margins": -0.019788123667240143, "rewards/rejected": -0.03917717933654785, "step": 52 }, { "epoch": 0.06, "learning_rate": 2.6500000000000004e-05, "logits/chosen": -2.28759765625, "logits/rejected": -2.2965030670166016, "logps/chosen": -166.57757568359375, "logps/rejected": -168.3147430419922, "loss": 0.7066, "rewards/accuracies": 0.3125, "rewards/chosen": -0.11757899075746536, "rewards/margins": -0.023945949971675873, "rewards/rejected": -0.09363303333520889, "step": 53 }, { "epoch": 0.06, "learning_rate": 2.7000000000000002e-05, "logits/chosen": -2.1356818675994873, "logits/rejected": -2.077864170074463, "logps/chosen": -163.74700927734375, "logps/rejected": -186.17466735839844, "loss": 0.6966, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0876019299030304, "rewards/margins": -0.0037088878452777863, "rewards/rejected": -0.08389303088188171, "step": 54 }, { "epoch": 0.06, "learning_rate": 2.7500000000000004e-05, "logits/chosen": -2.3550398349761963, "logits/rejected": -2.3566269874572754, "logps/chosen": -201.12591552734375, "logps/rejected": -196.99490356445312, "loss": 0.6809, "rewards/accuracies": 0.625, "rewards/chosen": -0.14813652634620667, "rewards/margins": 0.03020734339952469, "rewards/rejected": -0.17834386229515076, "step": 55 }, { "epoch": 0.06, "learning_rate": 2.8000000000000003e-05, "logits/chosen": -2.1153206825256348, "logits/rejected": -2.172855854034424, "logps/chosen": -176.15061950683594, "logps/rejected": -198.8987579345703, "loss": 0.7242, "rewards/accuracies": 0.375, "rewards/chosen": -0.1643160581588745, "rewards/margins": -0.05339653417468071, "rewards/rejected": -0.1109195277094841, "step": 56 }, { "epoch": 0.06, "learning_rate": 2.8499999999999998e-05, "logits/chosen": -2.0671656131744385, "logits/rejected": -2.146867036819458, "logps/chosen": -190.34786987304688, "logps/rejected": -229.0176544189453, "loss": 0.6826, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12184757739305496, "rewards/margins": 0.025761937722563744, "rewards/rejected": -0.14760951697826385, "step": 57 }, { "epoch": 0.06, "learning_rate": 2.9e-05, "logits/chosen": -2.1337296962738037, "logits/rejected": -2.1995086669921875, "logps/chosen": -152.1367950439453, "logps/rejected": -201.97044372558594, "loss": 0.6778, "rewards/accuracies": 0.625, "rewards/chosen": -0.1797308623790741, "rewards/margins": 0.04084575176239014, "rewards/rejected": -0.22057661414146423, "step": 58 }, { "epoch": 0.06, "learning_rate": 2.95e-05, "logits/chosen": -1.802043080329895, "logits/rejected": -1.7019507884979248, "logps/chosen": -136.04141235351562, "logps/rejected": -147.78366088867188, "loss": 0.6867, "rewards/accuracies": 0.625, "rewards/chosen": -0.14073634147644043, "rewards/margins": 0.02230207994580269, "rewards/rejected": -0.163038432598114, "step": 59 }, { "epoch": 0.06, "learning_rate": 3e-05, "logits/chosen": -2.1223766803741455, "logits/rejected": -2.229828357696533, "logps/chosen": -163.0341033935547, "logps/rejected": -186.51254272460938, "loss": 0.6933, "rewards/accuracies": 0.625, "rewards/chosen": -0.15651509165763855, "rewards/margins": 0.003519295249134302, "rewards/rejected": -0.16003437340259552, "step": 60 }, { "epoch": 0.06, "learning_rate": 3.05e-05, "logits/chosen": -2.0530059337615967, "logits/rejected": -2.076582908630371, "logps/chosen": -140.6927490234375, "logps/rejected": -159.9715576171875, "loss": 0.7026, "rewards/accuracies": 0.625, "rewards/chosen": -0.1471368670463562, "rewards/margins": -0.013904569670557976, "rewards/rejected": -0.13323231041431427, "step": 61 }, { "epoch": 0.06, "learning_rate": 3.1e-05, "logits/chosen": -2.2076821327209473, "logits/rejected": -2.2303273677825928, "logps/chosen": -162.5172882080078, "logps/rejected": -172.63409423828125, "loss": 0.7205, "rewards/accuracies": 0.5, "rewards/chosen": -0.21913205087184906, "rewards/margins": -0.046682070940732956, "rewards/rejected": -0.1724499762058258, "step": 62 }, { "epoch": 0.07, "learning_rate": 3.15e-05, "logits/chosen": -2.1623778343200684, "logits/rejected": -2.109609365463257, "logps/chosen": -190.25259399414062, "logps/rejected": -181.3391876220703, "loss": 0.6934, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2230222225189209, "rewards/margins": 0.014317656867206097, "rewards/rejected": -0.23733988404273987, "step": 63 }, { "epoch": 0.07, "learning_rate": 3.2000000000000005e-05, "logits/chosen": -2.1411020755767822, "logits/rejected": -2.087689161300659, "logps/chosen": -149.68978881835938, "logps/rejected": -144.92745971679688, "loss": 0.68, "rewards/accuracies": 0.625, "rewards/chosen": -0.18927860260009766, "rewards/margins": 0.03361104428768158, "rewards/rejected": -0.22288964688777924, "step": 64 }, { "epoch": 0.07, "learning_rate": 3.2500000000000004e-05, "logits/chosen": -2.1835758686065674, "logits/rejected": -2.1907880306243896, "logps/chosen": -164.97738647460938, "logps/rejected": -180.56930541992188, "loss": 0.6891, "rewards/accuracies": 0.5625, "rewards/chosen": -0.20788520574569702, "rewards/margins": 0.012470051646232605, "rewards/rejected": -0.22035524249076843, "step": 65 }, { "epoch": 0.07, "learning_rate": 3.3e-05, "logits/chosen": -2.3262267112731934, "logits/rejected": -2.2733724117279053, "logps/chosen": -179.5419921875, "logps/rejected": -180.0846710205078, "loss": 0.7209, "rewards/accuracies": 0.4375, "rewards/chosen": -0.21241910755634308, "rewards/margins": -0.04504784941673279, "rewards/rejected": -0.16737127304077148, "step": 66 }, { "epoch": 0.07, "learning_rate": 3.35e-05, "logits/chosen": -2.183920383453369, "logits/rejected": -2.155302047729492, "logps/chosen": -165.67642211914062, "logps/rejected": -179.09642028808594, "loss": 0.6755, "rewards/accuracies": 0.625, "rewards/chosen": -0.18955731391906738, "rewards/margins": 0.04666180908679962, "rewards/rejected": -0.2362191379070282, "step": 67 }, { "epoch": 0.07, "learning_rate": 3.4000000000000007e-05, "logits/chosen": -2.099074602127075, "logits/rejected": -2.144994020462036, "logps/chosen": -146.17672729492188, "logps/rejected": -151.28427124023438, "loss": 0.737, "rewards/accuracies": 0.3125, "rewards/chosen": -0.26217174530029297, "rewards/margins": -0.07887978851795197, "rewards/rejected": -0.1832919716835022, "step": 68 }, { "epoch": 0.07, "learning_rate": 3.45e-05, "logits/chosen": -2.2945265769958496, "logits/rejected": -2.2669451236724854, "logps/chosen": -189.3528594970703, "logps/rejected": -166.80990600585938, "loss": 0.6892, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2909608483314514, "rewards/margins": 0.024565648287534714, "rewards/rejected": -0.31552648544311523, "step": 69 }, { "epoch": 0.07, "learning_rate": 3.5e-05, "logits/chosen": -2.0787875652313232, "logits/rejected": -2.090841054916382, "logps/chosen": -192.3681640625, "logps/rejected": -224.5333251953125, "loss": 0.7097, "rewards/accuracies": 0.4375, "rewards/chosen": -0.29522770643234253, "rewards/margins": -0.02591385506093502, "rewards/rejected": -0.26931384205818176, "step": 70 }, { "epoch": 0.07, "learning_rate": 3.55e-05, "logits/chosen": -2.1215460300445557, "logits/rejected": -2.0683701038360596, "logps/chosen": -144.15745544433594, "logps/rejected": -141.385498046875, "loss": 0.7234, "rewards/accuracies": 0.3125, "rewards/chosen": -0.34091001749038696, "rewards/margins": -0.048917919397354126, "rewards/rejected": -0.2919921278953552, "step": 71 }, { "epoch": 0.08, "learning_rate": 3.6e-05, "logits/chosen": -2.1100878715515137, "logits/rejected": -2.112070083618164, "logps/chosen": -154.8167724609375, "logps/rejected": -158.28297424316406, "loss": 0.6744, "rewards/accuracies": 0.5, "rewards/chosen": -0.2560737431049347, "rewards/margins": 0.04427195340394974, "rewards/rejected": -0.30034565925598145, "step": 72 }, { "epoch": 0.08, "learning_rate": 3.65e-05, "logits/chosen": -2.129936933517456, "logits/rejected": -2.2630691528320312, "logps/chosen": -158.00205993652344, "logps/rejected": -176.6663055419922, "loss": 0.6952, "rewards/accuracies": 0.375, "rewards/chosen": -0.27465441823005676, "rewards/margins": 0.013333894312381744, "rewards/rejected": -0.2879883348941803, "step": 73 }, { "epoch": 0.08, "learning_rate": 3.7e-05, "logits/chosen": -2.2137179374694824, "logits/rejected": -2.219494342803955, "logps/chosen": -175.63868713378906, "logps/rejected": -165.37460327148438, "loss": 0.7134, "rewards/accuracies": 0.375, "rewards/chosen": -0.30476734042167664, "rewards/margins": -0.019104812294244766, "rewards/rejected": -0.28566253185272217, "step": 74 }, { "epoch": 0.08, "learning_rate": 3.7500000000000003e-05, "logits/chosen": -2.021366596221924, "logits/rejected": -1.953249216079712, "logps/chosen": -148.9893341064453, "logps/rejected": -147.3232421875, "loss": 0.6865, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2916088402271271, "rewards/margins": 0.026065416634082794, "rewards/rejected": -0.31767427921295166, "step": 75 }, { "epoch": 0.08, "learning_rate": 3.8e-05, "logits/chosen": -2.0788702964782715, "logits/rejected": -2.081282377243042, "logps/chosen": -142.72647094726562, "logps/rejected": -155.7174835205078, "loss": 0.6944, "rewards/accuracies": 0.375, "rewards/chosen": -0.33365076780319214, "rewards/margins": 0.008806329220533371, "rewards/rejected": -0.342457115650177, "step": 76 }, { "epoch": 0.08, "learning_rate": 3.85e-05, "logits/chosen": -2.141986608505249, "logits/rejected": -2.1325931549072266, "logps/chosen": -166.86537170410156, "logps/rejected": -197.6178436279297, "loss": 0.7052, "rewards/accuracies": 0.4375, "rewards/chosen": -0.34245288372039795, "rewards/margins": 0.011411521583795547, "rewards/rejected": -0.3538644015789032, "step": 77 }, { "epoch": 0.08, "learning_rate": 3.9000000000000006e-05, "logits/chosen": -2.215611457824707, "logits/rejected": -2.212402582168579, "logps/chosen": -161.9552459716797, "logps/rejected": -160.0537109375, "loss": 0.6606, "rewards/accuracies": 0.5, "rewards/chosen": -0.28822606801986694, "rewards/margins": 0.07802614569664001, "rewards/rejected": -0.36625221371650696, "step": 78 }, { "epoch": 0.08, "learning_rate": 3.9500000000000005e-05, "logits/chosen": -2.1117148399353027, "logits/rejected": -2.152738094329834, "logps/chosen": -187.3416290283203, "logps/rejected": -185.2106170654297, "loss": 0.7461, "rewards/accuracies": 0.25, "rewards/chosen": -0.38387393951416016, "rewards/margins": -0.09052477777004242, "rewards/rejected": -0.2933492064476013, "step": 79 }, { "epoch": 0.08, "learning_rate": 4e-05, "logits/chosen": -2.081353187561035, "logits/rejected": -2.1454594135284424, "logps/chosen": -148.2335968017578, "logps/rejected": -169.42164611816406, "loss": 0.7009, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34989604353904724, "rewards/margins": 0.00529644638299942, "rewards/rejected": -0.35519248247146606, "step": 80 }, { "epoch": 0.08, "learning_rate": 4.05e-05, "logits/chosen": -2.1243531703948975, "logits/rejected": -2.1677510738372803, "logps/chosen": -164.04400634765625, "logps/rejected": -185.00840759277344, "loss": 0.6988, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3187049627304077, "rewards/margins": 0.011884737759828568, "rewards/rejected": -0.3305897116661072, "step": 81 }, { "epoch": 0.09, "learning_rate": 4.1e-05, "logits/chosen": -2.0324885845184326, "logits/rejected": -1.9073715209960938, "logps/chosen": -171.0150604248047, "logps/rejected": -178.9537811279297, "loss": 0.6898, "rewards/accuracies": 0.5625, "rewards/chosen": -0.262698233127594, "rewards/margins": 0.012961791828274727, "rewards/rejected": -0.27566003799438477, "step": 82 }, { "epoch": 0.09, "learning_rate": 4.15e-05, "logits/chosen": -2.1523077487945557, "logits/rejected": -2.1646199226379395, "logps/chosen": -164.2784881591797, "logps/rejected": -170.58087158203125, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": -0.2823156714439392, "rewards/margins": 0.08943505585193634, "rewards/rejected": -0.37175074219703674, "step": 83 }, { "epoch": 0.09, "learning_rate": 4.2e-05, "logits/chosen": -2.132960081100464, "logits/rejected": -2.170064926147461, "logps/chosen": -217.71466064453125, "logps/rejected": -224.87718200683594, "loss": 0.6449, "rewards/accuracies": 0.75, "rewards/chosen": -0.338445246219635, "rewards/margins": 0.1073441132903099, "rewards/rejected": -0.4457893371582031, "step": 84 }, { "epoch": 0.09, "learning_rate": 4.25e-05, "logits/chosen": -2.263223648071289, "logits/rejected": -2.3083293437957764, "logps/chosen": -156.03329467773438, "logps/rejected": -174.6391143798828, "loss": 0.6873, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22993193566799164, "rewards/margins": 0.018939830362796783, "rewards/rejected": -0.24887175858020782, "step": 85 }, { "epoch": 0.09, "learning_rate": 4.3e-05, "logits/chosen": -2.265439510345459, "logits/rejected": -2.2670176029205322, "logps/chosen": -166.77760314941406, "logps/rejected": -166.4705352783203, "loss": 0.737, "rewards/accuracies": 0.5, "rewards/chosen": -0.3399352431297302, "rewards/margins": -0.06634283810853958, "rewards/rejected": -0.27359241247177124, "step": 86 }, { "epoch": 0.09, "learning_rate": 4.35e-05, "logits/chosen": -2.210916757583618, "logits/rejected": -2.2296037673950195, "logps/chosen": -142.21340942382812, "logps/rejected": -161.62600708007812, "loss": 0.6545, "rewards/accuracies": 0.625, "rewards/chosen": -0.301052987575531, "rewards/margins": 0.09594573080539703, "rewards/rejected": -0.39699873328208923, "step": 87 }, { "epoch": 0.09, "learning_rate": 4.4000000000000006e-05, "logits/chosen": -2.220825433731079, "logits/rejected": -2.1372034549713135, "logps/chosen": -156.62326049804688, "logps/rejected": -150.1629638671875, "loss": 0.6818, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2567201852798462, "rewards/margins": 0.031475357711315155, "rewards/rejected": -0.28819552063941956, "step": 88 }, { "epoch": 0.09, "learning_rate": 4.4500000000000004e-05, "logits/chosen": -2.080606698989868, "logits/rejected": -2.16312575340271, "logps/chosen": -162.34506225585938, "logps/rejected": -175.51246643066406, "loss": 0.7598, "rewards/accuracies": 0.5, "rewards/chosen": -0.4455791115760803, "rewards/margins": -0.09895279258489609, "rewards/rejected": -0.34662631154060364, "step": 89 }, { "epoch": 0.09, "learning_rate": 4.5e-05, "logits/chosen": -2.22636342048645, "logits/rejected": -2.32147479057312, "logps/chosen": -152.72509765625, "logps/rejected": -210.8067169189453, "loss": 0.6536, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2957713007926941, "rewards/margins": 0.11437603086233139, "rewards/rejected": -0.4101472795009613, "step": 90 }, { "epoch": 0.09, "learning_rate": 4.55e-05, "logits/chosen": -2.264246702194214, "logits/rejected": -2.322878122329712, "logps/chosen": -116.30667114257812, "logps/rejected": -136.15834045410156, "loss": 0.6652, "rewards/accuracies": 0.4375, "rewards/chosen": -0.21190989017486572, "rewards/margins": 0.09101668000221252, "rewards/rejected": -0.30292657017707825, "step": 91 }, { "epoch": 0.1, "learning_rate": 4.600000000000001e-05, "logits/chosen": -2.2319533824920654, "logits/rejected": -2.272087335586548, "logps/chosen": -209.2500762939453, "logps/rejected": -213.80284118652344, "loss": 0.7243, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5155326128005981, "rewards/margins": -0.025821635499596596, "rewards/rejected": -0.4897109568119049, "step": 92 }, { "epoch": 0.1, "learning_rate": 4.6500000000000005e-05, "logits/chosen": -2.3007993698120117, "logits/rejected": -2.233243227005005, "logps/chosen": -163.1545867919922, "logps/rejected": -162.89089965820312, "loss": 0.7095, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4529023766517639, "rewards/margins": -0.020960787311196327, "rewards/rejected": -0.43194156885147095, "step": 93 }, { "epoch": 0.1, "learning_rate": 4.7e-05, "logits/chosen": -2.0963847637176514, "logits/rejected": -2.120976209640503, "logps/chosen": -180.0958709716797, "logps/rejected": -205.76585388183594, "loss": 0.6049, "rewards/accuracies": 0.6875, "rewards/chosen": -0.30844229459762573, "rewards/margins": 0.21104586124420166, "rewards/rejected": -0.5194881558418274, "step": 94 }, { "epoch": 0.1, "learning_rate": 4.75e-05, "logits/chosen": -2.2864556312561035, "logits/rejected": -2.241337299346924, "logps/chosen": -202.34237670898438, "logps/rejected": -200.288330078125, "loss": 0.6701, "rewards/accuracies": 0.5, "rewards/chosen": -0.35101816058158875, "rewards/margins": 0.08223339170217514, "rewards/rejected": -0.4332515299320221, "step": 95 }, { "epoch": 0.1, "learning_rate": 4.8e-05, "logits/chosen": -2.1984472274780273, "logits/rejected": -2.230916976928711, "logps/chosen": -176.45132446289062, "logps/rejected": -189.4639892578125, "loss": 0.7, "rewards/accuracies": 0.625, "rewards/chosen": -0.3947378993034363, "rewards/margins": 0.01168506033718586, "rewards/rejected": -0.4064229726791382, "step": 96 }, { "epoch": 0.1, "learning_rate": 4.85e-05, "logits/chosen": -2.1936614513397217, "logits/rejected": -2.178769588470459, "logps/chosen": -172.03759765625, "logps/rejected": -184.67947387695312, "loss": 0.7077, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5262473821640015, "rewards/margins": -0.006830459460616112, "rewards/rejected": -0.5194169282913208, "step": 97 }, { "epoch": 0.1, "learning_rate": 4.9e-05, "logits/chosen": -2.1688649654388428, "logits/rejected": -2.1433870792388916, "logps/chosen": -158.25515747070312, "logps/rejected": -165.4871368408203, "loss": 0.7229, "rewards/accuracies": 0.5, "rewards/chosen": -0.3400518298149109, "rewards/margins": -0.027561640366911888, "rewards/rejected": -0.31249016523361206, "step": 98 }, { "epoch": 0.1, "learning_rate": 4.9500000000000004e-05, "logits/chosen": -2.25595760345459, "logits/rejected": -2.2448055744171143, "logps/chosen": -175.17568969726562, "logps/rejected": -179.33013916015625, "loss": 0.7008, "rewards/accuracies": 0.5, "rewards/chosen": -0.4599221646785736, "rewards/margins": 0.01116972602903843, "rewards/rejected": -0.4710919260978699, "step": 99 }, { "epoch": 0.1, "learning_rate": 5e-05, "logits/chosen": -2.0946810245513916, "logits/rejected": -2.0803956985473633, "logps/chosen": -118.56491088867188, "logps/rejected": -113.53369140625, "loss": 0.6508, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2681715190410614, "rewards/margins": 0.11040147393941879, "rewards/rejected": -0.3785730004310608, "step": 100 }, { "epoch": 0.11, "learning_rate": 4.9999832415172185e-05, "logits/chosen": -2.1622209548950195, "logits/rejected": -2.251312732696533, "logps/chosen": -148.2783966064453, "logps/rejected": -232.58053588867188, "loss": 0.6485, "rewards/accuracies": 0.5, "rewards/chosen": -0.36020269989967346, "rewards/margins": 0.13017883896827698, "rewards/rejected": -0.49038150906562805, "step": 101 }, { "epoch": 0.11, "learning_rate": 4.9999329662935534e-05, "logits/chosen": -2.1302261352539062, "logits/rejected": -2.146063804626465, "logps/chosen": -182.8227081298828, "logps/rejected": -189.34913635253906, "loss": 0.6736, "rewards/accuracies": 0.625, "rewards/chosen": -0.6098363995552063, "rewards/margins": 0.08228112757205963, "rewards/rejected": -0.6921175122261047, "step": 102 }, { "epoch": 0.11, "learning_rate": 4.9998491750030315e-05, "logits/chosen": -2.0695760250091553, "logits/rejected": -2.151846170425415, "logps/chosen": -157.34878540039062, "logps/rejected": -173.14401245117188, "loss": 0.7008, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5831299424171448, "rewards/margins": 0.029685884714126587, "rewards/rejected": -0.612815797328949, "step": 103 }, { "epoch": 0.11, "learning_rate": 4.999731868769027e-05, "logits/chosen": -2.1843104362487793, "logits/rejected": -2.107654094696045, "logps/chosen": -161.08914184570312, "logps/rejected": -142.64749145507812, "loss": 0.7995, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6787440776824951, "rewards/margins": -0.16208001971244812, "rewards/rejected": -0.5166640877723694, "step": 104 }, { "epoch": 0.11, "learning_rate": 4.999581049164237e-05, "logits/chosen": -2.0645735263824463, "logits/rejected": -2.147791862487793, "logps/chosen": -149.2325897216797, "logps/rejected": -173.02069091796875, "loss": 0.623, "rewards/accuracies": 0.75, "rewards/chosen": -0.3364582061767578, "rewards/margins": 0.1654883325099945, "rewards/rejected": -0.5019465684890747, "step": 105 }, { "epoch": 0.11, "learning_rate": 4.99939671821067e-05, "logits/chosen": -2.238675832748413, "logits/rejected": -2.308804988861084, "logps/chosen": -187.11219787597656, "logps/rejected": -192.90240478515625, "loss": 0.6952, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4883981943130493, "rewards/margins": 0.011134681291878223, "rewards/rejected": -0.49953290820121765, "step": 106 }, { "epoch": 0.11, "learning_rate": 4.999178878379611e-05, "logits/chosen": -2.1448211669921875, "logits/rejected": -2.163353681564331, "logps/chosen": -151.56005859375, "logps/rejected": -149.57290649414062, "loss": 0.7033, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5375577807426453, "rewards/margins": 0.005782928317785263, "rewards/rejected": -0.5433407425880432, "step": 107 }, { "epoch": 0.11, "learning_rate": 4.998927532591592e-05, "logits/chosen": -2.1543734073638916, "logits/rejected": -2.1483535766601562, "logps/chosen": -168.912841796875, "logps/rejected": -163.83612060546875, "loss": 0.7009, "rewards/accuracies": 0.5, "rewards/chosen": -0.40299952030181885, "rewards/margins": 0.021744927391409874, "rewards/rejected": -0.42474448680877686, "step": 108 }, { "epoch": 0.11, "learning_rate": 4.9986426842163515e-05, "logits/chosen": -2.196964979171753, "logits/rejected": -2.1282408237457275, "logps/chosen": -148.19366455078125, "logps/rejected": -141.83212280273438, "loss": 0.622, "rewards/accuracies": 0.625, "rewards/chosen": -0.36278650164604187, "rewards/margins": 0.18898257613182068, "rewards/rejected": -0.5517690777778625, "step": 109 }, { "epoch": 0.11, "learning_rate": 4.9983243370727914e-05, "logits/chosen": -2.136972665786743, "logits/rejected": -2.154928684234619, "logps/chosen": -146.0774383544922, "logps/rejected": -132.6929473876953, "loss": 0.6987, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5064787268638611, "rewards/margins": 0.05323922634124756, "rewards/rejected": -0.5597178936004639, "step": 110 }, { "epoch": 0.12, "learning_rate": 4.9979724954289244e-05, "logits/chosen": -2.1273446083068848, "logits/rejected": -2.1698343753814697, "logps/chosen": -140.70408630371094, "logps/rejected": -164.12890625, "loss": 0.5856, "rewards/accuracies": 0.625, "rewards/chosen": -0.27825814485549927, "rewards/margins": 0.30408480763435364, "rewards/rejected": -0.5823429822921753, "step": 111 }, { "epoch": 0.12, "learning_rate": 4.9975871640018154e-05, "logits/chosen": -2.156425952911377, "logits/rejected": -2.149127244949341, "logps/chosen": -209.48033142089844, "logps/rejected": -183.49618530273438, "loss": 0.6814, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3876263499259949, "rewards/margins": 0.07094159722328186, "rewards/rejected": -0.45856791734695435, "step": 112 }, { "epoch": 0.12, "learning_rate": 4.99716834795752e-05, "logits/chosen": -2.195077896118164, "logits/rejected": -2.171152353286743, "logps/chosen": -137.59030151367188, "logps/rejected": -144.78488159179688, "loss": 0.6117, "rewards/accuracies": 0.5625, "rewards/chosen": -0.201141357421875, "rewards/margins": 0.24156084656715393, "rewards/rejected": -0.4427022337913513, "step": 113 }, { "epoch": 0.12, "learning_rate": 4.996716052911017e-05, "logits/chosen": -2.0847880840301514, "logits/rejected": -2.141540050506592, "logps/chosen": -197.46426391601562, "logps/rejected": -199.35997009277344, "loss": 0.6728, "rewards/accuracies": 0.625, "rewards/chosen": -0.4001588821411133, "rewards/margins": 0.0924140065908432, "rewards/rejected": -0.4925729036331177, "step": 114 }, { "epoch": 0.12, "learning_rate": 4.996230284926128e-05, "logits/chosen": -1.791740894317627, "logits/rejected": -1.7912871837615967, "logps/chosen": -181.44020080566406, "logps/rejected": -201.87872314453125, "loss": 0.6437, "rewards/accuracies": 0.75, "rewards/chosen": -0.40906965732574463, "rewards/margins": 0.15568459033966064, "rewards/rejected": -0.5647542476654053, "step": 115 }, { "epoch": 0.12, "learning_rate": 4.99571105051544e-05, "logits/chosen": -2.1246492862701416, "logits/rejected": -2.122555732727051, "logps/chosen": -175.01571655273438, "logps/rejected": -176.2000732421875, "loss": 0.7431, "rewards/accuracies": 0.3125, "rewards/chosen": -0.36241090297698975, "rewards/margins": -0.04763410612940788, "rewards/rejected": -0.314776748418808, "step": 116 }, { "epoch": 0.12, "learning_rate": 4.99515835664022e-05, "logits/chosen": -2.066060781478882, "logits/rejected": -2.0761799812316895, "logps/chosen": -135.93301391601562, "logps/rejected": -162.29739379882812, "loss": 0.8265, "rewards/accuracies": 0.5, "rewards/chosen": -0.39111948013305664, "rewards/margins": -0.14791490137577057, "rewards/rejected": -0.2432045191526413, "step": 117 }, { "epoch": 0.12, "learning_rate": 4.994572210710315e-05, "logits/chosen": -2.0879368782043457, "logits/rejected": -2.1556289196014404, "logps/chosen": -197.7449951171875, "logps/rejected": -203.45278930664062, "loss": 0.6753, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2648780345916748, "rewards/margins": 0.068690724670887, "rewards/rejected": -0.3335687518119812, "step": 118 }, { "epoch": 0.12, "learning_rate": 4.993952620584058e-05, "logits/chosen": -2.2932987213134766, "logits/rejected": -2.330873489379883, "logps/chosen": -139.2769775390625, "logps/rejected": -152.76504516601562, "loss": 0.6608, "rewards/accuracies": 0.625, "rewards/chosen": -0.23606640100479126, "rewards/margins": 0.08653931319713593, "rewards/rejected": -0.3226057291030884, "step": 119 }, { "epoch": 0.13, "learning_rate": 4.993299594568163e-05, "logits/chosen": -2.1717681884765625, "logits/rejected": -2.176131010055542, "logps/chosen": -171.66172790527344, "logps/rejected": -188.26991271972656, "loss": 0.6917, "rewards/accuracies": 0.5, "rewards/chosen": -0.43707746267318726, "rewards/margins": 0.05936805531382561, "rewards/rejected": -0.49644553661346436, "step": 120 }, { "epoch": 0.13, "learning_rate": 4.992613141417608e-05, "logits/chosen": -2.1105880737304688, "logits/rejected": -2.0103209018707275, "logps/chosen": -155.08033752441406, "logps/rejected": -141.55419921875, "loss": 0.5905, "rewards/accuracies": 0.75, "rewards/chosen": -0.2041604220867157, "rewards/margins": 0.32256096601486206, "rewards/rejected": -0.5267213582992554, "step": 121 }, { "epoch": 0.13, "learning_rate": 4.9918932703355256e-05, "logits/chosen": -2.1467816829681396, "logits/rejected": -2.115278720855713, "logps/chosen": -152.28237915039062, "logps/rejected": -133.01211547851562, "loss": 0.7446, "rewards/accuracies": 0.625, "rewards/chosen": -0.44335490465164185, "rewards/margins": -0.022354908287525177, "rewards/rejected": -0.42100000381469727, "step": 122 }, { "epoch": 0.13, "learning_rate": 4.9911399909730714e-05, "logits/chosen": -2.318913221359253, "logits/rejected": -2.275251865386963, "logps/chosen": -160.89625549316406, "logps/rejected": -159.76943969726562, "loss": 0.6249, "rewards/accuracies": 0.8125, "rewards/chosen": -0.21618026494979858, "rewards/margins": 0.20579932630062103, "rewards/rejected": -0.42197954654693604, "step": 123 }, { "epoch": 0.13, "learning_rate": 4.990353313429303e-05, "logits/chosen": -2.1120331287384033, "logits/rejected": -2.0410468578338623, "logps/chosen": -183.7588653564453, "logps/rejected": -222.8588104248047, "loss": 0.684, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4172906279563904, "rewards/margins": 0.05181103199720383, "rewards/rejected": -0.4691016674041748, "step": 124 }, { "epoch": 0.13, "learning_rate": 4.989533248251037e-05, "logits/chosen": -2.0413904190063477, "logits/rejected": -2.050387382507324, "logps/chosen": -221.38436889648438, "logps/rejected": -220.9242401123047, "loss": 0.5889, "rewards/accuracies": 0.8125, "rewards/chosen": -0.18506890535354614, "rewards/margins": 0.2643589675426483, "rewards/rejected": -0.44942787289619446, "step": 125 }, { "epoch": 0.13, "learning_rate": 4.988679806432712e-05, "logits/chosen": -2.119227170944214, "logits/rejected": -2.140362024307251, "logps/chosen": -164.545166015625, "logps/rejected": -171.43540954589844, "loss": 0.727, "rewards/accuracies": 0.5625, "rewards/chosen": -0.41472572088241577, "rewards/margins": -0.024414831772446632, "rewards/rejected": -0.3903109133243561, "step": 126 }, { "epoch": 0.13, "learning_rate": 4.98779299941624e-05, "logits/chosen": -2.008235454559326, "logits/rejected": -2.0146234035491943, "logps/chosen": -182.02488708496094, "logps/rejected": -173.46522521972656, "loss": 0.7665, "rewards/accuracies": 0.5, "rewards/chosen": -0.39273571968078613, "rewards/margins": -0.07521196454763412, "rewards/rejected": -0.3175237476825714, "step": 127 }, { "epoch": 0.13, "learning_rate": 4.9868728390908526e-05, "logits/chosen": -2.194214105606079, "logits/rejected": -2.183523654937744, "logps/chosen": -144.79937744140625, "logps/rejected": -145.96029663085938, "loss": 0.71, "rewards/accuracies": 0.5625, "rewards/chosen": -0.15473532676696777, "rewards/margins": 0.0015310226008296013, "rewards/rejected": -0.15626637637615204, "step": 128 }, { "epoch": 0.13, "learning_rate": 4.985919337792944e-05, "logits/chosen": -2.0170881748199463, "logits/rejected": -2.021702527999878, "logps/chosen": -157.9308624267578, "logps/rejected": -187.35256958007812, "loss": 0.682, "rewards/accuracies": 0.5625, "rewards/chosen": -0.23143672943115234, "rewards/margins": 0.07466793060302734, "rewards/rejected": -0.3061046600341797, "step": 129 }, { "epoch": 0.14, "learning_rate": 4.9849325083059e-05, "logits/chosen": -2.345395088195801, "logits/rejected": -2.294443130493164, "logps/chosen": -202.0467987060547, "logps/rejected": -186.38247680664062, "loss": 0.692, "rewards/accuracies": 0.4375, "rewards/chosen": -0.19066600501537323, "rewards/margins": 0.07181324809789658, "rewards/rejected": -0.2624792754650116, "step": 130 }, { "epoch": 0.14, "learning_rate": 4.983912363859935e-05, "logits/chosen": -2.14298152923584, "logits/rejected": -2.1677587032318115, "logps/chosen": -125.33621215820312, "logps/rejected": -137.17404174804688, "loss": 0.6617, "rewards/accuracies": 0.625, "rewards/chosen": -0.23150649666786194, "rewards/margins": 0.10384988784790039, "rewards/rejected": -0.33535638451576233, "step": 131 }, { "epoch": 0.14, "learning_rate": 4.982858918131906e-05, "logits/chosen": -2.190133810043335, "logits/rejected": -2.247300863265991, "logps/chosen": -167.1696014404297, "logps/rejected": -168.60281372070312, "loss": 0.7058, "rewards/accuracies": 0.5, "rewards/chosen": -0.43472814559936523, "rewards/margins": 0.03844255581498146, "rewards/rejected": -0.473170667886734, "step": 132 }, { "epoch": 0.14, "learning_rate": 4.981772185245135e-05, "logits/chosen": -2.154466152191162, "logits/rejected": -2.1573760509490967, "logps/chosen": -168.56600952148438, "logps/rejected": -188.00607299804688, "loss": 0.6659, "rewards/accuracies": 0.375, "rewards/chosen": -0.3921719491481781, "rewards/margins": 0.10123846679925919, "rewards/rejected": -0.4934104084968567, "step": 133 }, { "epoch": 0.14, "learning_rate": 4.980652179769218e-05, "logits/chosen": -2.139244318008423, "logits/rejected": -2.170041561126709, "logps/chosen": -194.27749633789062, "logps/rejected": -194.87745666503906, "loss": 0.6939, "rewards/accuracies": 0.375, "rewards/chosen": -0.19869127869606018, "rewards/margins": 0.15997806191444397, "rewards/rejected": -0.3586694002151489, "step": 134 }, { "epoch": 0.14, "learning_rate": 4.979498916719828e-05, "logits/chosen": -2.0148520469665527, "logits/rejected": -2.0170676708221436, "logps/chosen": -178.43853759765625, "logps/rejected": -177.73255920410156, "loss": 0.6163, "rewards/accuracies": 0.75, "rewards/chosen": -0.15721122920513153, "rewards/margins": 0.2595069408416748, "rewards/rejected": -0.41671818494796753, "step": 135 }, { "epoch": 0.14, "learning_rate": 4.978312411558518e-05, "logits/chosen": -2.2386107444763184, "logits/rejected": -2.2416539192199707, "logps/chosen": -153.1583251953125, "logps/rejected": -165.6448516845703, "loss": 0.679, "rewards/accuracies": 0.625, "rewards/chosen": -0.48863306641578674, "rewards/margins": 0.107185497879982, "rewards/rejected": -0.5958185195922852, "step": 136 }, { "epoch": 0.14, "learning_rate": 4.977092680192507e-05, "logits/chosen": -1.9784085750579834, "logits/rejected": -1.9997234344482422, "logps/chosen": -148.26358032226562, "logps/rejected": -131.2608642578125, "loss": 0.7235, "rewards/accuracies": 0.375, "rewards/chosen": -0.3870002031326294, "rewards/margins": -0.03670747950673103, "rewards/rejected": -0.35029271245002747, "step": 137 }, { "epoch": 0.14, "learning_rate": 4.9758397389744734e-05, "logits/chosen": -2.2231109142303467, "logits/rejected": -2.1324055194854736, "logps/chosen": -175.7447052001953, "logps/rejected": -165.63980102539062, "loss": 0.6117, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3166625499725342, "rewards/margins": 0.23041030764579773, "rewards/rejected": -0.5470728278160095, "step": 138 }, { "epoch": 0.14, "learning_rate": 4.9745536047023324e-05, "logits/chosen": -2.1007936000823975, "logits/rejected": -2.2264654636383057, "logps/chosen": -205.53456115722656, "logps/rejected": -180.32980346679688, "loss": 0.6427, "rewards/accuracies": 0.625, "rewards/chosen": -0.45254045724868774, "rewards/margins": 0.15192103385925293, "rewards/rejected": -0.6044614911079407, "step": 139 }, { "epoch": 0.15, "learning_rate": 4.973234294619011e-05, "logits/chosen": -1.936387062072754, "logits/rejected": -2.0128026008605957, "logps/chosen": -145.3013916015625, "logps/rejected": -161.27244567871094, "loss": 0.6728, "rewards/accuracies": 0.5, "rewards/chosen": -0.3800917863845825, "rewards/margins": 0.11550942063331604, "rewards/rejected": -0.49560117721557617, "step": 140 }, { "epoch": 0.15, "learning_rate": 4.971881826412218e-05, "logits/chosen": -2.154330253601074, "logits/rejected": -2.237229585647583, "logps/chosen": -156.77369689941406, "logps/rejected": -176.82176208496094, "loss": 0.6794, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36407431960105896, "rewards/margins": 0.1079927533864975, "rewards/rejected": -0.47206708788871765, "step": 141 }, { "epoch": 0.15, "learning_rate": 4.9704962182142044e-05, "logits/chosen": -2.1118252277374268, "logits/rejected": -2.134640693664551, "logps/chosen": -166.844482421875, "logps/rejected": -167.47900390625, "loss": 0.6575, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4557605981826782, "rewards/margins": 0.10824623703956604, "rewards/rejected": -0.5640068054199219, "step": 142 }, { "epoch": 0.15, "learning_rate": 4.9690774886015244e-05, "logits/chosen": -2.1018967628479004, "logits/rejected": -2.1379857063293457, "logps/chosen": -176.53829956054688, "logps/rejected": -194.2880859375, "loss": 0.6878, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4790266156196594, "rewards/margins": 0.10418644547462463, "rewards/rejected": -0.5832130312919617, "step": 143 }, { "epoch": 0.15, "learning_rate": 4.967625656594782e-05, "logits/chosen": -2.0278987884521484, "logits/rejected": -2.0434699058532715, "logps/chosen": -145.874267578125, "logps/rejected": -151.4019317626953, "loss": 0.7109, "rewards/accuracies": 0.5, "rewards/chosen": -0.38604238629341125, "rewards/margins": 0.0024005472660064697, "rewards/rejected": -0.3884429931640625, "step": 144 }, { "epoch": 0.15, "learning_rate": 4.966140741658379e-05, "logits/chosen": -2.128117084503174, "logits/rejected": -2.169542074203491, "logps/chosen": -166.9932098388672, "logps/rejected": -166.7843475341797, "loss": 0.699, "rewards/accuracies": 0.5625, "rewards/chosen": -0.42920929193496704, "rewards/margins": 0.09982432425022125, "rewards/rejected": -0.5290336608886719, "step": 145 }, { "epoch": 0.15, "learning_rate": 4.9646227637002515e-05, "logits/chosen": -2.2982516288757324, "logits/rejected": -2.3041136264801025, "logps/chosen": -174.27261352539062, "logps/rejected": -181.32333374023438, "loss": 0.7344, "rewards/accuracies": 0.5, "rewards/chosen": -0.5106673836708069, "rewards/margins": 0.0017841942608356476, "rewards/rejected": -0.5124515891075134, "step": 146 }, { "epoch": 0.15, "learning_rate": 4.963071743071607e-05, "logits/chosen": -2.1971347332000732, "logits/rejected": -2.252727508544922, "logps/chosen": -164.0286102294922, "logps/rejected": -174.8863525390625, "loss": 0.7943, "rewards/accuracies": 0.5, "rewards/chosen": -0.5613827705383301, "rewards/margins": -0.05153023824095726, "rewards/rejected": -0.5098525285720825, "step": 147 }, { "epoch": 0.15, "learning_rate": 4.961487700566646e-05, "logits/chosen": -2.056870222091675, "logits/rejected": -2.05178165435791, "logps/chosen": -143.26681518554688, "logps/rejected": -183.6707000732422, "loss": 0.744, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3036291301250458, "rewards/margins": 0.006056658923625946, "rewards/rejected": -0.3096857964992523, "step": 148 }, { "epoch": 0.16, "learning_rate": 4.9598706574222886e-05, "logits/chosen": -2.159867286682129, "logits/rejected": -2.2232213020324707, "logps/chosen": -161.9599609375, "logps/rejected": -184.70950317382812, "loss": 0.7647, "rewards/accuracies": 0.5, "rewards/chosen": -0.32318681478500366, "rewards/margins": -0.07305724918842316, "rewards/rejected": -0.2501295506954193, "step": 149 }, { "epoch": 0.16, "learning_rate": 4.958220635317886e-05, "logits/chosen": -2.0702919960021973, "logits/rejected": -2.1789698600769043, "logps/chosen": -159.52511596679688, "logps/rejected": -187.4576416015625, "loss": 0.6574, "rewards/accuracies": 0.625, "rewards/chosen": -0.264212042093277, "rewards/margins": 0.14712117612361908, "rewards/rejected": -0.41133320331573486, "step": 150 }, { "epoch": 0.16, "learning_rate": 4.956537656374933e-05, "logits/chosen": -2.1290884017944336, "logits/rejected": -2.1409378051757812, "logps/chosen": -156.96994018554688, "logps/rejected": -166.66525268554688, "loss": 0.7229, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4065425992012024, "rewards/margins": 0.03063153848052025, "rewards/rejected": -0.43717408180236816, "step": 151 }, { "epoch": 0.16, "learning_rate": 4.9548217431567665e-05, "logits/chosen": -2.1941640377044678, "logits/rejected": -2.2345049381256104, "logps/chosen": -141.96652221679688, "logps/rejected": -145.7759552001953, "loss": 0.6756, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2183290421962738, "rewards/margins": 0.11311781406402588, "rewards/rejected": -0.3314468264579773, "step": 152 }, { "epoch": 0.16, "learning_rate": 4.95307291866827e-05, "logits/chosen": -2.1858081817626953, "logits/rejected": -2.1450536251068115, "logps/chosen": -155.87149047851562, "logps/rejected": -157.30197143554688, "loss": 0.6938, "rewards/accuracies": 0.75, "rewards/chosen": -0.34080612659454346, "rewards/margins": 0.06700116395950317, "rewards/rejected": -0.40780726075172424, "step": 153 }, { "epoch": 0.16, "learning_rate": 4.95129120635556e-05, "logits/chosen": -2.240177869796753, "logits/rejected": -2.171326160430908, "logps/chosen": -165.3470458984375, "logps/rejected": -152.51541137695312, "loss": 0.7462, "rewards/accuracies": 0.375, "rewards/chosen": -0.3122984766960144, "rewards/margins": -0.0709066390991211, "rewards/rejected": -0.24139180779457092, "step": 154 }, { "epoch": 0.16, "learning_rate": 4.949476630105669e-05, "logits/chosen": -2.2033369541168213, "logits/rejected": -2.219047784805298, "logps/chosen": -201.08102416992188, "logps/rejected": -198.62356567382812, "loss": 0.6226, "rewards/accuracies": 0.75, "rewards/chosen": -0.3462004065513611, "rewards/margins": 0.1935741901397705, "rewards/rejected": -0.5397745966911316, "step": 155 }, { "epoch": 0.16, "learning_rate": 4.9476292142462374e-05, "logits/chosen": -2.0294063091278076, "logits/rejected": -2.0089852809906006, "logps/chosen": -141.88330078125, "logps/rejected": -145.25686645507812, "loss": 0.6791, "rewards/accuracies": 0.5, "rewards/chosen": -0.2922942638397217, "rewards/margins": 0.08209052681922913, "rewards/rejected": -0.3743847906589508, "step": 156 }, { "epoch": 0.16, "learning_rate": 4.945748983545172e-05, "logits/chosen": -2.1608543395996094, "logits/rejected": -2.099316358566284, "logps/chosen": -140.04598999023438, "logps/rejected": -135.99771118164062, "loss": 0.5853, "rewards/accuracies": 0.625, "rewards/chosen": -0.14180442690849304, "rewards/margins": 0.3191547393798828, "rewards/rejected": -0.46095913648605347, "step": 157 }, { "epoch": 0.16, "learning_rate": 4.943835963210324e-05, "logits/chosen": -2.23140811920166, "logits/rejected": -2.160383701324463, "logps/chosen": -176.86737060546875, "logps/rejected": -173.28509521484375, "loss": 0.6432, "rewards/accuracies": 0.5, "rewards/chosen": -0.16037102043628693, "rewards/margins": 0.1384631097316742, "rewards/rejected": -0.2988341450691223, "step": 158 }, { "epoch": 0.17, "learning_rate": 4.941890178889149e-05, "logits/chosen": -2.226886034011841, "logits/rejected": -2.2156994342803955, "logps/chosen": -151.04501342773438, "logps/rejected": -152.1208953857422, "loss": 0.7934, "rewards/accuracies": 0.25, "rewards/chosen": -0.4307902157306671, "rewards/margins": -0.15255290269851685, "rewards/rejected": -0.2782372236251831, "step": 159 }, { "epoch": 0.17, "learning_rate": 4.939911656668361e-05, "logits/chosen": -2.1915454864501953, "logits/rejected": -2.1480343341827393, "logps/chosen": -127.00537109375, "logps/rejected": -118.40946960449219, "loss": 0.6693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05850663036108017, "rewards/margins": 0.07123789936304092, "rewards/rejected": -0.1297445297241211, "step": 160 }, { "epoch": 0.17, "learning_rate": 4.937900423073585e-05, "logits/chosen": -2.2268741130828857, "logits/rejected": -2.223992347717285, "logps/chosen": -180.8136749267578, "logps/rejected": -194.1834259033203, "loss": 0.7348, "rewards/accuracies": 0.375, "rewards/chosen": -0.2488463819026947, "rewards/margins": 0.01856984756886959, "rewards/rejected": -0.26741623878479004, "step": 161 }, { "epoch": 0.17, "learning_rate": 4.9358565050689985e-05, "logits/chosen": -2.163513660430908, "logits/rejected": -2.1871607303619385, "logps/chosen": -196.00209045410156, "logps/rejected": -197.71170043945312, "loss": 0.6894, "rewards/accuracies": 0.5, "rewards/chosen": -0.3192088305950165, "rewards/margins": 0.049343544989824295, "rewards/rejected": -0.36855238676071167, "step": 162 }, { "epoch": 0.17, "learning_rate": 4.933779930056975e-05, "logits/chosen": -2.1793665885925293, "logits/rejected": -2.140435218811035, "logps/chosen": -141.68283081054688, "logps/rejected": -157.8527069091797, "loss": 0.8341, "rewards/accuracies": 0.1875, "rewards/chosen": -0.5028910636901855, "rewards/margins": -0.241933673620224, "rewards/rejected": -0.26095739006996155, "step": 163 }, { "epoch": 0.17, "learning_rate": 4.93167072587771e-05, "logits/chosen": -2.1162264347076416, "logits/rejected": -2.159403085708618, "logps/chosen": -167.28477478027344, "logps/rejected": -159.3861083984375, "loss": 0.6934, "rewards/accuracies": 0.5625, "rewards/chosen": -0.32176950573921204, "rewards/margins": 0.0923306867480278, "rewards/rejected": -0.41410019993782043, "step": 164 }, { "epoch": 0.17, "learning_rate": 4.929528920808854e-05, "logits/chosen": -2.1691784858703613, "logits/rejected": -2.151355266571045, "logps/chosen": -210.38043212890625, "logps/rejected": -182.91961669921875, "loss": 0.5909, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3064863383769989, "rewards/margins": 0.28497347235679626, "rewards/rejected": -0.5914597511291504, "step": 165 }, { "epoch": 0.17, "learning_rate": 4.92735454356513e-05, "logits/chosen": -2.105543613433838, "logits/rejected": -2.123973846435547, "logps/chosen": -136.9002685546875, "logps/rejected": -134.88809204101562, "loss": 0.7425, "rewards/accuracies": 0.4375, "rewards/chosen": -0.40990638732910156, "rewards/margins": -0.044356442987918854, "rewards/rejected": -0.3655499517917633, "step": 166 }, { "epoch": 0.17, "learning_rate": 4.925147623297949e-05, "logits/chosen": -2.324575901031494, "logits/rejected": -2.318359613418579, "logps/chosen": -196.5352020263672, "logps/rejected": -168.95431518554688, "loss": 0.8018, "rewards/accuracies": 0.375, "rewards/chosen": -0.35998621582984924, "rewards/margins": -0.16241417825222015, "rewards/rejected": -0.1975720375776291, "step": 167 }, { "epoch": 0.18, "learning_rate": 4.922908189595018e-05, "logits/chosen": -2.0524559020996094, "logits/rejected": -2.0096793174743652, "logps/chosen": -154.45022583007812, "logps/rejected": -153.45050048828125, "loss": 0.6433, "rewards/accuracies": 0.625, "rewards/chosen": -0.21955004334449768, "rewards/margins": 0.17729638516902924, "rewards/rejected": -0.3968464732170105, "step": 168 }, { "epoch": 0.18, "learning_rate": 4.920636272479946e-05, "logits/chosen": -2.305999755859375, "logits/rejected": -2.299030065536499, "logps/chosen": -164.8284149169922, "logps/rejected": -151.80828857421875, "loss": 0.6648, "rewards/accuracies": 0.5, "rewards/chosen": -0.31743210554122925, "rewards/margins": 0.11833076179027557, "rewards/rejected": -0.43576285243034363, "step": 169 }, { "epoch": 0.18, "learning_rate": 4.9183319024118415e-05, "logits/chosen": -2.1119515895843506, "logits/rejected": -2.1947803497314453, "logps/chosen": -135.42007446289062, "logps/rejected": -142.30645751953125, "loss": 0.7242, "rewards/accuracies": 0.5, "rewards/chosen": -0.29650259017944336, "rewards/margins": -0.007799305021762848, "rewards/rejected": -0.2887033224105835, "step": 170 }, { "epoch": 0.18, "learning_rate": 4.915995110284901e-05, "logits/chosen": -2.1119866371154785, "logits/rejected": -2.133423328399658, "logps/chosen": -180.34866333007812, "logps/rejected": -194.75259399414062, "loss": 0.712, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3628592789173126, "rewards/margins": 0.04306137561798096, "rewards/rejected": -0.4059206247329712, "step": 171 }, { "epoch": 0.18, "learning_rate": 4.9136259274279955e-05, "logits/chosen": -2.2553658485412598, "logits/rejected": -2.261441707611084, "logps/chosen": -146.23861694335938, "logps/rejected": -143.55841064453125, "loss": 0.7566, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4861433506011963, "rewards/margins": -0.029232196509838104, "rewards/rejected": -0.4569111168384552, "step": 172 }, { "epoch": 0.18, "learning_rate": 4.911224385604255e-05, "logits/chosen": -2.3380446434020996, "logits/rejected": -2.278407335281372, "logps/chosen": -160.54949951171875, "logps/rejected": -151.08018493652344, "loss": 0.8986, "rewards/accuracies": 0.25, "rewards/chosen": -0.5705781579017639, "rewards/margins": -0.310020387172699, "rewards/rejected": -0.26055777072906494, "step": 173 }, { "epoch": 0.18, "learning_rate": 4.908790517010636e-05, "logits/chosen": -2.298267364501953, "logits/rejected": -2.317840814590454, "logps/chosen": -148.01202392578125, "logps/rejected": -157.5655517578125, "loss": 0.5926, "rewards/accuracies": 0.5625, "rewards/chosen": -0.20174311101436615, "rewards/margins": 0.3064672350883484, "rewards/rejected": -0.5082104206085205, "step": 174 }, { "epoch": 0.18, "learning_rate": 4.906324354277495e-05, "logits/chosen": -2.31154727935791, "logits/rejected": -2.3395073413848877, "logps/chosen": -208.95376586914062, "logps/rejected": -197.12318420410156, "loss": 0.8049, "rewards/accuracies": 0.3125, "rewards/chosen": -0.4388049244880676, "rewards/margins": -0.17647361755371094, "rewards/rejected": -0.2623312771320343, "step": 175 }, { "epoch": 0.18, "learning_rate": 4.903825930468149e-05, "logits/chosen": -2.3014442920684814, "logits/rejected": -2.2676706314086914, "logps/chosen": -176.29176330566406, "logps/rejected": -158.9712371826172, "loss": 0.6158, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17731614410877228, "rewards/margins": 0.219602569937706, "rewards/rejected": -0.3969186842441559, "step": 176 }, { "epoch": 0.18, "learning_rate": 4.901295279078431e-05, "logits/chosen": -2.3018527030944824, "logits/rejected": -2.250325918197632, "logps/chosen": -198.3526611328125, "logps/rejected": -224.3977508544922, "loss": 0.749, "rewards/accuracies": 0.375, "rewards/chosen": -0.6051126718521118, "rewards/margins": -0.05212073400616646, "rewards/rejected": -0.5529919862747192, "step": 177 }, { "epoch": 0.19, "learning_rate": 4.898732434036244e-05, "logits/chosen": -2.299297571182251, "logits/rejected": -2.3030190467834473, "logps/chosen": -167.0242156982422, "logps/rejected": -163.21133422851562, "loss": 0.818, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4616413414478302, "rewards/margins": -0.1739044189453125, "rewards/rejected": -0.2877369225025177, "step": 178 }, { "epoch": 0.19, "learning_rate": 4.896137429701102e-05, "logits/chosen": -2.186522960662842, "logits/rejected": -2.127988338470459, "logps/chosen": -173.8786163330078, "logps/rejected": -175.3472137451172, "loss": 0.6219, "rewards/accuracies": 0.75, "rewards/chosen": -0.33478206396102905, "rewards/margins": 0.27421796321868896, "rewards/rejected": -0.609000027179718, "step": 179 }, { "epoch": 0.19, "learning_rate": 4.893510300863676e-05, "logits/chosen": -2.1868398189544678, "logits/rejected": -2.2052829265594482, "logps/chosen": -226.92471313476562, "logps/rejected": -219.23912048339844, "loss": 0.7715, "rewards/accuracies": 0.5, "rewards/chosen": -0.5434471964836121, "rewards/margins": -0.10749813914299011, "rewards/rejected": -0.43594905734062195, "step": 180 }, { "epoch": 0.19, "learning_rate": 4.890851082745319e-05, "logits/chosen": -2.2493791580200195, "logits/rejected": -2.300419330596924, "logps/chosen": -181.13748168945312, "logps/rejected": -190.27645874023438, "loss": 0.6568, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10172881931066513, "rewards/margins": 0.09888561069965363, "rewards/rejected": -0.20061442255973816, "step": 181 }, { "epoch": 0.19, "learning_rate": 4.8881598109976004e-05, "logits/chosen": -2.3254058361053467, "logits/rejected": -2.3100500106811523, "logps/chosen": -208.1369171142578, "logps/rejected": -213.24652099609375, "loss": 0.7061, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5665594339370728, "rewards/margins": 0.10720066726207733, "rewards/rejected": -0.673760175704956, "step": 182 }, { "epoch": 0.19, "learning_rate": 4.885436521701824e-05, "logits/chosen": -2.3967294692993164, "logits/rejected": -2.4122745990753174, "logps/chosen": -123.20140838623047, "logps/rejected": -133.7183837890625, "loss": 0.6325, "rewards/accuracies": 0.625, "rewards/chosen": -0.3578134775161743, "rewards/margins": 0.17870007455348969, "rewards/rejected": -0.5365135669708252, "step": 183 }, { "epoch": 0.19, "learning_rate": 4.8826812513685487e-05, "logits/chosen": -2.327406167984009, "logits/rejected": -2.3479795455932617, "logps/chosen": -169.1548614501953, "logps/rejected": -181.2438201904297, "loss": 0.6755, "rewards/accuracies": 0.3125, "rewards/chosen": -0.5492762327194214, "rewards/margins": 0.12284587323665619, "rewards/rejected": -0.672122061252594, "step": 184 }, { "epoch": 0.19, "learning_rate": 4.8798940369370944e-05, "logits/chosen": -2.2139463424682617, "logits/rejected": -2.182992935180664, "logps/chosen": -169.96737670898438, "logps/rejected": -162.341552734375, "loss": 0.7834, "rewards/accuracies": 0.5, "rewards/chosen": -0.5437475442886353, "rewards/margins": -0.11101134121417999, "rewards/rejected": -0.43273624777793884, "step": 185 }, { "epoch": 0.19, "learning_rate": 4.877074915775049e-05, "logits/chosen": -2.370654582977295, "logits/rejected": -2.2954137325286865, "logps/chosen": -197.25172424316406, "logps/rejected": -176.16543579101562, "loss": 0.7289, "rewards/accuracies": 0.5, "rewards/chosen": -0.43914932012557983, "rewards/margins": -0.006785091012716293, "rewards/rejected": -0.43236425518989563, "step": 186 }, { "epoch": 0.2, "learning_rate": 4.8742239256777674e-05, "logits/chosen": -2.179440975189209, "logits/rejected": -2.1714303493499756, "logps/chosen": -151.60626220703125, "logps/rejected": -179.89920043945312, "loss": 0.7239, "rewards/accuracies": 0.625, "rewards/chosen": -0.5735316872596741, "rewards/margins": 0.02325405180454254, "rewards/rejected": -0.5967857241630554, "step": 187 }, { "epoch": 0.2, "learning_rate": 4.8713411048678635e-05, "logits/chosen": -2.132361650466919, "logits/rejected": -1.9567621946334839, "logps/chosen": -185.78890991210938, "logps/rejected": -146.61630249023438, "loss": 0.7341, "rewards/accuracies": 0.375, "rewards/chosen": -0.5060631632804871, "rewards/margins": -0.03276711702346802, "rewards/rejected": -0.47329598665237427, "step": 188 }, { "epoch": 0.2, "learning_rate": 4.868426491994702e-05, "logits/chosen": -2.0946998596191406, "logits/rejected": -2.0924675464630127, "logps/chosen": -182.0203857421875, "logps/rejected": -189.61448669433594, "loss": 0.6746, "rewards/accuracies": 0.75, "rewards/chosen": -0.5287783741950989, "rewards/margins": 0.13924337923526764, "rewards/rejected": -0.6680217981338501, "step": 189 }, { "epoch": 0.2, "learning_rate": 4.865480126133872e-05, "logits/chosen": -2.1674387454986572, "logits/rejected": -2.1639039516448975, "logps/chosen": -215.89862060546875, "logps/rejected": -194.6399383544922, "loss": 0.6866, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4448810815811157, "rewards/margins": 0.05918551981449127, "rewards/rejected": -0.5040666460990906, "step": 190 }, { "epoch": 0.2, "learning_rate": 4.862502046786671e-05, "logits/chosen": -2.1711835861206055, "logits/rejected": -2.3064002990722656, "logps/chosen": -173.3459014892578, "logps/rejected": -198.23008728027344, "loss": 0.6805, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4160701632499695, "rewards/margins": 0.11089640110731125, "rewards/rejected": -0.5269665122032166, "step": 191 }, { "epoch": 0.2, "learning_rate": 4.859492293879574e-05, "logits/chosen": -2.2058756351470947, "logits/rejected": -2.178715944290161, "logps/chosen": -227.00070190429688, "logps/rejected": -238.19232177734375, "loss": 0.6392, "rewards/accuracies": 0.5, "rewards/chosen": -0.3010501563549042, "rewards/margins": 0.1993046998977661, "rewards/rejected": -0.5003548264503479, "step": 192 }, { "epoch": 0.2, "learning_rate": 4.856450907763693e-05, "logits/chosen": -2.2074475288391113, "logits/rejected": -2.1603050231933594, "logps/chosen": -149.5859832763672, "logps/rejected": -149.609130859375, "loss": 0.777, "rewards/accuracies": 0.3125, "rewards/chosen": -0.4570000171661377, "rewards/margins": -0.09236741065979004, "rewards/rejected": -0.36463260650634766, "step": 193 }, { "epoch": 0.2, "learning_rate": 4.853377929214243e-05, "logits/chosen": -2.123965263366699, "logits/rejected": -2.116884469985962, "logps/chosen": -172.16937255859375, "logps/rejected": -180.29434204101562, "loss": 0.6691, "rewards/accuracies": 0.75, "rewards/chosen": -0.4884795546531677, "rewards/margins": 0.12548628449440002, "rewards/rejected": -0.6139658689498901, "step": 194 }, { "epoch": 0.2, "learning_rate": 4.85027339942999e-05, "logits/chosen": -2.302314281463623, "logits/rejected": -2.272852659225464, "logps/chosen": -220.17941284179688, "logps/rejected": -226.3072509765625, "loss": 0.6509, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4287835657596588, "rewards/margins": 0.1545097827911377, "rewards/rejected": -0.5832933783531189, "step": 195 }, { "epoch": 0.2, "learning_rate": 4.8471373600326996e-05, "logits/chosen": -2.139336347579956, "logits/rejected": -2.087122678756714, "logps/chosen": -140.20321655273438, "logps/rejected": -131.48948669433594, "loss": 0.7291, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3434653580188751, "rewards/margins": -0.03645901009440422, "rewards/rejected": -0.3070063292980194, "step": 196 }, { "epoch": 0.21, "learning_rate": 4.843969853066584e-05, "logits/chosen": -2.292895793914795, "logits/rejected": -2.316250801086426, "logps/chosen": -147.1464385986328, "logps/rejected": -161.3351593017578, "loss": 0.6387, "rewards/accuracies": 0.5, "rewards/chosen": -0.1997198909521103, "rewards/margins": 0.2121596336364746, "rewards/rejected": -0.4118794798851013, "step": 197 }, { "epoch": 0.21, "learning_rate": 4.8407709209977305e-05, "logits/chosen": -2.4767165184020996, "logits/rejected": -2.521482467651367, "logps/chosen": -206.472412109375, "logps/rejected": -215.40516662597656, "loss": 0.6462, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5403072237968445, "rewards/margins": 0.17954078316688538, "rewards/rejected": -0.7198480367660522, "step": 198 }, { "epoch": 0.21, "learning_rate": 4.837540606713538e-05, "logits/chosen": -2.2345879077911377, "logits/rejected": -2.221876621246338, "logps/chosen": -166.65631103515625, "logps/rejected": -146.72183227539062, "loss": 0.8822, "rewards/accuracies": 0.3125, "rewards/chosen": -0.7132791876792908, "rewards/margins": -0.2818105220794678, "rewards/rejected": -0.43146878480911255, "step": 199 }, { "epoch": 0.21, "learning_rate": 4.834278953522138e-05, "logits/chosen": -2.1550986766815186, "logits/rejected": -2.2200095653533936, "logps/chosen": -141.46798706054688, "logps/rejected": -152.30160522460938, "loss": 0.6853, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3020961284637451, "rewards/margins": 0.08537312597036362, "rewards/rejected": -0.3874692916870117, "step": 200 }, { "epoch": 0.21, "learning_rate": 4.8309860051518204e-05, "logits/chosen": -2.1666178703308105, "logits/rejected": -2.195492744445801, "logps/chosen": -154.3419647216797, "logps/rejected": -154.75958251953125, "loss": 0.7683, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5529704093933105, "rewards/margins": 0.002926960587501526, "rewards/rejected": -0.5558973550796509, "step": 201 }, { "epoch": 0.21, "learning_rate": 4.8276618057504376e-05, "logits/chosen": -2.2236335277557373, "logits/rejected": -2.2724032402038574, "logps/chosen": -143.28988647460938, "logps/rejected": -150.9318084716797, "loss": 0.7539, "rewards/accuracies": 0.5, "rewards/chosen": -0.256462037563324, "rewards/margins": -0.02850787341594696, "rewards/rejected": -0.2279541939496994, "step": 202 }, { "epoch": 0.21, "learning_rate": 4.824306399884822e-05, "logits/chosen": -2.2802443504333496, "logits/rejected": -2.2836642265319824, "logps/chosen": -179.90365600585938, "logps/rejected": -169.27171325683594, "loss": 0.7907, "rewards/accuracies": 0.3125, "rewards/chosen": -0.4755932092666626, "rewards/margins": -0.1532573252916336, "rewards/rejected": -0.3223358988761902, "step": 203 }, { "epoch": 0.21, "learning_rate": 4.8209198325401815e-05, "logits/chosen": -2.3384103775024414, "logits/rejected": -2.314667224884033, "logps/chosen": -170.7639617919922, "logps/rejected": -162.15753173828125, "loss": 0.6634, "rewards/accuracies": 0.75, "rewards/chosen": -0.26886916160583496, "rewards/margins": 0.1269666701555252, "rewards/rejected": -0.39583584666252136, "step": 204 }, { "epoch": 0.21, "learning_rate": 4.817502149119502e-05, "logits/chosen": -2.250046491622925, "logits/rejected": -2.225694179534912, "logps/chosen": -165.20504760742188, "logps/rejected": -172.9903106689453, "loss": 0.6962, "rewards/accuracies": 0.375, "rewards/chosen": -0.45896613597869873, "rewards/margins": 0.0778706818819046, "rewards/rejected": -0.5368368029594421, "step": 205 }, { "epoch": 0.21, "learning_rate": 4.8140533954429327e-05, "logits/chosen": -2.313793420791626, "logits/rejected": -2.2939281463623047, "logps/chosen": -143.09251403808594, "logps/rejected": -150.27903747558594, "loss": 0.6284, "rewards/accuracies": 0.375, "rewards/chosen": -0.2762828767299652, "rewards/margins": 0.2016773521900177, "rewards/rejected": -0.4779602289199829, "step": 206 }, { "epoch": 0.22, "learning_rate": 4.810573617747178e-05, "logits/chosen": -2.332127809524536, "logits/rejected": -2.327253818511963, "logps/chosen": -162.88739013671875, "logps/rejected": -167.6454315185547, "loss": 0.6114, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22507114708423615, "rewards/margins": 0.24488690495491028, "rewards/rejected": -0.4699580669403076, "step": 207 }, { "epoch": 0.22, "learning_rate": 4.8070628626848735e-05, "logits/chosen": -2.12371563911438, "logits/rejected": -2.1890509128570557, "logps/chosen": -169.19342041015625, "logps/rejected": -191.41851806640625, "loss": 0.7002, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5895254611968994, "rewards/margins": 0.07979288697242737, "rewards/rejected": -0.6693182587623596, "step": 208 }, { "epoch": 0.22, "learning_rate": 4.803521177323962e-05, "logits/chosen": -2.2081453800201416, "logits/rejected": -2.1713621616363525, "logps/chosen": -163.9095458984375, "logps/rejected": -169.9920654296875, "loss": 0.6973, "rewards/accuracies": 0.5, "rewards/chosen": -0.3412324786186218, "rewards/margins": 0.045901067554950714, "rewards/rejected": -0.38713356852531433, "step": 209 }, { "epoch": 0.22, "learning_rate": 4.799948609147061e-05, "logits/chosen": -2.161041259765625, "logits/rejected": -2.1152327060699463, "logps/chosen": -165.09188842773438, "logps/rejected": -156.6759796142578, "loss": 0.7462, "rewards/accuracies": 0.5, "rewards/chosen": -0.48258787393569946, "rewards/margins": -0.04187957942485809, "rewards/rejected": -0.44070830941200256, "step": 210 }, { "epoch": 0.22, "learning_rate": 4.796345206050829e-05, "logits/chosen": -2.106369733810425, "logits/rejected": -2.2182164192199707, "logps/chosen": -176.6033477783203, "logps/rejected": -213.2372589111328, "loss": 0.7045, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4269718527793884, "rewards/margins": 0.00813320279121399, "rewards/rejected": -0.4351051151752472, "step": 211 }, { "epoch": 0.22, "learning_rate": 4.792711016345321e-05, "logits/chosen": -2.1926729679107666, "logits/rejected": -2.137371063232422, "logps/chosen": -157.62571716308594, "logps/rejected": -141.09170532226562, "loss": 0.6535, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3278404176235199, "rewards/margins": 0.16385424137115479, "rewards/rejected": -0.49169468879699707, "step": 212 }, { "epoch": 0.22, "learning_rate": 4.7890460887533417e-05, "logits/chosen": -2.1466121673583984, "logits/rejected": -2.181826114654541, "logps/chosen": -163.6356964111328, "logps/rejected": -176.73495483398438, "loss": 0.6503, "rewards/accuracies": 0.625, "rewards/chosen": -0.25386762619018555, "rewards/margins": 0.17454546689987183, "rewards/rejected": -0.4284130930900574, "step": 213 }, { "epoch": 0.22, "learning_rate": 4.785350472409792e-05, "logits/chosen": -2.1585237979888916, "logits/rejected": -2.2346014976501465, "logps/chosen": -171.45030212402344, "logps/rejected": -226.79852294921875, "loss": 0.7963, "rewards/accuracies": 0.375, "rewards/chosen": -0.43708741664886475, "rewards/margins": -0.08538319170475006, "rewards/rejected": -0.3517042398452759, "step": 214 }, { "epoch": 0.22, "learning_rate": 4.7816242168610093e-05, "logits/chosen": -2.247028350830078, "logits/rejected": -2.2580173015594482, "logps/chosen": -190.72055053710938, "logps/rejected": -186.36619567871094, "loss": 0.6804, "rewards/accuracies": 0.5625, "rewards/chosen": -0.447765588760376, "rewards/margins": 0.06104414537549019, "rewards/rejected": -0.5088096857070923, "step": 215 }, { "epoch": 0.23, "learning_rate": 4.777867372064105e-05, "logits/chosen": -2.222308874130249, "logits/rejected": -2.2742927074432373, "logps/chosen": -165.11985778808594, "logps/rejected": -187.6106719970703, "loss": 0.7121, "rewards/accuracies": 0.4375, "rewards/chosen": -0.37397119402885437, "rewards/margins": 0.040555573999881744, "rewards/rejected": -0.4145267605781555, "step": 216 }, { "epoch": 0.23, "learning_rate": 4.774079988386296e-05, "logits/chosen": -2.2380332946777344, "logits/rejected": -2.3339126110076904, "logps/chosen": -137.3435821533203, "logps/rejected": -169.9375, "loss": 0.7015, "rewards/accuracies": 0.5, "rewards/chosen": -0.36366236209869385, "rewards/margins": 0.009570196270942688, "rewards/rejected": -0.37323257327079773, "step": 217 }, { "epoch": 0.23, "learning_rate": 4.770262116604224e-05, "logits/chosen": -2.2831175327301025, "logits/rejected": -2.260009765625, "logps/chosen": -207.33409118652344, "logps/rejected": -217.00962829589844, "loss": 0.6735, "rewards/accuracies": 0.5, "rewards/chosen": -0.27594539523124695, "rewards/margins": 0.09640569984912872, "rewards/rejected": -0.3723510801792145, "step": 218 }, { "epoch": 0.23, "learning_rate": 4.76641380790328e-05, "logits/chosen": -2.2921030521392822, "logits/rejected": -2.2923154830932617, "logps/chosen": -168.2577667236328, "logps/rejected": -166.49632263183594, "loss": 0.6891, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2567214369773865, "rewards/margins": 0.15365271270275116, "rewards/rejected": -0.41037416458129883, "step": 219 }, { "epoch": 0.23, "learning_rate": 4.762535113876917e-05, "logits/chosen": -2.273233652114868, "logits/rejected": -2.246183395385742, "logps/chosen": -208.0199737548828, "logps/rejected": -209.45909118652344, "loss": 0.6409, "rewards/accuracies": 0.75, "rewards/chosen": -0.12088227272033691, "rewards/margins": 0.15805310010910034, "rewards/rejected": -0.27893537282943726, "step": 220 }, { "epoch": 0.23, "learning_rate": 4.758626086525956e-05, "logits/chosen": -2.2132441997528076, "logits/rejected": -2.22251296043396, "logps/chosen": -167.60195922851562, "logps/rejected": -189.9409942626953, "loss": 0.6678, "rewards/accuracies": 0.625, "rewards/chosen": -0.3752858340740204, "rewards/margins": 0.10380817204713821, "rewards/rejected": -0.479093998670578, "step": 221 }, { "epoch": 0.23, "learning_rate": 4.754686778257891e-05, "logits/chosen": -2.282052755355835, "logits/rejected": -2.2734806537628174, "logps/chosen": -132.63140869140625, "logps/rejected": -136.8794708251953, "loss": 0.6003, "rewards/accuracies": 0.6875, "rewards/chosen": 0.009242314845323563, "rewards/margins": 0.26125800609588623, "rewards/rejected": -0.25201570987701416, "step": 222 }, { "epoch": 0.23, "learning_rate": 4.750717241886185e-05, "logits/chosen": -2.189680337905884, "logits/rejected": -2.1535582542419434, "logps/chosen": -129.69590759277344, "logps/rejected": -127.54085540771484, "loss": 0.7529, "rewards/accuracies": 0.5, "rewards/chosen": -0.5190645456314087, "rewards/margins": -0.08417561650276184, "rewards/rejected": -0.43488895893096924, "step": 223 }, { "epoch": 0.23, "learning_rate": 4.7467175306295655e-05, "logits/chosen": -2.2347896099090576, "logits/rejected": -2.2738678455352783, "logps/chosen": -158.85928344726562, "logps/rejected": -158.6512908935547, "loss": 0.683, "rewards/accuracies": 0.375, "rewards/chosen": -0.4156660735607147, "rewards/margins": 0.1208164319396019, "rewards/rejected": -0.5364825129508972, "step": 224 }, { "epoch": 0.23, "learning_rate": 4.7426876981113044e-05, "logits/chosen": -2.1925535202026367, "logits/rejected": -2.140517234802246, "logps/chosen": -165.46412658691406, "logps/rejected": -160.30532836914062, "loss": 0.6721, "rewards/accuracies": 0.625, "rewards/chosen": -0.22652333974838257, "rewards/margins": 0.09546832740306854, "rewards/rejected": -0.3219916820526123, "step": 225 }, { "epoch": 0.24, "learning_rate": 4.738627798358506e-05, "logits/chosen": -2.3328022956848145, "logits/rejected": -2.3571670055389404, "logps/chosen": -214.76580810546875, "logps/rejected": -231.52804565429688, "loss": 0.5743, "rewards/accuracies": 0.75, "rewards/chosen": -0.2423395961523056, "rewards/margins": 0.3217008709907532, "rewards/rejected": -0.5640404224395752, "step": 226 }, { "epoch": 0.24, "learning_rate": 4.7345378858013776e-05, "logits/chosen": -2.250012159347534, "logits/rejected": -2.26370906829834, "logps/chosen": -208.80230712890625, "logps/rejected": -210.8206329345703, "loss": 0.8003, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5306903719902039, "rewards/margins": -0.14622673392295837, "rewards/rejected": -0.3844636082649231, "step": 227 }, { "epoch": 0.24, "learning_rate": 4.730418015272503e-05, "logits/chosen": -2.3351643085479736, "logits/rejected": -2.33880352973938, "logps/chosen": -209.63296508789062, "logps/rejected": -219.62771606445312, "loss": 0.6265, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4614105522632599, "rewards/margins": 0.28935885429382324, "rewards/rejected": -0.7507694959640503, "step": 228 }, { "epoch": 0.24, "learning_rate": 4.726268242006106e-05, "logits/chosen": -2.051967144012451, "logits/rejected": -2.0519886016845703, "logps/chosen": -142.45199584960938, "logps/rejected": -140.49044799804688, "loss": 0.6649, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4592810571193695, "rewards/margins": 0.15481841564178467, "rewards/rejected": -0.6140995025634766, "step": 229 }, { "epoch": 0.24, "learning_rate": 4.722088621637309e-05, "logits/chosen": -2.2426981925964355, "logits/rejected": -2.287612199783325, "logps/chosen": -169.47291564941406, "logps/rejected": -179.918212890625, "loss": 0.7675, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6015586853027344, "rewards/margins": -0.048168424516916275, "rewards/rejected": -0.5533902645111084, "step": 230 }, { "epoch": 0.24, "learning_rate": 4.717879210201389e-05, "logits/chosen": -2.192275047302246, "logits/rejected": -2.308380365371704, "logps/chosen": -156.87548828125, "logps/rejected": -180.34512329101562, "loss": 0.714, "rewards/accuracies": 0.5, "rewards/chosen": -0.42379727959632874, "rewards/margins": 0.08124817907810211, "rewards/rejected": -0.505045473575592, "step": 231 }, { "epoch": 0.24, "learning_rate": 4.713640064133025e-05, "logits/chosen": -2.057934045791626, "logits/rejected": -1.9755558967590332, "logps/chosen": -156.72357177734375, "logps/rejected": -158.1801300048828, "loss": 0.7132, "rewards/accuracies": 0.5, "rewards/chosen": -0.5255974531173706, "rewards/margins": 0.04890578240156174, "rewards/rejected": -0.5745032429695129, "step": 232 }, { "epoch": 0.24, "learning_rate": 4.7093712402655427e-05, "logits/chosen": -2.055185079574585, "logits/rejected": -2.0594377517700195, "logps/chosen": -139.1905517578125, "logps/rejected": -141.0957489013672, "loss": 0.7112, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4041973948478699, "rewards/margins": 0.0033319219946861267, "rewards/rejected": -0.407529354095459, "step": 233 }, { "epoch": 0.24, "learning_rate": 4.7050727958301506e-05, "logits/chosen": -2.1757850646972656, "logits/rejected": -2.1435933113098145, "logps/chosen": -177.0955810546875, "logps/rejected": -161.39651489257812, "loss": 0.6564, "rewards/accuracies": 0.625, "rewards/chosen": -0.3865699768066406, "rewards/margins": 0.11041317880153656, "rewards/rejected": -0.4969831705093384, "step": 234 }, { "epoch": 0.25, "learning_rate": 4.7007447884551745e-05, "logits/chosen": -2.055013418197632, "logits/rejected": -2.0445072650909424, "logps/chosen": -154.93429565429688, "logps/rejected": -166.30482482910156, "loss": 0.7354, "rewards/accuracies": 0.625, "rewards/chosen": -0.48415255546569824, "rewards/margins": 0.04046877846121788, "rewards/rejected": -0.5246213674545288, "step": 235 }, { "epoch": 0.25, "learning_rate": 4.6963872761652835e-05, "logits/chosen": -2.202390670776367, "logits/rejected": -2.2232775688171387, "logps/chosen": -216.69921875, "logps/rejected": -206.72483825683594, "loss": 0.79, "rewards/accuracies": 0.4375, "rewards/chosen": -0.47140538692474365, "rewards/margins": 0.04289142042398453, "rewards/rejected": -0.51429682970047, "step": 236 }, { "epoch": 0.25, "learning_rate": 4.692000317380715e-05, "logits/chosen": -2.2061116695404053, "logits/rejected": -2.2866199016571045, "logps/chosen": -174.46450805664062, "logps/rejected": -185.262939453125, "loss": 0.628, "rewards/accuracies": 0.625, "rewards/chosen": -0.4978125989437103, "rewards/margins": 0.25329017639160156, "rewards/rejected": -0.7511026859283447, "step": 237 }, { "epoch": 0.25, "learning_rate": 4.687583970916487e-05, "logits/chosen": -2.2171239852905273, "logits/rejected": -2.2869226932525635, "logps/chosen": -186.90225219726562, "logps/rejected": -210.4280548095703, "loss": 0.7644, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4735070765018463, "rewards/margins": -0.040224503725767136, "rewards/rejected": -0.4332825839519501, "step": 238 }, { "epoch": 0.25, "learning_rate": 4.683138295981611e-05, "logits/chosen": -2.1964111328125, "logits/rejected": -2.212043046951294, "logps/chosen": -153.104248046875, "logps/rejected": -165.96299743652344, "loss": 0.6534, "rewards/accuracies": 0.5625, "rewards/chosen": -0.24953705072402954, "rewards/margins": 0.14786407351493835, "rewards/rejected": -0.3974011242389679, "step": 239 }, { "epoch": 0.25, "learning_rate": 4.678663352178301e-05, "logits/chosen": -1.9726707935333252, "logits/rejected": -2.0120890140533447, "logps/chosen": -148.98944091796875, "logps/rejected": -149.28582763671875, "loss": 0.6168, "rewards/accuracies": 0.6875, "rewards/chosen": -0.33494099974632263, "rewards/margins": 0.3014180362224579, "rewards/rejected": -0.6363590359687805, "step": 240 }, { "epoch": 0.25, "learning_rate": 4.674159199501173e-05, "logits/chosen": -2.143721103668213, "logits/rejected": -2.1720468997955322, "logps/chosen": -131.01319885253906, "logps/rejected": -147.077392578125, "loss": 0.7585, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4145124554634094, "rewards/margins": -0.08547190576791763, "rewards/rejected": -0.3290405869483948, "step": 241 }, { "epoch": 0.25, "learning_rate": 4.6696258983364385e-05, "logits/chosen": -2.2236969470977783, "logits/rejected": -2.2577428817749023, "logps/chosen": -184.3029327392578, "logps/rejected": -190.77215576171875, "loss": 0.6862, "rewards/accuracies": 0.5625, "rewards/chosen": -0.23868240416049957, "rewards/margins": 0.11198949813842773, "rewards/rejected": -0.3506719172000885, "step": 242 }, { "epoch": 0.25, "learning_rate": 4.665063509461097e-05, "logits/chosen": -2.060711622238159, "logits/rejected": -2.0380258560180664, "logps/chosen": -174.40530395507812, "logps/rejected": -171.62820434570312, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": -0.4223189949989319, "rewards/margins": 0.14973345398902893, "rewards/rejected": -0.5720524191856384, "step": 243 }, { "epoch": 0.25, "learning_rate": 4.660472094042121e-05, "logits/chosen": -2.1594133377075195, "logits/rejected": -2.1700899600982666, "logps/chosen": -207.55848693847656, "logps/rejected": -200.78297424316406, "loss": 0.7768, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5937251448631287, "rewards/margins": -0.08584102243185043, "rewards/rejected": -0.50788414478302, "step": 244 }, { "epoch": 0.26, "learning_rate": 4.655851713635635e-05, "logits/chosen": -2.305196762084961, "logits/rejected": -2.2462093830108643, "logps/chosen": -243.9378662109375, "logps/rejected": -215.36997985839844, "loss": 0.724, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5542324185371399, "rewards/margins": 0.021042201668024063, "rewards/rejected": -0.5752745866775513, "step": 245 }, { "epoch": 0.26, "learning_rate": 4.651202430186092e-05, "logits/chosen": -2.025132179260254, "logits/rejected": -1.9792908430099487, "logps/chosen": -195.5504608154297, "logps/rejected": -195.3519744873047, "loss": 0.7289, "rewards/accuracies": 0.5625, "rewards/chosen": -0.539334237575531, "rewards/margins": 0.23235680162906647, "rewards/rejected": -0.771691083908081, "step": 246 }, { "epoch": 0.26, "learning_rate": 4.6465243060254415e-05, "logits/chosen": -2.147789239883423, "logits/rejected": -2.1263253688812256, "logps/chosen": -189.7777099609375, "logps/rejected": -175.15142822265625, "loss": 0.7963, "rewards/accuracies": 0.5, "rewards/chosen": -0.40796539187431335, "rewards/margins": -0.13925030827522278, "rewards/rejected": -0.2687150835990906, "step": 247 }, { "epoch": 0.26, "learning_rate": 4.641817403872293e-05, "logits/chosen": -2.0318384170532227, "logits/rejected": -2.0659642219543457, "logps/chosen": -165.9607391357422, "logps/rejected": -182.31895446777344, "loss": 0.8084, "rewards/accuracies": 0.375, "rewards/chosen": -0.3802737891674042, "rewards/margins": -0.0964832603931427, "rewards/rejected": -0.28379055857658386, "step": 248 }, { "epoch": 0.26, "learning_rate": 4.637081786831079e-05, "logits/chosen": -2.0719449520111084, "logits/rejected": -2.026743173599243, "logps/chosen": -191.45486450195312, "logps/rejected": -185.71011352539062, "loss": 0.7078, "rewards/accuracies": 0.5, "rewards/chosen": -0.3257860839366913, "rewards/margins": 0.07691369950771332, "rewards/rejected": -0.4026997983455658, "step": 249 }, { "epoch": 0.26, "learning_rate": 4.6323175183912024e-05, "logits/chosen": -2.0912959575653076, "logits/rejected": -2.156938314437866, "logps/chosen": -167.86541748046875, "logps/rejected": -198.28921508789062, "loss": 0.4872, "rewards/accuracies": 0.8125, "rewards/chosen": -0.35692939162254333, "rewards/margins": 0.6100252866744995, "rewards/rejected": -0.9669547080993652, "step": 250 }, { "epoch": 0.26, "learning_rate": 4.627524662426194e-05, "logits/chosen": -1.8769241571426392, "logits/rejected": -1.8571780920028687, "logps/chosen": -175.3948974609375, "logps/rejected": -183.54019165039062, "loss": 0.8564, "rewards/accuracies": 0.375, "rewards/chosen": -0.6141475439071655, "rewards/margins": -0.18047203123569489, "rewards/rejected": -0.43367546796798706, "step": 251 }, { "epoch": 0.26, "learning_rate": 4.6227032831928484e-05, "logits/chosen": -1.9304271936416626, "logits/rejected": -1.8033334016799927, "logps/chosen": -174.45651245117188, "logps/rejected": -144.34657287597656, "loss": 0.7318, "rewards/accuracies": 0.5, "rewards/chosen": -0.625647246837616, "rewards/margins": 0.09466619789600372, "rewards/rejected": -0.7203134298324585, "step": 252 }, { "epoch": 0.26, "learning_rate": 4.6178534453303666e-05, "logits/chosen": -2.082902193069458, "logits/rejected": -2.0136592388153076, "logps/chosen": -201.52587890625, "logps/rejected": -198.62039184570312, "loss": 0.87, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5361589789390564, "rewards/margins": -0.25926750898361206, "rewards/rejected": -0.27689146995544434, "step": 253 }, { "epoch": 0.26, "learning_rate": 4.6129752138594874e-05, "logits/chosen": -1.957344889640808, "logits/rejected": -1.982904314994812, "logps/chosen": -187.89329528808594, "logps/rejected": -186.68289184570312, "loss": 0.7922, "rewards/accuracies": 0.5, "rewards/chosen": -0.5014457106590271, "rewards/margins": -0.08516909182071686, "rewards/rejected": -0.41627663373947144, "step": 254 }, { "epoch": 0.27, "learning_rate": 4.608068654181617e-05, "logits/chosen": -1.654222846031189, "logits/rejected": -1.6798536777496338, "logps/chosen": -184.433349609375, "logps/rejected": -182.4044189453125, "loss": 0.6938, "rewards/accuracies": 0.75, "rewards/chosen": -0.25272616744041443, "rewards/margins": 0.10025610774755478, "rewards/rejected": -0.3529822528362274, "step": 255 }, { "epoch": 0.27, "learning_rate": 4.6031338320779534e-05, "logits/chosen": -2.0019171237945557, "logits/rejected": -2.0632452964782715, "logps/chosen": -162.12811279296875, "logps/rejected": -177.98736572265625, "loss": 0.688, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5015031099319458, "rewards/margins": 0.15753880143165588, "rewards/rejected": -0.6590418815612793, "step": 256 }, { "epoch": 0.27, "learning_rate": 4.5981708137086e-05, "logits/chosen": -2.0519323348999023, "logits/rejected": -2.089592933654785, "logps/chosen": -168.29608154296875, "logps/rejected": -180.1384735107422, "loss": 0.6519, "rewards/accuracies": 0.625, "rewards/chosen": -0.46469518542289734, "rewards/margins": 0.22538693249225616, "rewards/rejected": -0.6900821924209595, "step": 257 }, { "epoch": 0.27, "learning_rate": 4.5931796656116846e-05, "logits/chosen": -1.9542289972305298, "logits/rejected": -2.1067469120025635, "logps/chosen": -141.18040466308594, "logps/rejected": -154.12332153320312, "loss": 0.7912, "rewards/accuracies": 0.375, "rewards/chosen": -0.424506276845932, "rewards/margins": -0.11340674757957458, "rewards/rejected": -0.3110995292663574, "step": 258 }, { "epoch": 0.27, "learning_rate": 4.588160454702462e-05, "logits/chosen": -1.9130336046218872, "logits/rejected": -1.8589096069335938, "logps/chosen": -154.01983642578125, "logps/rejected": -151.2600860595703, "loss": 0.6448, "rewards/accuracies": 0.4375, "rewards/chosen": -0.17828267812728882, "rewards/margins": 0.22233837842941284, "rewards/rejected": -0.40062105655670166, "step": 259 }, { "epoch": 0.27, "learning_rate": 4.5831132482724195e-05, "logits/chosen": -1.969378113746643, "logits/rejected": -1.9947978258132935, "logps/chosen": -221.1347198486328, "logps/rejected": -219.78065490722656, "loss": 0.7675, "rewards/accuracies": 0.5, "rewards/chosen": -0.8122418522834778, "rewards/margins": -0.012366384267807007, "rewards/rejected": -0.7998754978179932, "step": 260 }, { "epoch": 0.27, "learning_rate": 4.578038113988376e-05, "logits/chosen": -1.9141626358032227, "logits/rejected": -1.8990297317504883, "logps/chosen": -181.55563354492188, "logps/rejected": -158.1247100830078, "loss": 0.8615, "rewards/accuracies": 0.25, "rewards/chosen": -0.5867054462432861, "rewards/margins": -0.2548179626464844, "rewards/rejected": -0.33188754320144653, "step": 261 }, { "epoch": 0.27, "learning_rate": 4.572935119891571e-05, "logits/chosen": -1.9274933338165283, "logits/rejected": -2.0607924461364746, "logps/chosen": -200.04896545410156, "logps/rejected": -209.02926635742188, "loss": 0.6567, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5149486064910889, "rewards/margins": 0.22366176545619965, "rewards/rejected": -0.7386104464530945, "step": 262 }, { "epoch": 0.27, "learning_rate": 4.5678043343967554e-05, "logits/chosen": -2.108922243118286, "logits/rejected": -2.041205406188965, "logps/chosen": -174.11276245117188, "logps/rejected": -146.20103454589844, "loss": 0.9138, "rewards/accuracies": 0.1875, "rewards/chosen": -0.47598356008529663, "rewards/margins": -0.3184044659137726, "rewards/rejected": -0.15757909417152405, "step": 263 }, { "epoch": 0.28, "learning_rate": 4.5626458262912745e-05, "logits/chosen": -2.0428683757781982, "logits/rejected": -1.9962735176086426, "logps/chosen": -203.15782165527344, "logps/rejected": -179.91824340820312, "loss": 0.8575, "rewards/accuracies": 0.375, "rewards/chosen": -0.2803889513015747, "rewards/margins": -0.19668762385845184, "rewards/rejected": -0.08370131254196167, "step": 264 }, { "epoch": 0.28, "learning_rate": 4.557459664734141e-05, "logits/chosen": -1.9452859163284302, "logits/rejected": -2.0013086795806885, "logps/chosen": -151.39443969726562, "logps/rejected": -160.42532348632812, "loss": 0.7297, "rewards/accuracies": 0.5, "rewards/chosen": -0.366563618183136, "rewards/margins": 0.06915048509836197, "rewards/rejected": -0.43571415543556213, "step": 265 }, { "epoch": 0.28, "learning_rate": 4.552245919255117e-05, "logits/chosen": -1.9962891340255737, "logits/rejected": -2.0349621772766113, "logps/chosen": -178.134765625, "logps/rejected": -167.83949279785156, "loss": 0.7017, "rewards/accuracies": 0.4375, "rewards/chosen": -0.19582800567150116, "rewards/margins": 0.07028350979089737, "rewards/rejected": -0.2661115229129791, "step": 266 }, { "epoch": 0.28, "learning_rate": 4.5470046597537735e-05, "logits/chosen": -1.9965953826904297, "logits/rejected": -2.0604090690612793, "logps/chosen": -158.63546752929688, "logps/rejected": -177.67669677734375, "loss": 0.7415, "rewards/accuracies": 0.375, "rewards/chosen": -0.3360700309276581, "rewards/margins": 0.05659861862659454, "rewards/rejected": -0.3926686644554138, "step": 267 }, { "epoch": 0.28, "learning_rate": 4.541735956498554e-05, "logits/chosen": -1.95860755443573, "logits/rejected": -1.9922930002212524, "logps/chosen": -130.50543212890625, "logps/rejected": -137.30873107910156, "loss": 0.6415, "rewards/accuracies": 0.625, "rewards/chosen": -0.13774187862873077, "rewards/margins": 0.156549870967865, "rewards/rejected": -0.29429173469543457, "step": 268 }, { "epoch": 0.28, "learning_rate": 4.5364398801258396e-05, "logits/chosen": -2.0323469638824463, "logits/rejected": -1.989902377128601, "logps/chosen": -123.73210144042969, "logps/rejected": -120.40567016601562, "loss": 0.7417, "rewards/accuracies": 0.375, "rewards/chosen": -0.23733878135681152, "rewards/margins": -0.006110057234764099, "rewards/rejected": -0.2312287539243698, "step": 269 }, { "epoch": 0.28, "learning_rate": 4.5311165016389916e-05, "logits/chosen": -2.2102789878845215, "logits/rejected": -2.1945927143096924, "logps/chosen": -178.87513732910156, "logps/rejected": -185.6321258544922, "loss": 0.58, "rewards/accuracies": 0.75, "rewards/chosen": -0.3368569016456604, "rewards/margins": 0.3213872015476227, "rewards/rejected": -0.6582440733909607, "step": 270 }, { "epoch": 0.28, "learning_rate": 4.525765892407409e-05, "logits/chosen": -2.014317750930786, "logits/rejected": -1.9814597368240356, "logps/chosen": -162.72450256347656, "logps/rejected": -162.10733032226562, "loss": 0.7063, "rewards/accuracies": 0.4375, "rewards/chosen": -0.30713194608688354, "rewards/margins": 0.028126142919063568, "rewards/rejected": -0.33525803685188293, "step": 271 }, { "epoch": 0.28, "learning_rate": 4.5203881241655644e-05, "logits/chosen": -2.2270286083221436, "logits/rejected": -2.207059144973755, "logps/chosen": -158.911376953125, "logps/rejected": -163.7472381591797, "loss": 0.8314, "rewards/accuracies": 0.5, "rewards/chosen": -0.24247878789901733, "rewards/margins": -0.18095803260803223, "rewards/rejected": -0.0615207776427269, "step": 272 }, { "epoch": 0.28, "learning_rate": 4.514983269012049e-05, "logits/chosen": -2.163167715072632, "logits/rejected": -2.183046817779541, "logps/chosen": -174.82070922851562, "logps/rejected": -164.1748504638672, "loss": 0.7933, "rewards/accuracies": 0.25, "rewards/chosen": -0.38691380620002747, "rewards/margins": -0.16529785096645355, "rewards/rejected": -0.22161594033241272, "step": 273 }, { "epoch": 0.29, "learning_rate": 4.509551399408598e-05, "logits/chosen": -2.253500461578369, "logits/rejected": -2.260409355163574, "logps/chosen": -190.3398895263672, "logps/rejected": -201.19200134277344, "loss": 0.7775, "rewards/accuracies": 0.625, "rewards/chosen": -0.21146364510059357, "rewards/margins": -0.06666092574596405, "rewards/rejected": -0.14480271935462952, "step": 274 }, { "epoch": 0.29, "learning_rate": 4.504092588179128e-05, "logits/chosen": -2.2398221492767334, "logits/rejected": -2.1929337978363037, "logps/chosen": -231.88616943359375, "logps/rejected": -226.3920135498047, "loss": 0.6504, "rewards/accuracies": 0.625, "rewards/chosen": -0.30005282163619995, "rewards/margins": 0.13275346159934998, "rewards/rejected": -0.43280625343322754, "step": 275 }, { "epoch": 0.29, "learning_rate": 4.498606908508754e-05, "logits/chosen": -2.251856565475464, "logits/rejected": -2.2825677394866943, "logps/chosen": -194.575927734375, "logps/rejected": -208.67977905273438, "loss": 0.7303, "rewards/accuracies": 0.3125, "rewards/chosen": -0.1339748501777649, "rewards/margins": -0.01863221824169159, "rewards/rejected": -0.1153426244854927, "step": 276 }, { "epoch": 0.29, "learning_rate": 4.4930944339428085e-05, "logits/chosen": -1.9029638767242432, "logits/rejected": -2.052668571472168, "logps/chosen": -192.86978149414062, "logps/rejected": -214.9619140625, "loss": 0.734, "rewards/accuracies": 0.4375, "rewards/chosen": -0.31461602449417114, "rewards/margins": -0.04775575175881386, "rewards/rejected": -0.26686030626296997, "step": 277 }, { "epoch": 0.29, "learning_rate": 4.487555238385862e-05, "logits/chosen": -2.242000102996826, "logits/rejected": -2.1664445400238037, "logps/chosen": -177.2509002685547, "logps/rejected": -168.57376098632812, "loss": 0.7252, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3784841001033783, "rewards/margins": -0.008476179093122482, "rewards/rejected": -0.3700079321861267, "step": 278 }, { "epoch": 0.29, "learning_rate": 4.481989396100724e-05, "logits/chosen": -2.1768834590911865, "logits/rejected": -2.122082471847534, "logps/chosen": -136.37344360351562, "logps/rejected": -129.23260498046875, "loss": 0.7226, "rewards/accuracies": 0.5, "rewards/chosen": -0.1186956837773323, "rewards/margins": 0.10684624314308167, "rewards/rejected": -0.22554191946983337, "step": 279 }, { "epoch": 0.29, "learning_rate": 4.476396981707453e-05, "logits/chosen": -2.2543294429779053, "logits/rejected": -2.2154581546783447, "logps/chosen": -170.55038452148438, "logps/rejected": -156.1742706298828, "loss": 0.7387, "rewards/accuracies": 0.4375, "rewards/chosen": -0.22529585659503937, "rewards/margins": -0.04772930592298508, "rewards/rejected": -0.1775665581226349, "step": 280 }, { "epoch": 0.29, "learning_rate": 4.470778070182353e-05, "logits/chosen": -2.2135331630706787, "logits/rejected": -2.170997142791748, "logps/chosen": -140.70477294921875, "logps/rejected": -140.6665496826172, "loss": 0.7037, "rewards/accuracies": 0.5, "rewards/chosen": -0.09401345998048782, "rewards/margins": 0.028659436851739883, "rewards/rejected": -0.122672900557518, "step": 281 }, { "epoch": 0.29, "learning_rate": 4.465132736856969e-05, "logits/chosen": -2.2878525257110596, "logits/rejected": -2.2237839698791504, "logps/chosen": -164.2831573486328, "logps/rejected": -160.92367553710938, "loss": 0.7044, "rewards/accuracies": 0.5, "rewards/chosen": -0.34608572721481323, "rewards/margins": 0.046185556799173355, "rewards/rejected": -0.3922712802886963, "step": 282 }, { "epoch": 0.3, "learning_rate": 4.459461057417078e-05, "logits/chosen": -2.185762882232666, "logits/rejected": -2.099052667617798, "logps/chosen": -151.68963623046875, "logps/rejected": -147.94314575195312, "loss": 0.6582, "rewards/accuracies": 0.5625, "rewards/chosen": -0.20870129764080048, "rewards/margins": 0.16736984252929688, "rewards/rejected": -0.37607109546661377, "step": 283 }, { "epoch": 0.3, "learning_rate": 4.453763107901675e-05, "logits/chosen": -2.1122889518737793, "logits/rejected": -2.010392189025879, "logps/chosen": -187.94906616210938, "logps/rejected": -185.841796875, "loss": 0.6521, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08321194350719452, "rewards/margins": 0.12147242575883865, "rewards/rejected": -0.20468439161777496, "step": 284 }, { "epoch": 0.3, "learning_rate": 4.4480389647019505e-05, "logits/chosen": -2.1214723587036133, "logits/rejected": -2.033308506011963, "logps/chosen": -152.14337158203125, "logps/rejected": -149.34632873535156, "loss": 0.7408, "rewards/accuracies": 0.625, "rewards/chosen": -0.3923906683921814, "rewards/margins": 0.008589165285229683, "rewards/rejected": -0.40097981691360474, "step": 285 }, { "epoch": 0.3, "learning_rate": 4.442288704560268e-05, "logits/chosen": -2.1580543518066406, "logits/rejected": -2.1600093841552734, "logps/chosen": -202.5330810546875, "logps/rejected": -194.6158905029297, "loss": 0.8982, "rewards/accuracies": 0.375, "rewards/chosen": -0.4735994040966034, "rewards/margins": -0.2955199182033539, "rewards/rejected": -0.1780795007944107, "step": 286 }, { "epoch": 0.3, "learning_rate": 4.436512404569136e-05, "logits/chosen": -2.1974916458129883, "logits/rejected": -2.259157657623291, "logps/chosen": -147.84683227539062, "logps/rejected": -164.73829650878906, "loss": 0.6104, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2959311902523041, "rewards/margins": 0.22050346434116364, "rewards/rejected": -0.5164346694946289, "step": 287 }, { "epoch": 0.3, "learning_rate": 4.430710142170176e-05, "logits/chosen": -2.341240644454956, "logits/rejected": -2.3171639442443848, "logps/chosen": -151.489990234375, "logps/rejected": -136.1929931640625, "loss": 0.7421, "rewards/accuracies": 0.3125, "rewards/chosen": -0.12041179090738297, "rewards/margins": -0.06451301276683807, "rewards/rejected": -0.0558987595140934, "step": 288 }, { "epoch": 0.3, "learning_rate": 4.424881995153076e-05, "logits/chosen": -2.076103448867798, "logits/rejected": -2.188572883605957, "logps/chosen": -155.2179718017578, "logps/rejected": -181.94903564453125, "loss": 0.7223, "rewards/accuracies": 0.375, "rewards/chosen": -0.1940208077430725, "rewards/margins": 0.03361191600561142, "rewards/rejected": -0.22763270139694214, "step": 289 }, { "epoch": 0.3, "learning_rate": 4.419028041654559e-05, "logits/chosen": -2.1491026878356934, "logits/rejected": -2.1136281490325928, "logps/chosen": -151.4420928955078, "logps/rejected": -141.8388214111328, "loss": 0.6722, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2384054809808731, "rewards/margins": 0.15212693810462952, "rewards/rejected": -0.39053237438201904, "step": 290 }, { "epoch": 0.3, "learning_rate": 4.4131483601573285e-05, "logits/chosen": -1.9776232242584229, "logits/rejected": -2.004852771759033, "logps/chosen": -170.9802703857422, "logps/rejected": -162.25498962402344, "loss": 0.7193, "rewards/accuracies": 0.625, "rewards/chosen": -0.18611471354961395, "rewards/margins": 0.00812564603984356, "rewards/rejected": -0.19424037635326385, "step": 291 }, { "epoch": 0.3, "learning_rate": 4.4072430294890174e-05, "logits/chosen": -2.0479369163513184, "logits/rejected": -2.0386621952056885, "logps/chosen": -178.59158325195312, "logps/rejected": -174.89373779296875, "loss": 0.6113, "rewards/accuracies": 0.8125, "rewards/chosen": -0.28108635544776917, "rewards/margins": 0.20999087393283844, "rewards/rejected": -0.4910773038864136, "step": 292 }, { "epoch": 0.31, "learning_rate": 4.4013121288211307e-05, "logits/chosen": -2.2802951335906982, "logits/rejected": -2.192854881286621, "logps/chosen": -153.86907958984375, "logps/rejected": -142.56341552734375, "loss": 0.7816, "rewards/accuracies": 0.375, "rewards/chosen": -0.07042498886585236, "rewards/margins": -0.13551297783851624, "rewards/rejected": 0.06508798897266388, "step": 293 }, { "epoch": 0.31, "learning_rate": 4.3953557376679856e-05, "logits/chosen": -2.2507810592651367, "logits/rejected": -2.2218830585479736, "logps/chosen": -125.35671997070312, "logps/rejected": -128.93533325195312, "loss": 0.7147, "rewards/accuracies": 0.5, "rewards/chosen": -0.10209127515554428, "rewards/margins": 0.0391690619289875, "rewards/rejected": -0.14126034080982208, "step": 294 }, { "epoch": 0.31, "learning_rate": 4.389373935885646e-05, "logits/chosen": -2.169445514678955, "logits/rejected": -2.194335460662842, "logps/chosen": -157.78758239746094, "logps/rejected": -165.60736083984375, "loss": 0.7124, "rewards/accuracies": 0.625, "rewards/chosen": -0.24438923597335815, "rewards/margins": 0.05163384974002838, "rewards/rejected": -0.29602310061454773, "step": 295 }, { "epoch": 0.31, "learning_rate": 4.383366803670849e-05, "logits/chosen": -2.2508602142333984, "logits/rejected": -2.30161714553833, "logps/chosen": -167.88427734375, "logps/rejected": -184.48272705078125, "loss": 0.7121, "rewards/accuracies": 0.4375, "rewards/chosen": -0.15736985206604004, "rewards/margins": 0.126334547996521, "rewards/rejected": -0.28370437026023865, "step": 296 }, { "epoch": 0.31, "learning_rate": 4.377334421559932e-05, "logits/chosen": -2.314563035964966, "logits/rejected": -2.2978732585906982, "logps/chosen": -179.24159240722656, "logps/rejected": -188.38165283203125, "loss": 0.7184, "rewards/accuracies": 0.5, "rewards/chosen": 0.012131119146943092, "rewards/margins": -0.01239142008125782, "rewards/rejected": 0.02452254109084606, "step": 297 }, { "epoch": 0.31, "learning_rate": 4.371276870427753e-05, "logits/chosen": -2.066857099533081, "logits/rejected": -2.174121856689453, "logps/chosen": -170.82960510253906, "logps/rejected": -189.02621459960938, "loss": 0.7574, "rewards/accuracies": 0.4375, "rewards/chosen": -0.28441551327705383, "rewards/margins": -0.09379325062036514, "rewards/rejected": -0.1906222403049469, "step": 298 }, { "epoch": 0.31, "learning_rate": 4.365194231486604e-05, "logits/chosen": -2.147336006164551, "logits/rejected": -2.132305383682251, "logps/chosen": -158.25559997558594, "logps/rejected": -152.197509765625, "loss": 0.6692, "rewards/accuracies": 0.4375, "rewards/chosen": -0.104363813996315, "rewards/margins": 0.12111049890518188, "rewards/rejected": -0.2254743129014969, "step": 299 }, { "epoch": 0.31, "learning_rate": 4.359086586285127e-05, "logits/chosen": -2.247628688812256, "logits/rejected": -2.286552667617798, "logps/chosen": -133.13673400878906, "logps/rejected": -183.82647705078125, "loss": 0.6152, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17196473479270935, "rewards/margins": 0.2282213419675827, "rewards/rejected": -0.40018609166145325, "step": 300 }, { "epoch": 0.31, "learning_rate": 4.3529540167072126e-05, "logits/chosen": -1.8818175792694092, "logits/rejected": -1.8767746686935425, "logps/chosen": -134.77548217773438, "logps/rejected": -151.11073303222656, "loss": 0.6999, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2484281212091446, "rewards/margins": 0.12181131541728973, "rewards/rejected": -0.3702394366264343, "step": 301 }, { "epoch": 0.32, "learning_rate": 4.346796604970912e-05, "logits/chosen": -2.107909679412842, "logits/rejected": -2.138780355453491, "logps/chosen": -168.53460693359375, "logps/rejected": -174.63592529296875, "loss": 0.7954, "rewards/accuracies": 0.5625, "rewards/chosen": -0.30226460099220276, "rewards/margins": -0.09562454372644424, "rewards/rejected": -0.20664002001285553, "step": 302 }, { "epoch": 0.32, "learning_rate": 4.340614433627328e-05, "logits/chosen": -2.1604933738708496, "logits/rejected": -2.2617201805114746, "logps/chosen": -155.14198303222656, "logps/rejected": -169.87091064453125, "loss": 0.6444, "rewards/accuracies": 0.5, "rewards/chosen": -0.07352495938539505, "rewards/margins": 0.1430576741695404, "rewards/rejected": -0.21658262610435486, "step": 303 }, { "epoch": 0.32, "learning_rate": 4.3344075855595104e-05, "logits/chosen": -2.1969313621520996, "logits/rejected": -2.2095913887023926, "logps/chosen": -165.4632568359375, "logps/rejected": -158.85653686523438, "loss": 0.7844, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2591025233268738, "rewards/margins": -0.1096937507390976, "rewards/rejected": -0.14940877258777618, "step": 304 }, { "epoch": 0.32, "learning_rate": 4.328176143981343e-05, "logits/chosen": -2.146892786026001, "logits/rejected": -2.1590354442596436, "logps/chosen": -165.599365234375, "logps/rejected": -155.86920166015625, "loss": 0.5594, "rewards/accuracies": 0.75, "rewards/chosen": 0.04538270831108093, "rewards/margins": 0.3189627528190613, "rewards/rejected": -0.27358004450798035, "step": 305 }, { "epoch": 0.32, "learning_rate": 4.321920192436433e-05, "logits/chosen": -2.226012706756592, "logits/rejected": -2.2147953510284424, "logps/chosen": -149.3193817138672, "logps/rejected": -182.29501342773438, "loss": 0.7058, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34394580125808716, "rewards/margins": 0.0967680886387825, "rewards/rejected": -0.44071388244628906, "step": 306 }, { "epoch": 0.32, "learning_rate": 4.315639814796983e-05, "logits/chosen": -2.0329627990722656, "logits/rejected": -2.1080126762390137, "logps/chosen": -145.89712524414062, "logps/rejected": -164.14891052246094, "loss": 0.6894, "rewards/accuracies": 0.5, "rewards/chosen": -0.22714479267597198, "rewards/margins": 0.17113275825977325, "rewards/rejected": -0.39827755093574524, "step": 307 }, { "epoch": 0.32, "learning_rate": 4.309335095262676e-05, "logits/chosen": -2.131873607635498, "logits/rejected": -2.3015997409820557, "logps/chosen": -146.17129516601562, "logps/rejected": -182.18138122558594, "loss": 0.6993, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3666872978210449, "rewards/margins": 0.09421360492706299, "rewards/rejected": -0.4609009325504303, "step": 308 }, { "epoch": 0.32, "learning_rate": 4.303006118359537e-05, "logits/chosen": -2.2067878246307373, "logits/rejected": -2.162324905395508, "logps/chosen": -169.20140075683594, "logps/rejected": -163.42198181152344, "loss": 0.7353, "rewards/accuracies": 0.3125, "rewards/chosen": -0.5356483459472656, "rewards/margins": -0.05132238194346428, "rewards/rejected": -0.48432594537734985, "step": 309 }, { "epoch": 0.32, "learning_rate": 4.296652968938807e-05, "logits/chosen": -2.0966219902038574, "logits/rejected": -2.096193313598633, "logps/chosen": -181.05987548828125, "logps/rejected": -199.04830932617188, "loss": 0.8487, "rewards/accuracies": 0.3125, "rewards/chosen": -0.5439967513084412, "rewards/margins": -0.17244365811347961, "rewards/rejected": -0.3715530335903168, "step": 310 }, { "epoch": 0.32, "learning_rate": 4.2902757321758016e-05, "logits/chosen": -2.0997745990753174, "logits/rejected": -2.11462140083313, "logps/chosen": -154.70413208007812, "logps/rejected": -166.63629150390625, "loss": 0.6186, "rewards/accuracies": 0.625, "rewards/chosen": -0.27216559648513794, "rewards/margins": 0.23034000396728516, "rewards/rejected": -0.5025056004524231, "step": 311 }, { "epoch": 0.33, "learning_rate": 4.283874493568772e-05, "logits/chosen": -2.1701714992523193, "logits/rejected": -2.2467424869537354, "logps/chosen": -172.70042419433594, "logps/rejected": -210.9363250732422, "loss": 0.7279, "rewards/accuracies": 0.4375, "rewards/chosen": -0.49327972531318665, "rewards/margins": 0.008369775488972664, "rewards/rejected": -0.5016494989395142, "step": 312 }, { "epoch": 0.33, "learning_rate": 4.2774493389377545e-05, "logits/chosen": -2.2590439319610596, "logits/rejected": -2.214010000228882, "logps/chosen": -149.47921752929688, "logps/rejected": -162.21078491210938, "loss": 0.7782, "rewards/accuracies": 0.4375, "rewards/chosen": -0.27489376068115234, "rewards/margins": -0.09422563016414642, "rewards/rejected": -0.18066814541816711, "step": 313 }, { "epoch": 0.33, "learning_rate": 4.271000354423426e-05, "logits/chosen": -2.179133892059326, "logits/rejected": -2.3145484924316406, "logps/chosen": -161.55218505859375, "logps/rejected": -180.9322509765625, "loss": 0.766, "rewards/accuracies": 0.5, "rewards/chosen": -0.41766488552093506, "rewards/margins": -0.06459204852581024, "rewards/rejected": -0.353072851896286, "step": 314 }, { "epoch": 0.33, "learning_rate": 4.2645276264859394e-05, "logits/chosen": -2.14973521232605, "logits/rejected": -2.122270107269287, "logps/chosen": -169.07066345214844, "logps/rejected": -148.82968139648438, "loss": 0.7404, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4353243112564087, "rewards/margins": -0.06003642827272415, "rewards/rejected": -0.3752879202365875, "step": 315 }, { "epoch": 0.33, "learning_rate": 4.258031241903778e-05, "logits/chosen": -2.2368862628936768, "logits/rejected": -2.231748104095459, "logps/chosen": -228.32550048828125, "logps/rejected": -248.53265380859375, "loss": 0.7322, "rewards/accuracies": 0.4375, "rewards/chosen": -0.48313820362091064, "rewards/margins": -0.03280310332775116, "rewards/rejected": -0.4503350555896759, "step": 316 }, { "epoch": 0.33, "learning_rate": 4.251511287772579e-05, "logits/chosen": -2.172724485397339, "logits/rejected": -2.166696786880493, "logps/chosen": -166.26548767089844, "logps/rejected": -189.61898803710938, "loss": 0.7494, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4010167717933655, "rewards/margins": -0.011104248464107513, "rewards/rejected": -0.38991254568099976, "step": 317 }, { "epoch": 0.33, "learning_rate": 4.2449678515039747e-05, "logits/chosen": -2.168539047241211, "logits/rejected": -2.230973243713379, "logps/chosen": -150.7926025390625, "logps/rejected": -141.18051147460938, "loss": 0.801, "rewards/accuracies": 0.5, "rewards/chosen": -0.29772108793258667, "rewards/margins": -0.10154817998409271, "rewards/rejected": -0.19617292284965515, "step": 318 }, { "epoch": 0.33, "learning_rate": 4.238401020824416e-05, "logits/chosen": -2.1671128273010254, "logits/rejected": -2.1290652751922607, "logps/chosen": -163.53701782226562, "logps/rejected": -176.98638916015625, "loss": 0.6244, "rewards/accuracies": 0.625, "rewards/chosen": -0.26465463638305664, "rewards/margins": 0.2812088131904602, "rewards/rejected": -0.5458635091781616, "step": 319 }, { "epoch": 0.33, "learning_rate": 4.231810883773999e-05, "logits/chosen": -2.0769715309143066, "logits/rejected": -2.2087574005126953, "logps/chosen": -143.20196533203125, "logps/rejected": -185.92027282714844, "loss": 0.7082, "rewards/accuracies": 0.625, "rewards/chosen": -0.41921350359916687, "rewards/margins": 0.10322752594947815, "rewards/rejected": -0.522441029548645, "step": 320 }, { "epoch": 0.33, "learning_rate": 4.2251975287052804e-05, "logits/chosen": -2.1802303791046143, "logits/rejected": -2.2122409343719482, "logps/chosen": -156.01007080078125, "logps/rejected": -183.6471405029297, "loss": 0.6816, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2279636561870575, "rewards/margins": 0.09283652901649475, "rewards/rejected": -0.32080018520355225, "step": 321 }, { "epoch": 0.34, "learning_rate": 4.218561044282099e-05, "logits/chosen": -2.113987684249878, "logits/rejected": -2.1755659580230713, "logps/chosen": -183.98928833007812, "logps/rejected": -201.93418884277344, "loss": 0.6784, "rewards/accuracies": 0.625, "rewards/chosen": -0.39034149050712585, "rewards/margins": 0.08643309772014618, "rewards/rejected": -0.47677451372146606, "step": 322 }, { "epoch": 0.34, "learning_rate": 4.211901519478382e-05, "logits/chosen": -2.139608144760132, "logits/rejected": -2.3428738117218018, "logps/chosen": -165.4562530517578, "logps/rejected": -215.16778564453125, "loss": 0.631, "rewards/accuracies": 0.5, "rewards/chosen": -0.6147371530532837, "rewards/margins": 0.23880484700202942, "rewards/rejected": -0.8535419702529907, "step": 323 }, { "epoch": 0.34, "learning_rate": 4.2052190435769554e-05, "logits/chosen": -2.1603267192840576, "logits/rejected": -2.0630412101745605, "logps/chosen": -173.13119506835938, "logps/rejected": -156.80117797851562, "loss": 0.6334, "rewards/accuracies": 0.875, "rewards/chosen": -0.2138206511735916, "rewards/margins": 0.18611598014831543, "rewards/rejected": -0.39993664622306824, "step": 324 }, { "epoch": 0.34, "learning_rate": 4.198513706168345e-05, "logits/chosen": -2.132692813873291, "logits/rejected": -2.117668390274048, "logps/chosen": -163.94891357421875, "logps/rejected": -177.30056762695312, "loss": 0.6352, "rewards/accuracies": 0.625, "rewards/chosen": -0.3021223843097687, "rewards/margins": 0.18045057356357574, "rewards/rejected": -0.4825729727745056, "step": 325 }, { "epoch": 0.34, "learning_rate": 4.191785597149577e-05, "logits/chosen": -2.129894495010376, "logits/rejected": -2.126570224761963, "logps/chosen": -233.8136749267578, "logps/rejected": -209.49566650390625, "loss": 0.7343, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5364924073219299, "rewards/margins": -0.013996928930282593, "rewards/rejected": -0.5224955677986145, "step": 326 }, { "epoch": 0.34, "learning_rate": 4.1850348067229696e-05, "logits/chosen": -2.096973419189453, "logits/rejected": -2.1738548278808594, "logps/chosen": -152.7852020263672, "logps/rejected": -173.36817932128906, "loss": 0.7105, "rewards/accuracies": 0.5, "rewards/chosen": -0.07808439433574677, "rewards/margins": 0.017765391618013382, "rewards/rejected": -0.09584978222846985, "step": 327 }, { "epoch": 0.34, "learning_rate": 4.178261425394926e-05, "logits/chosen": -2.026822566986084, "logits/rejected": -2.074733257293701, "logps/chosen": -171.08468627929688, "logps/rejected": -204.09425354003906, "loss": 0.8344, "rewards/accuracies": 0.25, "rewards/chosen": -0.6632390022277832, "rewards/margins": -0.23066554963588715, "rewards/rejected": -0.4325733780860901, "step": 328 }, { "epoch": 0.34, "learning_rate": 4.171465543974723e-05, "logits/chosen": -2.205124855041504, "logits/rejected": -2.198807716369629, "logps/chosen": -153.81307983398438, "logps/rejected": -165.59127807617188, "loss": 0.6848, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3507269620895386, "rewards/margins": 0.07101988792419434, "rewards/rejected": -0.4217468202114105, "step": 329 }, { "epoch": 0.34, "learning_rate": 4.1646472535732895e-05, "logits/chosen": -2.2543835639953613, "logits/rejected": -2.169010877609253, "logps/chosen": -193.3108673095703, "logps/rejected": -163.5917510986328, "loss": 0.7477, "rewards/accuracies": 0.3125, "rewards/chosen": -0.4203927218914032, "rewards/margins": -0.05665392428636551, "rewards/rejected": -0.3637387752532959, "step": 330 }, { "epoch": 0.35, "learning_rate": 4.157806645601988e-05, "logits/chosen": -1.9615943431854248, "logits/rejected": -2.026665210723877, "logps/chosen": -188.08934020996094, "logps/rejected": -210.8447265625, "loss": 0.6278, "rewards/accuracies": 0.625, "rewards/chosen": -0.26600322127342224, "rewards/margins": 0.22521573305130005, "rewards/rejected": -0.4912189245223999, "step": 331 }, { "epoch": 0.35, "learning_rate": 4.1509438117713866e-05, "logits/chosen": -2.1103501319885254, "logits/rejected": -2.092162609100342, "logps/chosen": -152.57652282714844, "logps/rejected": -155.68255615234375, "loss": 0.7481, "rewards/accuracies": 0.5625, "rewards/chosen": -0.19243402779102325, "rewards/margins": -0.01794758439064026, "rewards/rejected": -0.1744864583015442, "step": 332 }, { "epoch": 0.35, "learning_rate": 4.144058844090032e-05, "logits/chosen": -2.059112310409546, "logits/rejected": -2.1364006996154785, "logps/chosen": -134.24253845214844, "logps/rejected": -140.66732788085938, "loss": 0.7082, "rewards/accuracies": 0.4375, "rewards/chosen": -0.19404403865337372, "rewards/margins": 0.034823037683963776, "rewards/rejected": -0.2288670837879181, "step": 333 }, { "epoch": 0.35, "learning_rate": 4.137151834863213e-05, "logits/chosen": -2.178894519805908, "logits/rejected": -2.218296766281128, "logps/chosen": -167.58921813964844, "logps/rejected": -184.11642456054688, "loss": 0.6905, "rewards/accuracies": 0.5, "rewards/chosen": -0.18130388855934143, "rewards/margins": 0.06935537606477737, "rewards/rejected": -0.2506592571735382, "step": 334 }, { "epoch": 0.35, "learning_rate": 4.130222876691726e-05, "logits/chosen": -1.9210792779922485, "logits/rejected": -1.9056644439697266, "logps/chosen": -248.0112762451172, "logps/rejected": -249.83152770996094, "loss": 0.6878, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5254245400428772, "rewards/margins": 0.07501597702503204, "rewards/rejected": -0.600440502166748, "step": 335 }, { "epoch": 0.35, "learning_rate": 4.123272062470633e-05, "logits/chosen": -2.2695250511169434, "logits/rejected": -2.3075647354125977, "logps/chosen": -175.56838989257812, "logps/rejected": -185.3209686279297, "loss": 0.6858, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5953464508056641, "rewards/margins": 0.08486279845237732, "rewards/rejected": -0.680209219455719, "step": 336 }, { "epoch": 0.35, "learning_rate": 4.116299485388014e-05, "logits/chosen": -2.1485931873321533, "logits/rejected": -2.143951177597046, "logps/chosen": -147.05918884277344, "logps/rejected": -146.83811950683594, "loss": 0.8233, "rewards/accuracies": 0.25, "rewards/chosen": -0.5940690040588379, "rewards/margins": -0.17442089319229126, "rewards/rejected": -0.4196482002735138, "step": 337 }, { "epoch": 0.35, "learning_rate": 4.109305238923718e-05, "logits/chosen": -2.151376247406006, "logits/rejected": -2.2524118423461914, "logps/chosen": -258.1955261230469, "logps/rejected": -254.03448486328125, "loss": 0.6856, "rewards/accuracies": 0.5, "rewards/chosen": -0.5329893231391907, "rewards/margins": 0.19465385377407074, "rewards/rejected": -0.7276431322097778, "step": 338 }, { "epoch": 0.35, "learning_rate": 4.102289416848114e-05, "logits/chosen": -2.141131639480591, "logits/rejected": -2.094794511795044, "logps/chosen": -143.72801208496094, "logps/rejected": -137.58067321777344, "loss": 0.7808, "rewards/accuracies": 0.4375, "rewards/chosen": -0.384622186422348, "rewards/margins": -0.13031712174415588, "rewards/rejected": -0.25430506467819214, "step": 339 }, { "epoch": 0.35, "learning_rate": 4.095252113220827e-05, "logits/chosen": -2.16725492477417, "logits/rejected": -2.1304190158843994, "logps/chosen": -168.14285278320312, "logps/rejected": -173.74656677246094, "loss": 0.7767, "rewards/accuracies": 0.375, "rewards/chosen": -0.37200167775154114, "rewards/margins": -0.032151952385902405, "rewards/rejected": -0.3398497402667999, "step": 340 }, { "epoch": 0.36, "learning_rate": 4.088193422389484e-05, "logits/chosen": -2.1071646213531494, "logits/rejected": -2.1935393810272217, "logps/chosen": -165.9573516845703, "logps/rejected": -193.26974487304688, "loss": 0.5765, "rewards/accuracies": 0.6875, "rewards/chosen": -0.30919840931892395, "rewards/margins": 0.3557150065898895, "rewards/rejected": -0.6649134159088135, "step": 341 }, { "epoch": 0.36, "learning_rate": 4.0811134389884433e-05, "logits/chosen": -1.9773459434509277, "logits/rejected": -2.059852361679077, "logps/chosen": -149.0285186767578, "logps/rejected": -159.69705200195312, "loss": 0.641, "rewards/accuracies": 0.625, "rewards/chosen": -0.1402963101863861, "rewards/margins": 0.19628655910491943, "rewards/rejected": -0.33658286929130554, "step": 342 }, { "epoch": 0.36, "learning_rate": 4.0740122579375286e-05, "logits/chosen": -2.0288474559783936, "logits/rejected": -2.244412422180176, "logps/chosen": -158.99160766601562, "logps/rejected": -198.47886657714844, "loss": 0.6393, "rewards/accuracies": 0.5, "rewards/chosen": -0.3524281680583954, "rewards/margins": 0.20226937532424927, "rewards/rejected": -0.5546976327896118, "step": 343 }, { "epoch": 0.36, "learning_rate": 4.066889974440757e-05, "logits/chosen": -1.9884339570999146, "logits/rejected": -2.0600476264953613, "logps/chosen": -149.9541473388672, "logps/rejected": -168.91061401367188, "loss": 0.7141, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3000350594520569, "rewards/margins": 0.055553682148456573, "rewards/rejected": -0.35558873414993286, "step": 344 }, { "epoch": 0.36, "learning_rate": 4.0597466839850595e-05, "logits/chosen": -2.229095935821533, "logits/rejected": -2.208395481109619, "logps/chosen": -180.67138671875, "logps/rejected": -191.8995361328125, "loss": 0.8027, "rewards/accuracies": 0.375, "rewards/chosen": -0.5766149163246155, "rewards/margins": -0.08274443447589874, "rewards/rejected": -0.4938705563545227, "step": 345 }, { "epoch": 0.36, "learning_rate": 4.0525824823390045e-05, "logits/chosen": -1.9827308654785156, "logits/rejected": -2.038292646408081, "logps/chosen": -137.3387451171875, "logps/rejected": -156.16510009765625, "loss": 0.6739, "rewards/accuracies": 0.5, "rewards/chosen": -0.24382978677749634, "rewards/margins": 0.09684039652347565, "rewards/rejected": -0.3406701982021332, "step": 346 }, { "epoch": 0.36, "learning_rate": 4.045397465551513e-05, "logits/chosen": -2.0480711460113525, "logits/rejected": -2.0361733436584473, "logps/chosen": -173.17909240722656, "logps/rejected": -157.0221405029297, "loss": 0.7591, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5380522012710571, "rewards/margins": -0.046980153769254684, "rewards/rejected": -0.4910720884799957, "step": 347 }, { "epoch": 0.36, "learning_rate": 4.038191729950569e-05, "logits/chosen": -2.229896068572998, "logits/rejected": -2.211841106414795, "logps/chosen": -167.17428588867188, "logps/rejected": -167.23196411132812, "loss": 0.8467, "rewards/accuracies": 0.25, "rewards/chosen": -0.6671632528305054, "rewards/margins": -0.22287489473819733, "rewards/rejected": -0.44428837299346924, "step": 348 }, { "epoch": 0.36, "learning_rate": 4.030965372141927e-05, "logits/chosen": -2.0725326538085938, "logits/rejected": -2.0685665607452393, "logps/chosen": -151.9163360595703, "logps/rejected": -158.74620056152344, "loss": 0.6537, "rewards/accuracies": 0.5, "rewards/chosen": -0.27225998044013977, "rewards/margins": 0.1315668821334839, "rewards/rejected": -0.40382686257362366, "step": 349 }, { "epoch": 0.37, "learning_rate": 4.0237184890078245e-05, "logits/chosen": -2.1178064346313477, "logits/rejected": -2.1523594856262207, "logps/chosen": -157.16265869140625, "logps/rejected": -176.5404052734375, "loss": 0.622, "rewards/accuracies": 0.6875, "rewards/chosen": -0.31149694323539734, "rewards/margins": 0.21072791516780853, "rewards/rejected": -0.5222248435020447, "step": 350 }, { "epoch": 0.37, "learning_rate": 4.0164511777056725e-05, "logits/chosen": -2.2286159992218018, "logits/rejected": -2.2007691860198975, "logps/chosen": -186.49661254882812, "logps/rejected": -188.19149780273438, "loss": 0.7013, "rewards/accuracies": 0.75, "rewards/chosen": -0.5486454963684082, "rewards/margins": 0.11480455100536346, "rewards/rejected": -0.6634500622749329, "step": 351 }, { "epoch": 0.37, "learning_rate": 4.009163535666761e-05, "logits/chosen": -2.182291030883789, "logits/rejected": -2.1893458366394043, "logps/chosen": -148.7130126953125, "logps/rejected": -159.863037109375, "loss": 0.6501, "rewards/accuracies": 0.5625, "rewards/chosen": -0.23504705727100372, "rewards/margins": 0.17834332585334778, "rewards/rejected": -0.4133903682231903, "step": 352 }, { "epoch": 0.37, "learning_rate": 4.001855660594948e-05, "logits/chosen": -2.0689799785614014, "logits/rejected": -2.133513927459717, "logps/chosen": -193.48846435546875, "logps/rejected": -231.55738830566406, "loss": 0.6689, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5353763699531555, "rewards/margins": 0.13139232993125916, "rewards/rejected": -0.6667687296867371, "step": 353 }, { "epoch": 0.37, "learning_rate": 3.994527650465352e-05, "logits/chosen": -2.2244315147399902, "logits/rejected": -2.206336259841919, "logps/chosen": -153.9037628173828, "logps/rejected": -153.5791778564453, "loss": 0.6506, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10444588959217072, "rewards/margins": 0.24528437852859497, "rewards/rejected": -0.3497302234172821, "step": 354 }, { "epoch": 0.37, "learning_rate": 3.98717960352304e-05, "logits/chosen": -2.0277304649353027, "logits/rejected": -1.9793894290924072, "logps/chosen": -153.72535705566406, "logps/rejected": -152.46961975097656, "loss": 0.7438, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5082700252532959, "rewards/margins": 0.030984222888946533, "rewards/rejected": -0.5392543077468872, "step": 355 }, { "epoch": 0.37, "learning_rate": 3.979811618281706e-05, "logits/chosen": -2.0062384605407715, "logits/rejected": -2.057262420654297, "logps/chosen": -136.8114013671875, "logps/rejected": -134.69456481933594, "loss": 0.7313, "rewards/accuracies": 0.5, "rewards/chosen": -0.2975345849990845, "rewards/margins": 0.03512765094637871, "rewards/rejected": -0.3326622247695923, "step": 356 }, { "epoch": 0.37, "learning_rate": 3.972423793522352e-05, "logits/chosen": -2.0485219955444336, "logits/rejected": -2.085298776626587, "logps/chosen": -193.29806518554688, "logps/rejected": -208.20973205566406, "loss": 0.818, "rewards/accuracies": 0.25, "rewards/chosen": -0.6571996212005615, "rewards/margins": -0.1324111372232437, "rewards/rejected": -0.524788498878479, "step": 357 }, { "epoch": 0.37, "learning_rate": 3.9650162282919655e-05, "logits/chosen": -1.9818403720855713, "logits/rejected": -2.0531868934631348, "logps/chosen": -158.05345153808594, "logps/rejected": -157.64669799804688, "loss": 0.7875, "rewards/accuracies": 0.5, "rewards/chosen": -0.27151066064834595, "rewards/margins": -0.05360978841781616, "rewards/rejected": -0.21790087223052979, "step": 358 }, { "epoch": 0.37, "learning_rate": 3.957589021902191e-05, "logits/chosen": -2.1913740634918213, "logits/rejected": -2.147808790206909, "logps/chosen": -158.68458557128906, "logps/rejected": -168.61692810058594, "loss": 0.8811, "rewards/accuracies": 0.375, "rewards/chosen": -0.6251986622810364, "rewards/margins": -0.24323511123657227, "rewards/rejected": -0.3819635510444641, "step": 359 }, { "epoch": 0.38, "learning_rate": 3.9501422739279956e-05, "logits/chosen": -1.9855284690856934, "logits/rejected": -2.0047171115875244, "logps/chosen": -158.85598754882812, "logps/rejected": -186.7561492919922, "loss": 0.8237, "rewards/accuracies": 0.1875, "rewards/chosen": -0.4131236970424652, "rewards/margins": -0.2267259657382965, "rewards/rejected": -0.1863977611064911, "step": 360 }, { "epoch": 0.38, "learning_rate": 3.942676084206338e-05, "logits/chosen": -2.1845693588256836, "logits/rejected": -2.2711105346679688, "logps/chosen": -153.8560791015625, "logps/rejected": -188.9151611328125, "loss": 0.661, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2779008448123932, "rewards/margins": 0.1375725418329239, "rewards/rejected": -0.4154733717441559, "step": 361 }, { "epoch": 0.38, "learning_rate": 3.9351905528348285e-05, "logits/chosen": -2.063652515411377, "logits/rejected": -2.1154680252075195, "logps/chosen": -154.21665954589844, "logps/rejected": -166.70904541015625, "loss": 0.6842, "rewards/accuracies": 0.5, "rewards/chosen": -0.302326500415802, "rewards/margins": 0.07578597962856293, "rewards/rejected": -0.3781124949455261, "step": 362 }, { "epoch": 0.38, "learning_rate": 3.927685780170385e-05, "logits/chosen": -2.115208625793457, "logits/rejected": -2.042466878890991, "logps/chosen": -133.27244567871094, "logps/rejected": -123.8929214477539, "loss": 0.667, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09392361342906952, "rewards/margins": 0.1019444689154625, "rewards/rejected": -0.1958681046962738, "step": 363 }, { "epoch": 0.38, "learning_rate": 3.920161866827889e-05, "logits/chosen": -2.167541980743408, "logits/rejected": -2.15377140045166, "logps/chosen": -152.6026611328125, "logps/rejected": -145.47889709472656, "loss": 0.723, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22231236100196838, "rewards/margins": 0.015478478744626045, "rewards/rejected": -0.23779082298278809, "step": 364 }, { "epoch": 0.38, "learning_rate": 3.9126189136788416e-05, "logits/chosen": -2.1280405521392822, "logits/rejected": -1.9908369779586792, "logps/chosen": -146.22325134277344, "logps/rejected": -131.01043701171875, "loss": 0.6667, "rewards/accuracies": 0.5625, "rewards/chosen": -0.32886457443237305, "rewards/margins": 0.1325063705444336, "rewards/rejected": -0.46137094497680664, "step": 365 }, { "epoch": 0.38, "learning_rate": 3.90505702185e-05, "logits/chosen": -2.0596060752868652, "logits/rejected": -1.9969902038574219, "logps/chosen": -164.17156982421875, "logps/rejected": -137.92181396484375, "loss": 0.8594, "rewards/accuracies": 0.375, "rewards/chosen": -0.4894620478153229, "rewards/margins": -0.21743880212306976, "rewards/rejected": -0.2720232605934143, "step": 366 }, { "epoch": 0.38, "learning_rate": 3.897476292722034e-05, "logits/chosen": -1.9921385049819946, "logits/rejected": -2.1082358360290527, "logps/chosen": -140.42034912109375, "logps/rejected": -164.16915893554688, "loss": 0.6899, "rewards/accuracies": 0.375, "rewards/chosen": -0.26265978813171387, "rewards/margins": 0.043181706219911575, "rewards/rejected": -0.30584150552749634, "step": 367 }, { "epoch": 0.38, "learning_rate": 3.889876827928156e-05, "logits/chosen": -2.0175795555114746, "logits/rejected": -2.040492296218872, "logps/chosen": -152.78158569335938, "logps/rejected": -151.21533203125, "loss": 0.595, "rewards/accuracies": 0.625, "rewards/chosen": -0.16340675950050354, "rewards/margins": 0.2702358067035675, "rewards/rejected": -0.43364256620407104, "step": 368 }, { "epoch": 0.38, "learning_rate": 3.882258729352768e-05, "logits/chosen": -2.0957415103912354, "logits/rejected": -2.0559308528900146, "logps/chosen": -185.1763153076172, "logps/rejected": -192.24658203125, "loss": 0.6894, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2837347686290741, "rewards/margins": 0.07096924632787704, "rewards/rejected": -0.35470402240753174, "step": 369 }, { "epoch": 0.39, "learning_rate": 3.874622099130087e-05, "logits/chosen": -1.9157230854034424, "logits/rejected": -1.9512850046157837, "logps/chosen": -149.11521911621094, "logps/rejected": -165.88624572753906, "loss": 0.6754, "rewards/accuracies": 0.6875, "rewards/chosen": -0.335940957069397, "rewards/margins": 0.11438290774822235, "rewards/rejected": -0.45032384991645813, "step": 370 }, { "epoch": 0.39, "learning_rate": 3.866967039642784e-05, "logits/chosen": -2.0574257373809814, "logits/rejected": -2.203120470046997, "logps/chosen": -158.74758911132812, "logps/rejected": -173.32127380371094, "loss": 0.6842, "rewards/accuracies": 0.6875, "rewards/chosen": -0.40988606214523315, "rewards/margins": 0.061044152826070786, "rewards/rejected": -0.4709302484989166, "step": 371 }, { "epoch": 0.39, "learning_rate": 3.859293653520604e-05, "logits/chosen": -2.053711175918579, "logits/rejected": -1.980366587638855, "logps/chosen": -214.1997528076172, "logps/rejected": -201.98345947265625, "loss": 0.6636, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2103947103023529, "rewards/margins": 0.11527465283870697, "rewards/rejected": -0.3256693482398987, "step": 372 }, { "epoch": 0.39, "learning_rate": 3.851602043638994e-05, "logits/chosen": -2.012058973312378, "logits/rejected": -1.9634625911712646, "logps/chosen": -167.78753662109375, "logps/rejected": -190.8964385986328, "loss": 0.6111, "rewards/accuracies": 0.5625, "rewards/chosen": -0.26867952942848206, "rewards/margins": 0.29532575607299805, "rewards/rejected": -0.5640051960945129, "step": 373 }, { "epoch": 0.39, "learning_rate": 3.843892313117724e-05, "logits/chosen": -2.0894453525543213, "logits/rejected": -2.1030113697052, "logps/chosen": -155.4733123779297, "logps/rejected": -177.70977783203125, "loss": 0.784, "rewards/accuracies": 0.4375, "rewards/chosen": -0.658606767654419, "rewards/margins": -0.06122620403766632, "rewards/rejected": -0.597380518913269, "step": 374 }, { "epoch": 0.39, "learning_rate": 3.8361645653195026e-05, "logits/chosen": -2.1214489936828613, "logits/rejected": -2.1964516639709473, "logps/chosen": -163.82373046875, "logps/rejected": -185.65740966796875, "loss": 0.7088, "rewards/accuracies": 0.5, "rewards/chosen": -0.20250454545021057, "rewards/margins": 0.09702645242214203, "rewards/rejected": -0.2995309829711914, "step": 375 }, { "epoch": 0.39, "learning_rate": 3.8284189038485936e-05, "logits/chosen": -2.225022554397583, "logits/rejected": -2.2104990482330322, "logps/chosen": -160.45584106445312, "logps/rejected": -158.27365112304688, "loss": 0.6533, "rewards/accuracies": 0.75, "rewards/chosen": -0.3125608265399933, "rewards/margins": 0.14464329183101654, "rewards/rejected": -0.45720410346984863, "step": 376 }, { "epoch": 0.39, "learning_rate": 3.8206554325494225e-05, "logits/chosen": -2.246929168701172, "logits/rejected": -2.206899881362915, "logps/chosen": -177.77415466308594, "logps/rejected": -168.1491241455078, "loss": 0.6993, "rewards/accuracies": 0.625, "rewards/chosen": -0.4578195810317993, "rewards/margins": 0.06638437509536743, "rewards/rejected": -0.5242039561271667, "step": 377 }, { "epoch": 0.39, "learning_rate": 3.812874255505191e-05, "logits/chosen": -2.2009379863739014, "logits/rejected": -2.1823983192443848, "logps/chosen": -149.4908905029297, "logps/rejected": -165.87646484375, "loss": 0.822, "rewards/accuracies": 0.375, "rewards/chosen": -0.47621116042137146, "rewards/margins": -0.1242537796497345, "rewards/rejected": -0.35195738077163696, "step": 378 }, { "epoch": 0.4, "learning_rate": 3.805075477036476e-05, "logits/chosen": -2.1507351398468018, "logits/rejected": -2.098275661468506, "logps/chosen": -155.75393676757812, "logps/rejected": -150.32437133789062, "loss": 0.6496, "rewards/accuracies": 0.5, "rewards/chosen": -0.23333942890167236, "rewards/margins": 0.19434207677841187, "rewards/rejected": -0.42768150568008423, "step": 379 }, { "epoch": 0.4, "learning_rate": 3.797259201699833e-05, "logits/chosen": -2.231349468231201, "logits/rejected": -2.2406768798828125, "logps/chosen": -165.03302001953125, "logps/rejected": -160.68557739257812, "loss": 0.6536, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2146168351173401, "rewards/margins": 0.12036348879337311, "rewards/rejected": -0.334980309009552, "step": 380 }, { "epoch": 0.4, "learning_rate": 3.789425534286394e-05, "logits/chosen": -2.3824687004089355, "logits/rejected": -2.3478920459747314, "logps/chosen": -267.89453125, "logps/rejected": -268.1957702636719, "loss": 0.8122, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3535911440849304, "rewards/margins": -0.1552121490240097, "rewards/rejected": -0.19837898015975952, "step": 381 }, { "epoch": 0.4, "learning_rate": 3.781574579820464e-05, "logits/chosen": -2.171052932739258, "logits/rejected": -2.204153299331665, "logps/chosen": -226.82186889648438, "logps/rejected": -241.44273376464844, "loss": 0.6833, "rewards/accuracies": 0.5, "rewards/chosen": -0.4559452533721924, "rewards/margins": 0.0971999317407608, "rewards/rejected": -0.553145170211792, "step": 382 }, { "epoch": 0.4, "learning_rate": 3.773706443558111e-05, "logits/chosen": -2.1312382221221924, "logits/rejected": -2.159982442855835, "logps/chosen": -169.75729370117188, "logps/rejected": -180.92498779296875, "loss": 0.769, "rewards/accuracies": 0.375, "rewards/chosen": -0.4790065586566925, "rewards/margins": -0.034655213356018066, "rewards/rejected": -0.44435134530067444, "step": 383 }, { "epoch": 0.4, "learning_rate": 3.765821230985758e-05, "logits/chosen": -2.1556124687194824, "logits/rejected": -2.1452436447143555, "logps/chosen": -205.10719299316406, "logps/rejected": -178.97406005859375, "loss": 0.6892, "rewards/accuracies": 0.3125, "rewards/chosen": -0.4622449278831482, "rewards/margins": 0.09195668250322342, "rewards/rejected": -0.554201602935791, "step": 384 }, { "epoch": 0.4, "learning_rate": 3.75791904781876e-05, "logits/chosen": -2.2256433963775635, "logits/rejected": -2.1840739250183105, "logps/chosen": -173.39869689941406, "logps/rejected": -180.0209503173828, "loss": 0.7169, "rewards/accuracies": 0.625, "rewards/chosen": -0.41330060362815857, "rewards/margins": 0.014306016266345978, "rewards/rejected": -0.42760664224624634, "step": 385 }, { "epoch": 0.4, "learning_rate": 3.7500000000000003e-05, "logits/chosen": -2.2018544673919678, "logits/rejected": -2.175457000732422, "logps/chosen": -141.57928466796875, "logps/rejected": -142.81686401367188, "loss": 0.6963, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1722419261932373, "rewards/margins": 0.04862082004547119, "rewards/rejected": -0.2208627462387085, "step": 386 }, { "epoch": 0.4, "learning_rate": 3.74206419369846e-05, "logits/chosen": -2.224078893661499, "logits/rejected": -2.2861950397491455, "logps/chosen": -193.62677001953125, "logps/rejected": -196.46714782714844, "loss": 0.8549, "rewards/accuracies": 0.125, "rewards/chosen": -0.5895551443099976, "rewards/margins": -0.2451256364583969, "rewards/rejected": -0.34442949295043945, "step": 387 }, { "epoch": 0.4, "learning_rate": 3.7341117353077966e-05, "logits/chosen": -2.3696727752685547, "logits/rejected": -2.3207218647003174, "logps/chosen": -237.3257598876953, "logps/rejected": -208.5120391845703, "loss": 0.6565, "rewards/accuracies": 0.5, "rewards/chosen": -0.47394251823425293, "rewards/margins": 0.12043605744838715, "rewards/rejected": -0.5943784713745117, "step": 388 }, { "epoch": 0.41, "learning_rate": 3.726142731444921e-05, "logits/chosen": -2.1822972297668457, "logits/rejected": -2.2831835746765137, "logps/chosen": -150.12652587890625, "logps/rejected": -144.74551391601562, "loss": 0.7884, "rewards/accuracies": 0.4375, "rewards/chosen": -0.33526331186294556, "rewards/margins": -0.13741034269332886, "rewards/rejected": -0.1978529691696167, "step": 389 }, { "epoch": 0.41, "learning_rate": 3.718157288948563e-05, "logits/chosen": -2.2395238876342773, "logits/rejected": -2.2703099250793457, "logps/chosen": -177.6690216064453, "logps/rejected": -185.6038818359375, "loss": 0.5714, "rewards/accuracies": 0.6875, "rewards/chosen": -0.45524442195892334, "rewards/margins": 0.34756022691726685, "rewards/rejected": -0.8028046488761902, "step": 390 }, { "epoch": 0.41, "learning_rate": 3.710155514877844e-05, "logits/chosen": -2.2453153133392334, "logits/rejected": -2.2400312423706055, "logps/chosen": -161.18931579589844, "logps/rejected": -155.0644073486328, "loss": 0.9213, "rewards/accuracies": 0.4375, "rewards/chosen": -0.633940577507019, "rewards/margins": -0.3272451162338257, "rewards/rejected": -0.30669546127319336, "step": 391 }, { "epoch": 0.41, "learning_rate": 3.702137516510838e-05, "logits/chosen": -2.1709861755371094, "logits/rejected": -2.1527490615844727, "logps/chosen": -149.6127166748047, "logps/rejected": -135.25987243652344, "loss": 0.6775, "rewards/accuracies": 0.5625, "rewards/chosen": -0.31984999775886536, "rewards/margins": 0.08133503049612045, "rewards/rejected": -0.4011850357055664, "step": 392 }, { "epoch": 0.41, "learning_rate": 3.694103401343136e-05, "logits/chosen": -2.3013548851013184, "logits/rejected": -2.2986433506011963, "logps/chosen": -165.6312255859375, "logps/rejected": -174.7577362060547, "loss": 0.7364, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5793277025222778, "rewards/margins": -0.015010036528110504, "rewards/rejected": -0.5643177032470703, "step": 393 }, { "epoch": 0.41, "learning_rate": 3.686053277086401e-05, "logits/chosen": -2.1550047397613525, "logits/rejected": -2.246464252471924, "logps/chosen": -147.44175720214844, "logps/rejected": -154.8894805908203, "loss": 0.7561, "rewards/accuracies": 0.375, "rewards/chosen": -0.3496069014072418, "rewards/margins": -0.07592416554689407, "rewards/rejected": -0.27368271350860596, "step": 394 }, { "epoch": 0.41, "learning_rate": 3.6779872516669295e-05, "logits/chosen": -2.1460325717926025, "logits/rejected": -2.154590129852295, "logps/chosen": -151.6021728515625, "logps/rejected": -168.52456665039062, "loss": 0.5368, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2864820659160614, "rewards/margins": 0.4405989944934845, "rewards/rejected": -0.7270810008049011, "step": 395 }, { "epoch": 0.41, "learning_rate": 3.669905433224199e-05, "logits/chosen": -2.315129041671753, "logits/rejected": -2.4050261974334717, "logps/chosen": -146.9856414794922, "logps/rejected": -169.13397216796875, "loss": 0.7616, "rewards/accuracies": 0.5, "rewards/chosen": -0.2588258385658264, "rewards/margins": -0.059488385915756226, "rewards/rejected": -0.199337437748909, "step": 396 }, { "epoch": 0.41, "learning_rate": 3.6618079301094216e-05, "logits/chosen": -2.233609199523926, "logits/rejected": -2.2259573936462402, "logps/chosen": -179.19204711914062, "logps/rejected": -178.01219177246094, "loss": 0.5825, "rewards/accuracies": 0.625, "rewards/chosen": -0.2573452591896057, "rewards/margins": 0.3134271800518036, "rewards/rejected": -0.5707724094390869, "step": 397 }, { "epoch": 0.42, "learning_rate": 3.653694850884091e-05, "logits/chosen": -2.2443690299987793, "logits/rejected": -2.3436453342437744, "logps/chosen": -141.96392822265625, "logps/rejected": -165.8162384033203, "loss": 0.6064, "rewards/accuracies": 0.625, "rewards/chosen": -0.1481434404850006, "rewards/margins": 0.25018060207366943, "rewards/rejected": -0.39832407236099243, "step": 398 }, { "epoch": 0.42, "learning_rate": 3.645566304318526e-05, "logits/chosen": -2.251343250274658, "logits/rejected": -2.2624480724334717, "logps/chosen": -199.05337524414062, "logps/rejected": -199.2899627685547, "loss": 0.6354, "rewards/accuracies": 0.625, "rewards/chosen": -0.46376729011535645, "rewards/margins": 0.15301668643951416, "rewards/rejected": -0.6167839765548706, "step": 399 }, { "epoch": 0.42, "learning_rate": 3.637422399390413e-05, "logits/chosen": -2.309321165084839, "logits/rejected": -2.249835252761841, "logps/chosen": -187.30145263671875, "logps/rejected": -175.0135498046875, "loss": 0.8022, "rewards/accuracies": 0.5, "rewards/chosen": -0.6318432688713074, "rewards/margins": -0.13741618394851685, "rewards/rejected": -0.49442705512046814, "step": 400 }, { "epoch": 0.42, "learning_rate": 3.6292632452833436e-05, "logits/chosen": -2.149308681488037, "logits/rejected": -2.1867353916168213, "logps/chosen": -157.0370635986328, "logps/rejected": -179.92384338378906, "loss": 0.6469, "rewards/accuracies": 0.5, "rewards/chosen": -0.2832821309566498, "rewards/margins": 0.17430852353572845, "rewards/rejected": -0.45759066939353943, "step": 401 }, { "epoch": 0.42, "learning_rate": 3.621088951385353e-05, "logits/chosen": -2.430102825164795, "logits/rejected": -2.4103968143463135, "logps/chosen": -174.2300262451172, "logps/rejected": -194.47872924804688, "loss": 0.6782, "rewards/accuracies": 0.625, "rewards/chosen": -0.6608787178993225, "rewards/margins": 0.14938460290431976, "rewards/rejected": -0.8102633953094482, "step": 402 }, { "epoch": 0.42, "learning_rate": 3.612899627287452e-05, "logits/chosen": -2.381316661834717, "logits/rejected": -2.4797677993774414, "logps/chosen": -183.57020568847656, "logps/rejected": -211.94908142089844, "loss": 0.7676, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6477088332176208, "rewards/margins": 0.04121372848749161, "rewards/rejected": -0.6889225840568542, "step": 403 }, { "epoch": 0.42, "learning_rate": 3.604695382782159e-05, "logits/chosen": -2.2722840309143066, "logits/rejected": -2.2684082984924316, "logps/chosen": -157.826171875, "logps/rejected": -155.97283935546875, "loss": 0.7743, "rewards/accuracies": 0.4375, "rewards/chosen": -0.18509769439697266, "rewards/margins": -0.07210510969161987, "rewards/rejected": -0.11299259960651398, "step": 404 }, { "epoch": 0.42, "learning_rate": 3.596476327862024e-05, "logits/chosen": -2.128013849258423, "logits/rejected": -2.2023983001708984, "logps/chosen": -194.61569213867188, "logps/rejected": -211.86927795410156, "loss": 0.6399, "rewards/accuracies": 0.625, "rewards/chosen": -0.36270129680633545, "rewards/margins": 0.2554909288883209, "rewards/rejected": -0.618192195892334, "step": 405 }, { "epoch": 0.42, "learning_rate": 3.588242572718162e-05, "logits/chosen": -2.377016305923462, "logits/rejected": -2.2582497596740723, "logps/chosen": -161.89572143554688, "logps/rejected": -164.3067169189453, "loss": 0.744, "rewards/accuracies": 0.5625, "rewards/chosen": -0.38204970955848694, "rewards/margins": -0.01100611686706543, "rewards/rejected": -0.3710435926914215, "step": 406 }, { "epoch": 0.42, "learning_rate": 3.579994227738767e-05, "logits/chosen": -2.208984851837158, "logits/rejected": -2.288970708847046, "logps/chosen": -191.62936401367188, "logps/rejected": -223.2491455078125, "loss": 0.6478, "rewards/accuracies": 0.5, "rewards/chosen": -0.34289631247520447, "rewards/margins": 0.17745056748390198, "rewards/rejected": -0.5203468799591064, "step": 407 }, { "epoch": 0.43, "learning_rate": 3.5717314035076355e-05, "logits/chosen": -2.2903645038604736, "logits/rejected": -2.224257469177246, "logps/chosen": -173.94061279296875, "logps/rejected": -183.39857482910156, "loss": 0.9867, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8537546396255493, "rewards/margins": -0.33834022283554077, "rewards/rejected": -0.5154143571853638, "step": 408 }, { "epoch": 0.43, "learning_rate": 3.5634542108026876e-05, "logits/chosen": -2.2586324214935303, "logits/rejected": -2.333674192428589, "logps/chosen": -128.76527404785156, "logps/rejected": -138.5326690673828, "loss": 0.6894, "rewards/accuracies": 0.5, "rewards/chosen": -0.3050040900707245, "rewards/margins": 0.024737656116485596, "rewards/rejected": -0.32974177598953247, "step": 409 }, { "epoch": 0.43, "learning_rate": 3.5551627605944745e-05, "logits/chosen": -2.2455837726593018, "logits/rejected": -2.2226006984710693, "logps/chosen": -165.0658416748047, "logps/rejected": -158.5568084716797, "loss": 0.7748, "rewards/accuracies": 0.3125, "rewards/chosen": -0.47917866706848145, "rewards/margins": -0.09431587904691696, "rewards/rejected": -0.3848627507686615, "step": 410 }, { "epoch": 0.43, "learning_rate": 3.5468571640446994e-05, "logits/chosen": -2.220954179763794, "logits/rejected": -2.2065913677215576, "logps/chosen": -155.57606506347656, "logps/rejected": -196.17453002929688, "loss": 0.681, "rewards/accuracies": 0.5625, "rewards/chosen": -0.43619242310523987, "rewards/margins": 0.16942578554153442, "rewards/rejected": -0.6056181788444519, "step": 411 }, { "epoch": 0.43, "learning_rate": 3.5385375325047166e-05, "logits/chosen": -2.28615665435791, "logits/rejected": -2.3207404613494873, "logps/chosen": -139.8089599609375, "logps/rejected": -146.37413024902344, "loss": 0.5752, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2894131541252136, "rewards/margins": 0.32500603795051575, "rewards/rejected": -0.6144192218780518, "step": 412 }, { "epoch": 0.43, "learning_rate": 3.5302039775140486e-05, "logits/chosen": -2.223402500152588, "logits/rejected": -2.2294511795043945, "logps/chosen": -192.4325714111328, "logps/rejected": -195.81764221191406, "loss": 0.6225, "rewards/accuracies": 0.5, "rewards/chosen": -0.422715961933136, "rewards/margins": 0.23224471509456635, "rewards/rejected": -0.654960572719574, "step": 413 }, { "epoch": 0.43, "learning_rate": 3.521856610798887e-05, "logits/chosen": -2.1355066299438477, "logits/rejected": -2.2011489868164062, "logps/chosen": -186.72837829589844, "logps/rejected": -198.95672607421875, "loss": 0.7121, "rewards/accuracies": 0.625, "rewards/chosen": -0.8084388971328735, "rewards/margins": 0.06887459009885788, "rewards/rejected": -0.8773134350776672, "step": 414 }, { "epoch": 0.43, "learning_rate": 3.513495544270592e-05, "logits/chosen": -2.2741241455078125, "logits/rejected": -2.2826316356658936, "logps/chosen": -167.8680877685547, "logps/rejected": -167.94290161132812, "loss": 0.7632, "rewards/accuracies": 0.375, "rewards/chosen": -0.5057553648948669, "rewards/margins": -0.06321151554584503, "rewards/rejected": -0.44254380464553833, "step": 415 }, { "epoch": 0.43, "learning_rate": 3.505120890024195e-05, "logits/chosen": -2.2100603580474854, "logits/rejected": -2.1715095043182373, "logps/chosen": -178.6398162841797, "logps/rejected": -199.203857421875, "loss": 0.7779, "rewards/accuracies": 0.5, "rewards/chosen": -0.6270021796226501, "rewards/margins": -0.08634312450885773, "rewards/rejected": -0.5406590700149536, "step": 416 }, { "epoch": 0.43, "learning_rate": 3.496732760336895e-05, "logits/chosen": -2.3388140201568604, "logits/rejected": -2.3569979667663574, "logps/chosen": -183.85336303710938, "logps/rejected": -176.68597412109375, "loss": 0.6665, "rewards/accuracies": 0.5625, "rewards/chosen": -0.38947370648384094, "rewards/margins": 0.12428087741136551, "rewards/rejected": -0.5137546062469482, "step": 417 }, { "epoch": 0.44, "learning_rate": 3.4883312676665536e-05, "logits/chosen": -2.0799946784973145, "logits/rejected": -2.127676248550415, "logps/chosen": -157.06455993652344, "logps/rejected": -196.19081115722656, "loss": 0.6533, "rewards/accuracies": 0.625, "rewards/chosen": -0.38800424337387085, "rewards/margins": 0.2187524437904358, "rewards/rejected": -0.6067566275596619, "step": 418 }, { "epoch": 0.44, "learning_rate": 3.479916524650188e-05, "logits/chosen": -2.2445905208587646, "logits/rejected": -2.2511062622070312, "logps/chosen": -191.49032592773438, "logps/rejected": -211.65402221679688, "loss": 0.6537, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4725024104118347, "rewards/margins": 0.14553888142108917, "rewards/rejected": -0.6180413365364075, "step": 419 }, { "epoch": 0.44, "learning_rate": 3.4714886441024574e-05, "logits/chosen": -2.3066306114196777, "logits/rejected": -2.295111894607544, "logps/chosen": -166.61192321777344, "logps/rejected": -183.85430908203125, "loss": 0.7243, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6095054149627686, "rewards/margins": 0.045834727585315704, "rewards/rejected": -0.6553401350975037, "step": 420 }, { "epoch": 0.44, "learning_rate": 3.4630477390141556e-05, "logits/chosen": -2.0795845985412598, "logits/rejected": -2.0988335609436035, "logps/chosen": -176.13385009765625, "logps/rejected": -162.16116333007812, "loss": 0.8325, "rewards/accuracies": 0.1875, "rewards/chosen": -0.6280168890953064, "rewards/margins": -0.21699213981628418, "rewards/rejected": -0.41102465987205505, "step": 421 }, { "epoch": 0.44, "learning_rate": 3.4545939225506934e-05, "logits/chosen": -2.2829484939575195, "logits/rejected": -2.369950294494629, "logps/chosen": -120.27825164794922, "logps/rejected": -135.86590576171875, "loss": 0.6277, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3205239772796631, "rewards/margins": 0.2128859907388687, "rewards/rejected": -0.5334099531173706, "step": 422 }, { "epoch": 0.44, "learning_rate": 3.4461273080505793e-05, "logits/chosen": -2.227790117263794, "logits/rejected": -2.3280563354492188, "logps/chosen": -166.44442749023438, "logps/rejected": -190.11199951171875, "loss": 0.658, "rewards/accuracies": 0.625, "rewards/chosen": -0.6263283491134644, "rewards/margins": 0.1760719120502472, "rewards/rejected": -0.8024002909660339, "step": 423 }, { "epoch": 0.44, "learning_rate": 3.437648009023905e-05, "logits/chosen": -2.2951772212982178, "logits/rejected": -2.2783102989196777, "logps/chosen": -168.9396209716797, "logps/rejected": -161.99826049804688, "loss": 0.7575, "rewards/accuracies": 0.3125, "rewards/chosen": -0.4612702429294586, "rewards/margins": -0.09408943355083466, "rewards/rejected": -0.36718082427978516, "step": 424 }, { "epoch": 0.44, "learning_rate": 3.4291561391508185e-05, "logits/chosen": -2.233304023742676, "logits/rejected": -2.1193814277648926, "logps/chosen": -191.5747833251953, "logps/rejected": -198.6702423095703, "loss": 0.6742, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6324459910392761, "rewards/margins": 0.13748225569725037, "rewards/rejected": -0.7699282169342041, "step": 425 }, { "epoch": 0.44, "learning_rate": 3.420651812280006e-05, "logits/chosen": -2.0054640769958496, "logits/rejected": -2.0682601928710938, "logps/chosen": -179.95880126953125, "logps/rejected": -178.59805297851562, "loss": 0.7556, "rewards/accuracies": 0.5, "rewards/chosen": -0.5854605436325073, "rewards/margins": -0.06146989390254021, "rewards/rejected": -0.5239906311035156, "step": 426 }, { "epoch": 0.45, "learning_rate": 3.4121351424271594e-05, "logits/chosen": -2.220736026763916, "logits/rejected": -2.229501247406006, "logps/chosen": -161.79852294921875, "logps/rejected": -150.96261596679688, "loss": 0.634, "rewards/accuracies": 0.625, "rewards/chosen": -0.5760630369186401, "rewards/margins": 0.19034941494464874, "rewards/rejected": -0.7664124965667725, "step": 427 }, { "epoch": 0.45, "learning_rate": 3.4036062437734484e-05, "logits/chosen": -2.084941864013672, "logits/rejected": -2.1283769607543945, "logps/chosen": -138.01251220703125, "logps/rejected": -141.75653076171875, "loss": 0.6951, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5684069991111755, "rewards/margins": 0.06860056519508362, "rewards/rejected": -0.6370075345039368, "step": 428 }, { "epoch": 0.45, "learning_rate": 3.395065230663996e-05, "logits/chosen": -2.356782913208008, "logits/rejected": -2.3323380947113037, "logps/chosen": -164.42636108398438, "logps/rejected": -157.51885986328125, "loss": 0.8111, "rewards/accuracies": 0.3125, "rewards/chosen": -0.5730559825897217, "rewards/margins": -0.158206045627594, "rewards/rejected": -0.4148499667644501, "step": 429 }, { "epoch": 0.45, "learning_rate": 3.386512217606339e-05, "logits/chosen": -2.306445837020874, "logits/rejected": -2.305457353591919, "logps/chosen": -177.36483764648438, "logps/rejected": -180.41497802734375, "loss": 0.7929, "rewards/accuracies": 0.5, "rewards/chosen": -0.6710464358329773, "rewards/margins": -0.10232071578502655, "rewards/rejected": -0.5687257647514343, "step": 430 }, { "epoch": 0.45, "learning_rate": 3.3779473192688954e-05, "logits/chosen": -2.2322001457214355, "logits/rejected": -2.2678284645080566, "logps/chosen": -180.5767822265625, "logps/rejected": -215.00439453125, "loss": 0.6272, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7634757161140442, "rewards/margins": 0.3175090253353119, "rewards/rejected": -1.0809847116470337, "step": 431 }, { "epoch": 0.45, "learning_rate": 3.369370650479425e-05, "logits/chosen": -2.3506946563720703, "logits/rejected": -2.272690534591675, "logps/chosen": -191.31764221191406, "logps/rejected": -167.90931701660156, "loss": 0.6944, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6492268443107605, "rewards/margins": 0.056166499853134155, "rewards/rejected": -0.7053933143615723, "step": 432 }, { "epoch": 0.45, "learning_rate": 3.360782326223493e-05, "logits/chosen": -2.21726131439209, "logits/rejected": -2.1750893592834473, "logps/chosen": -130.21981811523438, "logps/rejected": -122.38761138916016, "loss": 0.7093, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6112926006317139, "rewards/margins": 0.05839107558131218, "rewards/rejected": -0.6696836948394775, "step": 433 }, { "epoch": 0.45, "learning_rate": 3.3521824616429285e-05, "logits/chosen": -2.276099681854248, "logits/rejected": -2.3207895755767822, "logps/chosen": -152.9349822998047, "logps/rejected": -176.16763305664062, "loss": 0.6335, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6175845861434937, "rewards/margins": 0.19425898790359497, "rewards/rejected": -0.8118435740470886, "step": 434 }, { "epoch": 0.45, "learning_rate": 3.3435711720342764e-05, "logits/chosen": -2.3540244102478027, "logits/rejected": -2.4207704067230225, "logps/chosen": -162.0496063232422, "logps/rejected": -180.61257934570312, "loss": 0.6201, "rewards/accuracies": 0.625, "rewards/chosen": -0.7302818298339844, "rewards/margins": 0.24378572404384613, "rewards/rejected": -0.9740675687789917, "step": 435 }, { "epoch": 0.45, "learning_rate": 3.3349485728472535e-05, "logits/chosen": -2.2981767654418945, "logits/rejected": -2.403442144393921, "logps/chosen": -169.84153747558594, "logps/rejected": -196.2303466796875, "loss": 0.6202, "rewards/accuracies": 0.75, "rewards/chosen": -0.631123960018158, "rewards/margins": 0.2665478587150574, "rewards/rejected": -0.8976718187332153, "step": 436 }, { "epoch": 0.46, "learning_rate": 3.326314779683207e-05, "logits/chosen": -2.45729923248291, "logits/rejected": -2.3062028884887695, "logps/chosen": -180.6869354248047, "logps/rejected": -158.7431640625, "loss": 0.8641, "rewards/accuracies": 0.375, "rewards/chosen": -0.7027081251144409, "rewards/margins": -0.22353459894657135, "rewards/rejected": -0.47917354106903076, "step": 437 }, { "epoch": 0.46, "learning_rate": 3.3176699082935545e-05, "logits/chosen": -2.310640811920166, "logits/rejected": -2.3036937713623047, "logps/chosen": -186.8025665283203, "logps/rejected": -198.4528045654297, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": -0.8213132619857788, "rewards/margins": 0.07846779376268387, "rewards/rejected": -0.8997809886932373, "step": 438 }, { "epoch": 0.46, "learning_rate": 3.3090140745782396e-05, "logits/chosen": -2.3146181106567383, "logits/rejected": -2.318394660949707, "logps/chosen": -207.11068725585938, "logps/rejected": -205.5038604736328, "loss": 0.6093, "rewards/accuracies": 0.625, "rewards/chosen": -0.6563000679016113, "rewards/margins": 0.33885622024536133, "rewards/rejected": -0.9951564073562622, "step": 439 }, { "epoch": 0.46, "learning_rate": 3.300347394584172e-05, "logits/chosen": -2.4188132286071777, "logits/rejected": -2.4725587368011475, "logps/chosen": -156.28175354003906, "logps/rejected": -178.71530151367188, "loss": 0.7442, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6938459277153015, "rewards/margins": 0.030388107523322105, "rewards/rejected": -0.7242341041564941, "step": 440 }, { "epoch": 0.46, "learning_rate": 3.2916699845036816e-05, "logits/chosen": -2.326338768005371, "logits/rejected": -2.3983230590820312, "logps/chosen": -116.61759948730469, "logps/rejected": -127.69525909423828, "loss": 0.6664, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6837319135665894, "rewards/margins": 0.17259922623634338, "rewards/rejected": -0.8563311696052551, "step": 441 }, { "epoch": 0.46, "learning_rate": 3.282981960672948e-05, "logits/chosen": -2.239466905593872, "logits/rejected": -2.3111233711242676, "logps/chosen": -159.78004455566406, "logps/rejected": -177.1030731201172, "loss": 0.6711, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7229784727096558, "rewards/margins": 0.17383922636508942, "rewards/rejected": -0.896817684173584, "step": 442 }, { "epoch": 0.46, "learning_rate": 3.2742834395704486e-05, "logits/chosen": -2.203927516937256, "logits/rejected": -2.1760244369506836, "logps/chosen": -133.5518341064453, "logps/rejected": -157.52099609375, "loss": 0.7027, "rewards/accuracies": 0.5, "rewards/chosen": -0.71323561668396, "rewards/margins": 0.07185419648885727, "rewards/rejected": -0.7850897908210754, "step": 443 }, { "epoch": 0.46, "learning_rate": 3.265574537815398e-05, "logits/chosen": -2.1297600269317627, "logits/rejected": -2.0856151580810547, "logps/chosen": -149.8739776611328, "logps/rejected": -155.45904541015625, "loss": 0.7246, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7274818420410156, "rewards/margins": 0.02891545556485653, "rewards/rejected": -0.7563972473144531, "step": 444 }, { "epoch": 0.46, "learning_rate": 3.25685537216618e-05, "logits/chosen": -2.446577787399292, "logits/rejected": -2.4465479850769043, "logps/chosen": -205.2270050048828, "logps/rejected": -194.4155731201172, "loss": 0.7937, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9242798686027527, "rewards/margins": -0.04263466224074364, "rewards/rejected": -0.8816452622413635, "step": 445 }, { "epoch": 0.47, "learning_rate": 3.248126059518785e-05, "logits/chosen": -2.11894154548645, "logits/rejected": -2.1701767444610596, "logps/chosen": -146.13601684570312, "logps/rejected": -163.9047088623047, "loss": 0.6358, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5888944864273071, "rewards/margins": 0.164439395070076, "rewards/rejected": -0.7533338665962219, "step": 446 }, { "epoch": 0.47, "learning_rate": 3.2393867169052385e-05, "logits/chosen": -2.250922203063965, "logits/rejected": -2.2642922401428223, "logps/chosen": -222.06298828125, "logps/rejected": -232.71881103515625, "loss": 0.8317, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9078823924064636, "rewards/margins": -0.15829764306545258, "rewards/rejected": -0.7495847344398499, "step": 447 }, { "epoch": 0.47, "learning_rate": 3.230637461492043e-05, "logits/chosen": -2.254838228225708, "logits/rejected": -2.2903223037719727, "logps/chosen": -182.53579711914062, "logps/rejected": -193.14736938476562, "loss": 0.6348, "rewards/accuracies": 0.6875, "rewards/chosen": -0.626493513584137, "rewards/margins": 0.2684532105922699, "rewards/rejected": -0.8949467539787292, "step": 448 }, { "epoch": 0.47, "learning_rate": 3.221878410578593e-05, "logits/chosen": -2.258246898651123, "logits/rejected": -2.1956984996795654, "logps/chosen": -210.88497924804688, "logps/rejected": -204.88185119628906, "loss": 0.8482, "rewards/accuracies": 0.375, "rewards/chosen": -0.5795395374298096, "rewards/margins": -0.15963196754455566, "rewards/rejected": -0.4199075698852539, "step": 449 }, { "epoch": 0.47, "learning_rate": 3.213109681595612e-05, "logits/chosen": -2.3528032302856445, "logits/rejected": -2.296219825744629, "logps/chosen": -207.69216918945312, "logps/rejected": -225.57296752929688, "loss": 0.7991, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9065631031990051, "rewards/margins": -0.03170624002814293, "rewards/rejected": -0.8748568892478943, "step": 450 }, { "epoch": 0.47, "learning_rate": 3.2043313921035743e-05, "logits/chosen": -2.1952693462371826, "logits/rejected": -2.2114603519439697, "logps/chosen": -204.89718627929688, "logps/rejected": -201.62734985351562, "loss": 0.9834, "rewards/accuracies": 0.3125, "rewards/chosen": -1.2711020708084106, "rewards/margins": -0.4141416847705841, "rewards/rejected": -0.8569603562355042, "step": 451 }, { "epoch": 0.47, "learning_rate": 3.195543659791132e-05, "logits/chosen": -2.1842381954193115, "logits/rejected": -2.2123162746429443, "logps/chosen": -169.44754028320312, "logps/rejected": -167.3434295654297, "loss": 0.64, "rewards/accuracies": 0.5, "rewards/chosen": -0.6639611124992371, "rewards/margins": 0.15555183589458466, "rewards/rejected": -0.8195129632949829, "step": 452 }, { "epoch": 0.47, "learning_rate": 3.186746602473533e-05, "logits/chosen": -2.1493401527404785, "logits/rejected": -2.1739141941070557, "logps/chosen": -149.2794189453125, "logps/rejected": -154.29624938964844, "loss": 0.8507, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6466760039329529, "rewards/margins": -0.20036213099956512, "rewards/rejected": -0.44631391763687134, "step": 453 }, { "epoch": 0.47, "learning_rate": 3.177940338091043e-05, "logits/chosen": -2.2300286293029785, "logits/rejected": -2.3122761249542236, "logps/chosen": -191.32516479492188, "logps/rejected": -194.84938049316406, "loss": 0.7401, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7524707317352295, "rewards/margins": 0.010163038969039917, "rewards/rejected": -0.7626338005065918, "step": 454 }, { "epoch": 0.47, "learning_rate": 3.169124984707367e-05, "logits/chosen": -2.2368390560150146, "logits/rejected": -2.29437255859375, "logps/chosen": -163.08969116210938, "logps/rejected": -171.7424774169922, "loss": 0.8942, "rewards/accuracies": 0.375, "rewards/chosen": -0.9385874271392822, "rewards/margins": -0.3221665024757385, "rewards/rejected": -0.6164208650588989, "step": 455 }, { "epoch": 0.48, "learning_rate": 3.160300660508064e-05, "logits/chosen": -2.2047293186187744, "logits/rejected": -2.1667182445526123, "logps/chosen": -156.24505615234375, "logps/rejected": -150.25613403320312, "loss": 0.7661, "rewards/accuracies": 0.5, "rewards/chosen": -0.5315223932266235, "rewards/margins": -0.031079813838005066, "rewards/rejected": -0.500442624092102, "step": 456 }, { "epoch": 0.48, "learning_rate": 3.151467483798961e-05, "logits/chosen": -2.2086293697357178, "logits/rejected": -2.196566581726074, "logps/chosen": -163.7198944091797, "logps/rejected": -154.07669067382812, "loss": 0.7642, "rewards/accuracies": 0.5, "rewards/chosen": -0.7637531757354736, "rewards/margins": -0.03873248025774956, "rewards/rejected": -0.7250206470489502, "step": 457 }, { "epoch": 0.48, "learning_rate": 3.14262557300457e-05, "logits/chosen": -2.1346004009246826, "logits/rejected": -2.2624478340148926, "logps/chosen": -157.80322265625, "logps/rejected": -193.9817657470703, "loss": 0.6025, "rewards/accuracies": 0.625, "rewards/chosen": -0.5126041173934937, "rewards/margins": 0.3903144598007202, "rewards/rejected": -0.9029185175895691, "step": 458 }, { "epoch": 0.48, "learning_rate": 3.1337750466665e-05, "logits/chosen": -2.120087146759033, "logits/rejected": -2.164226770401001, "logps/chosen": -189.44192504882812, "logps/rejected": -220.5596466064453, "loss": 0.7477, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8327434659004211, "rewards/margins": -0.02192605845630169, "rewards/rejected": -0.8108173608779907, "step": 459 }, { "epoch": 0.48, "learning_rate": 3.124916023441865e-05, "logits/chosen": -2.2006072998046875, "logits/rejected": -2.1659958362579346, "logps/chosen": -182.32632446289062, "logps/rejected": -194.20724487304688, "loss": 0.8611, "rewards/accuracies": 0.375, "rewards/chosen": -0.8323721885681152, "rewards/margins": -0.26599258184432983, "rewards/rejected": -0.5663796067237854, "step": 460 }, { "epoch": 0.48, "learning_rate": 3.116048622101694e-05, "logits/chosen": -2.143481969833374, "logits/rejected": -2.1845016479492188, "logps/chosen": -165.87046813964844, "logps/rejected": -171.90936279296875, "loss": 0.7098, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8616752028465271, "rewards/margins": 0.11464538425207138, "rewards/rejected": -0.9763206243515015, "step": 461 }, { "epoch": 0.48, "learning_rate": 3.107172961529343e-05, "logits/chosen": -2.1274116039276123, "logits/rejected": -2.162541389465332, "logps/chosen": -158.4412841796875, "logps/rejected": -173.54653930664062, "loss": 0.7462, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7143791317939758, "rewards/margins": 0.004445172846317291, "rewards/rejected": -0.7188242673873901, "step": 462 }, { "epoch": 0.48, "learning_rate": 3.098289160718895e-05, "logits/chosen": -2.1465463638305664, "logits/rejected": -2.1098814010620117, "logps/chosen": -130.60450744628906, "logps/rejected": -149.80252075195312, "loss": 0.6369, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4852756857872009, "rewards/margins": 0.2071731984615326, "rewards/rejected": -0.6924489140510559, "step": 463 }, { "epoch": 0.48, "learning_rate": 3.0893973387735687e-05, "logits/chosen": -2.323080539703369, "logits/rejected": -2.2071361541748047, "logps/chosen": -180.71392822265625, "logps/rejected": -165.3758544921875, "loss": 0.9189, "rewards/accuracies": 0.375, "rewards/chosen": -0.8797387480735779, "rewards/margins": -0.3374296724796295, "rewards/rejected": -0.5423091053962708, "step": 464 }, { "epoch": 0.49, "learning_rate": 3.0804976149041195e-05, "logits/chosen": -2.3689966201782227, "logits/rejected": -2.432495355606079, "logps/chosen": -183.48805236816406, "logps/rejected": -180.59786987304688, "loss": 0.6557, "rewards/accuracies": 0.625, "rewards/chosen": -0.7070946097373962, "rewards/margins": 0.12962420284748077, "rewards/rejected": -0.836718738079071, "step": 465 }, { "epoch": 0.49, "learning_rate": 3.071590108427244e-05, "logits/chosen": -2.2095448970794678, "logits/rejected": -2.22792387008667, "logps/chosen": -194.24359130859375, "logps/rejected": -181.46434020996094, "loss": 0.5084, "rewards/accuracies": 0.8125, "rewards/chosen": -0.46445751190185547, "rewards/margins": 0.5187379121780396, "rewards/rejected": -0.9831954836845398, "step": 466 }, { "epoch": 0.49, "learning_rate": 3.062674938763976e-05, "logits/chosen": -2.224792718887329, "logits/rejected": -2.276299476623535, "logps/chosen": -151.77529907226562, "logps/rejected": -171.2163543701172, "loss": 0.589, "rewards/accuracies": 0.625, "rewards/chosen": -0.49393290281295776, "rewards/margins": 0.3881426751613617, "rewards/rejected": -0.8820755481719971, "step": 467 }, { "epoch": 0.49, "learning_rate": 3.0537522254380905e-05, "logits/chosen": -2.327399730682373, "logits/rejected": -2.2717721462249756, "logps/chosen": -178.17420959472656, "logps/rejected": -183.68106079101562, "loss": 0.7317, "rewards/accuracies": 0.5, "rewards/chosen": -0.3572332262992859, "rewards/margins": 0.004655532538890839, "rewards/rejected": -0.3618887662887573, "step": 468 }, { "epoch": 0.49, "learning_rate": 3.044822088074496e-05, "logits/chosen": -2.150599479675293, "logits/rejected": -2.1766562461853027, "logps/chosen": -150.70323181152344, "logps/rejected": -175.47964477539062, "loss": 0.6856, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5796679258346558, "rewards/margins": 0.14709994196891785, "rewards/rejected": -0.726767897605896, "step": 469 }, { "epoch": 0.49, "learning_rate": 3.0358846463976372e-05, "logits/chosen": -2.2366018295288086, "logits/rejected": -2.338874578475952, "logps/chosen": -192.69740295410156, "logps/rejected": -190.33204650878906, "loss": 0.6836, "rewards/accuracies": 0.4375, "rewards/chosen": -0.48144611716270447, "rewards/margins": 0.07051944732666016, "rewards/rejected": -0.551965594291687, "step": 470 }, { "epoch": 0.49, "learning_rate": 3.026940020229882e-05, "logits/chosen": -2.133188247680664, "logits/rejected": -2.177133798599243, "logps/chosen": -150.59495544433594, "logps/rejected": -149.1016845703125, "loss": 0.818, "rewards/accuracies": 0.375, "rewards/chosen": -0.7209250926971436, "rewards/margins": -0.17671580612659454, "rewards/rejected": -0.5442093014717102, "step": 471 }, { "epoch": 0.49, "learning_rate": 3.017988329489923e-05, "logits/chosen": -2.2492454051971436, "logits/rejected": -2.2075250148773193, "logps/chosen": -218.95291137695312, "logps/rejected": -213.46139526367188, "loss": 0.7839, "rewards/accuracies": 0.625, "rewards/chosen": -0.6977720856666565, "rewards/margins": -0.08655133843421936, "rewards/rejected": -0.6112207770347595, "step": 472 }, { "epoch": 0.49, "learning_rate": 3.0090296941911633e-05, "logits/chosen": -2.1852970123291016, "logits/rejected": -2.1652181148529053, "logps/chosen": -196.5089874267578, "logps/rejected": -201.77569580078125, "loss": 0.7244, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5932771563529968, "rewards/margins": -0.0026064813137054443, "rewards/rejected": -0.5906707644462585, "step": 473 }, { "epoch": 0.49, "learning_rate": 3.0000642344401113e-05, "logits/chosen": -2.115180015563965, "logits/rejected": -2.0559911727905273, "logps/chosen": -157.2303924560547, "logps/rejected": -145.6020050048828, "loss": 0.7223, "rewards/accuracies": 0.4375, "rewards/chosen": -0.48168429732322693, "rewards/margins": 0.11891864240169525, "rewards/rejected": -0.600602924823761, "step": 474 }, { "epoch": 0.5, "learning_rate": 2.9910920704347696e-05, "logits/chosen": -2.387964963912964, "logits/rejected": -2.433955669403076, "logps/chosen": -245.86285400390625, "logps/rejected": -259.4566955566406, "loss": 0.7292, "rewards/accuracies": 0.5, "rewards/chosen": -0.6999150514602661, "rewards/margins": 0.08391554653644562, "rewards/rejected": -0.7838307023048401, "step": 475 }, { "epoch": 0.5, "learning_rate": 2.9821133224630226e-05, "logits/chosen": -2.1827383041381836, "logits/rejected": -2.2108314037323, "logps/chosen": -172.43350219726562, "logps/rejected": -167.54298400878906, "loss": 0.6998, "rewards/accuracies": 0.5, "rewards/chosen": -0.40625911951065063, "rewards/margins": 0.13796135783195496, "rewards/rejected": -0.5442204475402832, "step": 476 }, { "epoch": 0.5, "learning_rate": 2.9731281109010256e-05, "logits/chosen": -2.393608331680298, "logits/rejected": -2.4628074169158936, "logps/chosen": -155.9365234375, "logps/rejected": -150.9811248779297, "loss": 0.6863, "rewards/accuracies": 0.5, "rewards/chosen": -0.4619404375553131, "rewards/margins": 0.08013444393873215, "rewards/rejected": -0.5420749187469482, "step": 477 }, { "epoch": 0.5, "learning_rate": 2.9641365562115887e-05, "logits/chosen": -2.1305439472198486, "logits/rejected": -2.158849000930786, "logps/chosen": -157.4604034423828, "logps/rejected": -158.77505493164062, "loss": 0.6796, "rewards/accuracies": 0.5, "rewards/chosen": -0.5900415182113647, "rewards/margins": 0.08278737962245941, "rewards/rejected": -0.6728289127349854, "step": 478 }, { "epoch": 0.5, "learning_rate": 2.9551387789425638e-05, "logits/chosen": -2.111013412475586, "logits/rejected": -2.1469898223876953, "logps/chosen": -177.7059326171875, "logps/rejected": -199.083251953125, "loss": 0.6744, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6065003871917725, "rewards/margins": 0.17838376760482788, "rewards/rejected": -0.7848842144012451, "step": 479 }, { "epoch": 0.5, "learning_rate": 2.9461348997252265e-05, "logits/chosen": -2.2793450355529785, "logits/rejected": -2.2518503665924072, "logps/chosen": -167.08595275878906, "logps/rejected": -162.74386596679688, "loss": 0.6446, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5793865323066711, "rewards/margins": 0.19307895004749298, "rewards/rejected": -0.7724654674530029, "step": 480 }, { "epoch": 0.5, "learning_rate": 2.9371250392726614e-05, "logits/chosen": -2.156540632247925, "logits/rejected": -2.1982791423797607, "logps/chosen": -232.06939697265625, "logps/rejected": -225.7415771484375, "loss": 0.6585, "rewards/accuracies": 0.625, "rewards/chosen": -0.6940824389457703, "rewards/margins": 0.11553283035755157, "rewards/rejected": -0.8096152544021606, "step": 481 }, { "epoch": 0.5, "learning_rate": 2.9281093183781403e-05, "logits/chosen": -2.0882251262664795, "logits/rejected": -2.2663707733154297, "logps/chosen": -130.05990600585938, "logps/rejected": -188.05630493164062, "loss": 0.6944, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4038165509700775, "rewards/margins": 0.06018731743097305, "rewards/rejected": -0.46400386095046997, "step": 482 }, { "epoch": 0.5, "learning_rate": 2.919087857913508e-05, "logits/chosen": -2.3520162105560303, "logits/rejected": -2.321183443069458, "logps/chosen": -182.3740997314453, "logps/rejected": -178.23336791992188, "loss": 0.6351, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5323800444602966, "rewards/margins": 0.18767206370830536, "rewards/rejected": -0.7200521230697632, "step": 483 }, { "epoch": 0.5, "learning_rate": 2.9100607788275545e-05, "logits/chosen": -2.1776552200317383, "logits/rejected": -2.2282662391662598, "logps/chosen": -163.6830596923828, "logps/rejected": -172.34671020507812, "loss": 0.7805, "rewards/accuracies": 0.25, "rewards/chosen": -0.6728564500808716, "rewards/margins": -0.11494327336549759, "rewards/rejected": -0.5579131245613098, "step": 484 }, { "epoch": 0.51, "learning_rate": 2.9010282021444008e-05, "logits/chosen": -2.239274501800537, "logits/rejected": -2.17651104927063, "logps/chosen": -174.9864044189453, "logps/rejected": -169.44493103027344, "loss": 0.8076, "rewards/accuracies": 0.5, "rewards/chosen": -0.5585433840751648, "rewards/margins": -0.09774555265903473, "rewards/rejected": -0.4607977867126465, "step": 485 }, { "epoch": 0.51, "learning_rate": 2.891990248961871e-05, "logits/chosen": -2.1217386722564697, "logits/rejected": -2.1039137840270996, "logps/chosen": -159.67498779296875, "logps/rejected": -174.40069580078125, "loss": 0.6205, "rewards/accuracies": 0.5, "rewards/chosen": -0.24506211280822754, "rewards/margins": 0.40262073278427124, "rewards/rejected": -0.647682785987854, "step": 486 }, { "epoch": 0.51, "learning_rate": 2.8829470404498697e-05, "logits/chosen": -2.1323282718658447, "logits/rejected": -2.1301045417785645, "logps/chosen": -129.35870361328125, "logps/rejected": -170.45484924316406, "loss": 0.6848, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4312661588191986, "rewards/margins": 0.12291737645864487, "rewards/rejected": -0.5541835427284241, "step": 487 }, { "epoch": 0.51, "learning_rate": 2.8738986978487625e-05, "logits/chosen": -2.2189228534698486, "logits/rejected": -2.1614956855773926, "logps/chosen": -193.06204223632812, "logps/rejected": -182.66293334960938, "loss": 0.7176, "rewards/accuracies": 0.625, "rewards/chosen": -0.6811609864234924, "rewards/margins": 0.08441457152366638, "rewards/rejected": -0.7655755877494812, "step": 488 }, { "epoch": 0.51, "learning_rate": 2.8648453424677434e-05, "logits/chosen": -2.2789225578308105, "logits/rejected": -2.3813822269439697, "logps/chosen": -168.468017578125, "logps/rejected": -183.34982299804688, "loss": 0.655, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5851632952690125, "rewards/margins": 0.17568376660346985, "rewards/rejected": -0.7608469724655151, "step": 489 }, { "epoch": 0.51, "learning_rate": 2.8557870956832132e-05, "logits/chosen": -2.264902114868164, "logits/rejected": -2.2514560222625732, "logps/chosen": -174.7198486328125, "logps/rejected": -179.86676025390625, "loss": 0.6624, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6332632303237915, "rewards/margins": 0.08560072630643845, "rewards/rejected": -0.7188639640808105, "step": 490 }, { "epoch": 0.51, "learning_rate": 2.846724078937149e-05, "logits/chosen": -2.1317250728607178, "logits/rejected": -2.1464059352874756, "logps/chosen": -174.97686767578125, "logps/rejected": -182.57919311523438, "loss": 0.6618, "rewards/accuracies": 0.625, "rewards/chosen": -0.6465609073638916, "rewards/margins": 0.17847394943237305, "rewards/rejected": -0.8250348567962646, "step": 491 }, { "epoch": 0.51, "learning_rate": 2.8376564137354795e-05, "logits/chosen": -2.1236746311187744, "logits/rejected": -2.148552894592285, "logps/chosen": -156.05751037597656, "logps/rejected": -151.4677734375, "loss": 0.773, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5343315601348877, "rewards/margins": -0.059497520327568054, "rewards/rejected": -0.47483405470848083, "step": 492 }, { "epoch": 0.51, "learning_rate": 2.8285842216464543e-05, "logits/chosen": -2.2011935710906982, "logits/rejected": -2.3106913566589355, "logps/chosen": -183.11766052246094, "logps/rejected": -201.7404022216797, "loss": 0.5969, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5661877989768982, "rewards/margins": 0.39767351746559143, "rewards/rejected": -0.9638612866401672, "step": 493 }, { "epoch": 0.52, "learning_rate": 2.8195076242990122e-05, "logits/chosen": -2.245713472366333, "logits/rejected": -2.243020534515381, "logps/chosen": -159.26397705078125, "logps/rejected": -174.874267578125, "loss": 0.8165, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6563709378242493, "rewards/margins": -0.1701582670211792, "rewards/rejected": -0.48621270060539246, "step": 494 }, { "epoch": 0.52, "learning_rate": 2.8104267433811533e-05, "logits/chosen": -2.1842641830444336, "logits/rejected": -2.1591455936431885, "logps/chosen": -121.25286102294922, "logps/rejected": -115.32366180419922, "loss": 0.6663, "rewards/accuracies": 0.625, "rewards/chosen": -0.4449831545352936, "rewards/margins": 0.13848333060741425, "rewards/rejected": -0.5834664702415466, "step": 495 }, { "epoch": 0.52, "learning_rate": 2.8013417006383076e-05, "logits/chosen": -2.1810221672058105, "logits/rejected": -2.2239696979522705, "logps/chosen": -151.152099609375, "logps/rejected": -176.65977478027344, "loss": 0.601, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4074276387691498, "rewards/margins": 0.25585755705833435, "rewards/rejected": -0.6632851958274841, "step": 496 }, { "epoch": 0.52, "learning_rate": 2.7922526178717017e-05, "logits/chosen": -2.1347427368164062, "logits/rejected": -2.1655385494232178, "logps/chosen": -159.8424072265625, "logps/rejected": -178.92288208007812, "loss": 0.6182, "rewards/accuracies": 0.625, "rewards/chosen": -0.48965874314308167, "rewards/margins": 0.2526865601539612, "rewards/rejected": -0.7423452734947205, "step": 497 }, { "epoch": 0.52, "learning_rate": 2.783159616936723e-05, "logits/chosen": -2.141169309616089, "logits/rejected": -2.141371726989746, "logps/chosen": -158.35968017578125, "logps/rejected": -176.37464904785156, "loss": 0.642, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5580905079841614, "rewards/margins": 0.24599358439445496, "rewards/rejected": -0.8040841221809387, "step": 498 }, { "epoch": 0.52, "learning_rate": 2.774062819741293e-05, "logits/chosen": -2.250638246536255, "logits/rejected": -2.1852548122406006, "logps/chosen": -165.15774536132812, "logps/rejected": -180.04124450683594, "loss": 0.6831, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4352782368659973, "rewards/margins": 0.0841899961233139, "rewards/rejected": -0.5194682478904724, "step": 499 }, { "epoch": 0.52, "learning_rate": 2.764962348244228e-05, "logits/chosen": -2.187967538833618, "logits/rejected": -2.1378700733184814, "logps/chosen": -179.86184692382812, "logps/rejected": -174.86183166503906, "loss": 0.7768, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8409562110900879, "rewards/margins": -0.06990113109350204, "rewards/rejected": -0.7710551619529724, "step": 500 } ], "logging_steps": 1, "max_steps": 958, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }