{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9981298423724285, "eval_steps": 100, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021373230029388193, "grad_norm": 5.9665925980173595, "learning_rate": 1.7021276595744678e-08, "logits/chosen": -1.1223822832107544, "logits/rejected": -1.1192574501037598, "logps/chosen": -257.6567687988281, "logps/rejected": -268.0864562988281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.004274646005877639, "grad_norm": 5.827849982093965, "learning_rate": 3.4042553191489356e-08, "logits/chosen": -1.1125223636627197, "logits/rejected": -1.0834215879440308, "logps/chosen": -333.40960693359375, "logps/rejected": -285.8963928222656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.006411969008816457, "grad_norm": 6.028572669421686, "learning_rate": 5.106382978723404e-08, "logits/chosen": -0.9786796569824219, "logits/rejected": -0.9414944052696228, "logps/chosen": -242.9634552001953, "logps/rejected": -243.12557983398438, "loss": 0.6936, "rewards/accuracies": 0.625, "rewards/chosen": 0.00023853778839111328, "rewards/margins": 7.291347719728947e-05, "rewards/rejected": 0.0001656246604397893, "step": 3 }, { "epoch": 0.008549292011755277, "grad_norm": 5.496288011485378, "learning_rate": 6.808510638297871e-08, "logits/chosen": -1.1184200048446655, "logits/rejected": -1.1394802331924438, "logps/chosen": -210.64151000976562, "logps/rejected": -217.98028564453125, "loss": 0.6934, "rewards/accuracies": 0.3125, "rewards/chosen": -0.001957583473995328, "rewards/margins": -0.0010110187577083707, "rewards/rejected": -0.0009465646580792964, "step": 4 }, { "epoch": 0.010686615014694095, "grad_norm": 6.1423037995058385, "learning_rate": 8.51063829787234e-08, "logits/chosen": -1.1924649477005005, "logits/rejected": -1.2157042026519775, "logps/chosen": -229.38040161132812, "logps/rejected": -228.48341369628906, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": -0.0004083870444446802, "rewards/margins": 0.002144141122698784, "rewards/rejected": -0.002552528167143464, "step": 5 }, { "epoch": 0.012823938017632914, "grad_norm": 6.659651187507062, "learning_rate": 1.0212765957446807e-07, "logits/chosen": -1.152483344078064, "logits/rejected": -1.0924268960952759, "logps/chosen": -317.67449951171875, "logps/rejected": -295.94219970703125, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.001339879003353417, "rewards/margins": -0.00023622988373972476, "rewards/rejected": -0.0011036490323022008, "step": 6 }, { "epoch": 0.014961261020571734, "grad_norm": 7.17635843844568, "learning_rate": 1.1914893617021275e-07, "logits/chosen": -1.2364102602005005, "logits/rejected": -1.1785768270492554, "logps/chosen": -242.46392822265625, "logps/rejected": -235.65455627441406, "loss": 0.6926, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0010340880835428834, "rewards/margins": 0.0029830788262188435, "rewards/rejected": -0.004017166793346405, "step": 7 }, { "epoch": 0.017098584023510555, "grad_norm": 5.718849974150984, "learning_rate": 1.3617021276595742e-07, "logits/chosen": -1.1600620746612549, "logits/rejected": -1.1021112203598022, "logps/chosen": -253.3247528076172, "logps/rejected": -266.04168701171875, "loss": 0.6926, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0006916046258993447, "rewards/margins": 0.0022089341655373573, "rewards/rejected": -0.0015173291321843863, "step": 8 }, { "epoch": 0.01923590702644937, "grad_norm": 5.084147578805341, "learning_rate": 1.531914893617021e-07, "logits/chosen": -1.079232096672058, "logits/rejected": -1.0460667610168457, "logps/chosen": -286.5476379394531, "logps/rejected": -294.2388610839844, "loss": 0.6935, "rewards/accuracies": 0.625, "rewards/chosen": 0.000238885753788054, "rewards/margins": 0.0006698797806166112, "rewards/rejected": -0.00043099402682855725, "step": 9 }, { "epoch": 0.02137323002938819, "grad_norm": 6.119066349537701, "learning_rate": 1.702127659574468e-07, "logits/chosen": -1.3416539430618286, "logits/rejected": -1.3090091943740845, "logps/chosen": -269.2237548828125, "logps/rejected": -290.6747741699219, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": -0.0022063730284571648, "rewards/margins": -0.00022077548783272505, "rewards/rejected": -0.001985597424209118, "step": 10 }, { "epoch": 0.02351055303232701, "grad_norm": 6.296522722355036, "learning_rate": 1.872340425531915e-07, "logits/chosen": -1.1501847505569458, "logits/rejected": -1.1223665475845337, "logps/chosen": -268.7527160644531, "logps/rejected": -282.72100830078125, "loss": 0.6927, "rewards/accuracies": 0.75, "rewards/chosen": 0.0022443102207034826, "rewards/margins": 0.0018837449606508017, "rewards/rejected": 0.0003605652309488505, "step": 11 }, { "epoch": 0.02564787603526583, "grad_norm": 4.946921694352282, "learning_rate": 2.0425531914893615e-07, "logits/chosen": -1.0480014085769653, "logits/rejected": -1.0393519401550293, "logps/chosen": -236.5433349609375, "logps/rejected": -217.593017578125, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0006607104442082345, "rewards/margins": -0.000438790419138968, "rewards/rejected": -0.00022191996686160564, "step": 12 }, { "epoch": 0.027785199038204648, "grad_norm": 5.73018010615819, "learning_rate": 2.2127659574468084e-07, "logits/chosen": -1.0228816270828247, "logits/rejected": -0.9232650995254517, "logps/chosen": -265.681884765625, "logps/rejected": -292.8697509765625, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.004240307491272688, "rewards/margins": -0.001113958191126585, "rewards/rejected": -0.003126349300146103, "step": 13 }, { "epoch": 0.029922522041143467, "grad_norm": 6.486456105963774, "learning_rate": 2.382978723404255e-07, "logits/chosen": -1.1559597253799438, "logits/rejected": -1.204541563987732, "logps/chosen": -302.4142150878906, "logps/rejected": -285.40521240234375, "loss": 0.6927, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0032664297614246607, "rewards/margins": 0.0017348529072478414, "rewards/rejected": -0.0050012823194265366, "step": 14 }, { "epoch": 0.03205984504408229, "grad_norm": 5.03221979114363, "learning_rate": 2.553191489361702e-07, "logits/chosen": -1.3137496709823608, "logits/rejected": -1.2676270008087158, "logps/chosen": -271.43218994140625, "logps/rejected": -278.537109375, "loss": 0.6928, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0035740709863603115, "rewards/margins": 0.0014838932547718287, "rewards/rejected": -0.0050579640083014965, "step": 15 }, { "epoch": 0.03419716804702111, "grad_norm": 5.805233594511206, "learning_rate": 2.7234042553191485e-07, "logits/chosen": -1.2339807748794556, "logits/rejected": -1.1998343467712402, "logps/chosen": -278.9430236816406, "logps/rejected": -287.36444091796875, "loss": 0.6929, "rewards/accuracies": 0.4375, "rewards/chosen": -0.005366215482354164, "rewards/margins": -0.0015738580841571093, "rewards/rejected": -0.003792357398197055, "step": 16 }, { "epoch": 0.03633449104995993, "grad_norm": 5.939188331286007, "learning_rate": 2.893617021276596e-07, "logits/chosen": -1.267714500427246, "logits/rejected": -1.2154172658920288, "logps/chosen": -243.30702209472656, "logps/rejected": -243.65267944335938, "loss": 0.6924, "rewards/accuracies": 0.4375, "rewards/chosen": -0.005914730951189995, "rewards/margins": -0.003907041624188423, "rewards/rejected": -0.0020076895598322153, "step": 17 }, { "epoch": 0.03847181405289874, "grad_norm": 5.736074036017852, "learning_rate": 3.063829787234042e-07, "logits/chosen": -1.211553692817688, "logits/rejected": -1.218402624130249, "logps/chosen": -246.43824768066406, "logps/rejected": -262.7088928222656, "loss": 0.6922, "rewards/accuracies": 0.5, "rewards/chosen": -0.0035472153685986996, "rewards/margins": -0.00027114871772937477, "rewards/rejected": -0.0032760666217654943, "step": 18 }, { "epoch": 0.04060913705583756, "grad_norm": 8.028547080597, "learning_rate": 3.234042553191489e-07, "logits/chosen": -1.2778618335723877, "logits/rejected": -1.151484727859497, "logps/chosen": -263.84149169921875, "logps/rejected": -285.849365234375, "loss": 0.6917, "rewards/accuracies": 0.5625, "rewards/chosen": -0.004957227967679501, "rewards/margins": 0.004752359353005886, "rewards/rejected": -0.009709586389362812, "step": 19 }, { "epoch": 0.04274646005877638, "grad_norm": 5.812341074260827, "learning_rate": 3.404255319148936e-07, "logits/chosen": -1.2942702770233154, "logits/rejected": -1.2191565036773682, "logps/chosen": -309.5222473144531, "logps/rejected": -300.4433898925781, "loss": 0.6914, "rewards/accuracies": 0.5625, "rewards/chosen": -0.014837831258773804, "rewards/margins": 0.004754701629281044, "rewards/rejected": -0.019592532888054848, "step": 20 }, { "epoch": 0.0448837830617152, "grad_norm": 5.722892787713038, "learning_rate": 3.574468085106383e-07, "logits/chosen": -1.2925513982772827, "logits/rejected": -1.2902947664260864, "logps/chosen": -201.62168884277344, "logps/rejected": -207.10025024414062, "loss": 0.6911, "rewards/accuracies": 0.75, "rewards/chosen": -0.005997480824589729, "rewards/margins": 0.002215881133452058, "rewards/rejected": -0.008213362656533718, "step": 21 }, { "epoch": 0.04702110606465402, "grad_norm": 5.066561101041897, "learning_rate": 3.74468085106383e-07, "logits/chosen": -1.2172021865844727, "logits/rejected": -1.2375967502593994, "logps/chosen": -213.11264038085938, "logps/rejected": -208.974609375, "loss": 0.6908, "rewards/accuracies": 0.5, "rewards/chosen": -0.011541333049535751, "rewards/margins": 0.0011011171154677868, "rewards/rejected": -0.012642450630664825, "step": 22 }, { "epoch": 0.04915842906759284, "grad_norm": 5.8862642088746355, "learning_rate": 3.914893617021276e-07, "logits/chosen": -1.0798557996749878, "logits/rejected": -0.9454042911529541, "logps/chosen": -252.95816040039062, "logps/rejected": -280.6626892089844, "loss": 0.6905, "rewards/accuracies": 0.625, "rewards/chosen": -0.014871005900204182, "rewards/margins": 0.007070617750287056, "rewards/rejected": -0.021941622719168663, "step": 23 }, { "epoch": 0.05129575207053166, "grad_norm": 9.358799637903934, "learning_rate": 4.085106382978723e-07, "logits/chosen": -1.229708194732666, "logits/rejected": -1.1125763654708862, "logps/chosen": -205.67190551757812, "logps/rejected": -238.33160400390625, "loss": 0.6902, "rewards/accuracies": 0.5, "rewards/chosen": -0.012925582006573677, "rewards/margins": 0.0010348652722314, "rewards/rejected": -0.01396044623106718, "step": 24 }, { "epoch": 0.053433075073470476, "grad_norm": 5.386776465639343, "learning_rate": 4.25531914893617e-07, "logits/chosen": -1.0455987453460693, "logits/rejected": -1.0133016109466553, "logps/chosen": -286.6942138671875, "logps/rejected": -276.3359069824219, "loss": 0.6897, "rewards/accuracies": 0.6875, "rewards/chosen": -0.017084330320358276, "rewards/margins": 0.006912917364388704, "rewards/rejected": -0.023997247219085693, "step": 25 }, { "epoch": 0.055570398076409296, "grad_norm": 5.50227897442877, "learning_rate": 4.425531914893617e-07, "logits/chosen": -1.0649359226226807, "logits/rejected": -1.0070443153381348, "logps/chosen": -246.0877685546875, "logps/rejected": -261.14007568359375, "loss": 0.689, "rewards/accuracies": 0.375, "rewards/chosen": -0.020686931908130646, "rewards/margins": 0.0015359835233539343, "rewards/rejected": -0.022222913801670074, "step": 26 }, { "epoch": 0.057707721079348115, "grad_norm": 5.926262302660941, "learning_rate": 4.595744680851064e-07, "logits/chosen": -1.1902655363082886, "logits/rejected": -1.2254102230072021, "logps/chosen": -257.3299255371094, "logps/rejected": -252.44285583496094, "loss": 0.6881, "rewards/accuracies": 0.625, "rewards/chosen": -0.015050854533910751, "rewards/margins": 0.007191185839474201, "rewards/rejected": -0.022242041304707527, "step": 27 }, { "epoch": 0.059845044082286934, "grad_norm": 5.816673583594112, "learning_rate": 4.76595744680851e-07, "logits/chosen": -1.2353150844573975, "logits/rejected": -1.2537351846694946, "logps/chosen": -239.78131103515625, "logps/rejected": -237.89956665039062, "loss": 0.6871, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0323203019797802, "rewards/margins": 0.00978852715343237, "rewards/rejected": -0.04210883006453514, "step": 28 }, { "epoch": 0.061982367085225754, "grad_norm": 6.282142408603975, "learning_rate": 4.936170212765957e-07, "logits/chosen": -1.1677157878875732, "logits/rejected": -1.1619088649749756, "logps/chosen": -272.3318176269531, "logps/rejected": -272.49407958984375, "loss": 0.687, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03659503534436226, "rewards/margins": 0.010677139274775982, "rewards/rejected": -0.047272179275751114, "step": 29 }, { "epoch": 0.06411969008816458, "grad_norm": 6.059359505202537, "learning_rate": 5.106382978723404e-07, "logits/chosen": -1.1942038536071777, "logits/rejected": -1.2899129390716553, "logps/chosen": -215.77291870117188, "logps/rejected": -200.72760009765625, "loss": 0.6857, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02227736823260784, "rewards/margins": 0.0018395374063402414, "rewards/rejected": -0.02411690726876259, "step": 30 }, { "epoch": 0.06625701309110339, "grad_norm": 5.482721306670962, "learning_rate": 5.276595744680851e-07, "logits/chosen": -1.0668655633926392, "logits/rejected": -0.9902481436729431, "logps/chosen": -231.6899871826172, "logps/rejected": -255.43142700195312, "loss": 0.6856, "rewards/accuracies": 0.75, "rewards/chosen": -0.03268000856041908, "rewards/margins": 0.016983937472105026, "rewards/rejected": -0.04966394975781441, "step": 31 }, { "epoch": 0.06839433609404222, "grad_norm": 6.948193767681259, "learning_rate": 5.446808510638297e-07, "logits/chosen": -1.193525791168213, "logits/rejected": -1.2220956087112427, "logps/chosen": -300.5246887207031, "logps/rejected": -285.05584716796875, "loss": 0.6797, "rewards/accuracies": 0.625, "rewards/chosen": -0.06849750876426697, "rewards/margins": 0.023764295503497124, "rewards/rejected": -0.09226180613040924, "step": 32 }, { "epoch": 0.07053165909698103, "grad_norm": 6.841964931394082, "learning_rate": 5.617021276595744e-07, "logits/chosen": -1.037040114402771, "logits/rejected": -1.0746084451675415, "logps/chosen": -326.0146484375, "logps/rejected": -331.83868408203125, "loss": 0.6816, "rewards/accuracies": 0.75, "rewards/chosen": -0.07449866831302643, "rewards/margins": 0.029566621407866478, "rewards/rejected": -0.10406528413295746, "step": 33 }, { "epoch": 0.07266898209991986, "grad_norm": 6.505679088181679, "learning_rate": 5.787234042553192e-07, "logits/chosen": -1.2144380807876587, "logits/rejected": -1.172149658203125, "logps/chosen": -285.81732177734375, "logps/rejected": -277.4313659667969, "loss": 0.6811, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06557968258857727, "rewards/margins": 0.01972799003124237, "rewards/rejected": -0.08530767261981964, "step": 34 }, { "epoch": 0.07480630510285867, "grad_norm": 5.874795982379736, "learning_rate": 5.957446808510638e-07, "logits/chosen": -1.2059704065322876, "logits/rejected": -1.1337679624557495, "logps/chosen": -316.4410400390625, "logps/rejected": -358.4386901855469, "loss": 0.6786, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08414824306964874, "rewards/margins": 0.04845026880502701, "rewards/rejected": -0.13259851932525635, "step": 35 }, { "epoch": 0.07694362810579748, "grad_norm": 6.990449192598191, "learning_rate": 6.127659574468084e-07, "logits/chosen": -1.2996914386749268, "logits/rejected": -1.1766685247421265, "logps/chosen": -225.32235717773438, "logps/rejected": -242.12782287597656, "loss": 0.6773, "rewards/accuracies": 0.625, "rewards/chosen": -0.0663793534040451, "rewards/margins": 0.00477500818669796, "rewards/rejected": -0.07115435600280762, "step": 36 }, { "epoch": 0.07908095110873631, "grad_norm": 6.539356394704236, "learning_rate": 6.297872340425532e-07, "logits/chosen": -1.3612180948257446, "logits/rejected": -1.365153431892395, "logps/chosen": -247.11227416992188, "logps/rejected": -245.68914794921875, "loss": 0.6767, "rewards/accuracies": 0.625, "rewards/chosen": -0.10457728803157806, "rewards/margins": 0.018666967749595642, "rewards/rejected": -0.12324424088001251, "step": 37 }, { "epoch": 0.08121827411167512, "grad_norm": 7.001992517945579, "learning_rate": 6.468085106382978e-07, "logits/chosen": -1.206650733947754, "logits/rejected": -1.1853901147842407, "logps/chosen": -290.7265319824219, "logps/rejected": -278.7143249511719, "loss": 0.6706, "rewards/accuracies": 0.875, "rewards/chosen": -0.08214958757162094, "rewards/margins": 0.03963039815425873, "rewards/rejected": -0.12177997827529907, "step": 38 }, { "epoch": 0.08335559711461395, "grad_norm": 6.9572266440707455, "learning_rate": 6.638297872340425e-07, "logits/chosen": -1.2570126056671143, "logits/rejected": -1.209625005722046, "logps/chosen": -220.4678955078125, "logps/rejected": -248.70547485351562, "loss": 0.6708, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10038035362958908, "rewards/margins": 0.0464475080370903, "rewards/rejected": -0.14682786166667938, "step": 39 }, { "epoch": 0.08549292011755276, "grad_norm": 6.646103732875039, "learning_rate": 6.808510638297872e-07, "logits/chosen": -1.2964568138122559, "logits/rejected": -1.2421560287475586, "logps/chosen": -268.4561767578125, "logps/rejected": -277.7322998046875, "loss": 0.6691, "rewards/accuracies": 0.625, "rewards/chosen": -0.12192462384700775, "rewards/margins": 0.024914879351854324, "rewards/rejected": -0.14683951437473297, "step": 40 }, { "epoch": 0.08763024312049159, "grad_norm": 6.963601763048083, "learning_rate": 6.978723404255319e-07, "logits/chosen": -1.2182767391204834, "logits/rejected": -1.144841194152832, "logps/chosen": -208.9151611328125, "logps/rejected": -223.7606201171875, "loss": 0.6683, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10518630594015121, "rewards/margins": 0.0583706796169281, "rewards/rejected": -0.1635569930076599, "step": 41 }, { "epoch": 0.0897675661234304, "grad_norm": 6.059193610856394, "learning_rate": 7.148936170212766e-07, "logits/chosen": -1.2251960039138794, "logits/rejected": -1.1969135999679565, "logps/chosen": -275.060791015625, "logps/rejected": -310.41168212890625, "loss": 0.6666, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17626452445983887, "rewards/margins": 0.06843668222427368, "rewards/rejected": -0.24470120668411255, "step": 42 }, { "epoch": 0.09190488912636922, "grad_norm": 6.5707188570559385, "learning_rate": 7.319148936170212e-07, "logits/chosen": -1.1565831899642944, "logits/rejected": -1.101088285446167, "logps/chosen": -232.02914428710938, "logps/rejected": -240.09799194335938, "loss": 0.6632, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10670860856771469, "rewards/margins": 0.05728934705257416, "rewards/rejected": -0.16399797797203064, "step": 43 }, { "epoch": 0.09404221212930804, "grad_norm": 6.11669964043902, "learning_rate": 7.48936170212766e-07, "logits/chosen": -1.042961835861206, "logits/rejected": -1.1094971895217896, "logps/chosen": -259.85546875, "logps/rejected": -273.7874755859375, "loss": 0.6633, "rewards/accuracies": 0.625, "rewards/chosen": -0.1373087465763092, "rewards/margins": 0.02613016963005066, "rewards/rejected": -0.16343891620635986, "step": 44 }, { "epoch": 0.09617953513224686, "grad_norm": 6.39410867008386, "learning_rate": 7.659574468085107e-07, "logits/chosen": -1.2064584493637085, "logits/rejected": -1.1550904512405396, "logps/chosen": -342.9076232910156, "logps/rejected": -351.82830810546875, "loss": 0.6587, "rewards/accuracies": 0.625, "rewards/chosen": -0.20879533886909485, "rewards/margins": 0.049167849123477936, "rewards/rejected": -0.2579631805419922, "step": 45 }, { "epoch": 0.09831685813518568, "grad_norm": 5.9662902412289816, "learning_rate": 7.829787234042552e-07, "logits/chosen": -1.2047083377838135, "logits/rejected": -1.1663964986801147, "logps/chosen": -279.71356201171875, "logps/rejected": -309.82489013671875, "loss": 0.6615, "rewards/accuracies": 0.5625, "rewards/chosen": -0.21875953674316406, "rewards/margins": 0.14003317058086395, "rewards/rejected": -0.3587927222251892, "step": 46 }, { "epoch": 0.1004541811381245, "grad_norm": 6.848260346060141, "learning_rate": 8e-07, "logits/chosen": -1.2341694831848145, "logits/rejected": -1.198740839958191, "logps/chosen": -273.4669189453125, "logps/rejected": -289.4903564453125, "loss": 0.6422, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2072269469499588, "rewards/margins": 0.15279962122440338, "rewards/rejected": -0.3600265383720398, "step": 47 }, { "epoch": 0.10259150414106331, "grad_norm": 6.627952291422947, "learning_rate": 7.999888100245079e-07, "logits/chosen": -1.1258509159088135, "logits/rejected": -1.0670114755630493, "logps/chosen": -218.09140014648438, "logps/rejected": -236.913818359375, "loss": 0.6483, "rewards/accuracies": 0.875, "rewards/chosen": -0.10230693221092224, "rewards/margins": 0.11233773827552795, "rewards/rejected": -0.2146446704864502, "step": 48 }, { "epoch": 0.10472882714400214, "grad_norm": 7.206747256622627, "learning_rate": 7.999552407241093e-07, "logits/chosen": -1.1644402742385864, "logits/rejected": -1.1633307933807373, "logps/chosen": -222.858642578125, "logps/rejected": -237.24234008789062, "loss": 0.6281, "rewards/accuracies": 0.75, "rewards/chosen": -0.17772327363491058, "rewards/margins": 0.06820763647556305, "rewards/rejected": -0.24593091011047363, "step": 49 }, { "epoch": 0.10686615014694095, "grad_norm": 6.2831933314047435, "learning_rate": 7.998992939770026e-07, "logits/chosen": -1.2497477531433105, "logits/rejected": -1.1775400638580322, "logps/chosen": -223.41224670410156, "logps/rejected": -229.77484130859375, "loss": 0.6441, "rewards/accuracies": 0.5625, "rewards/chosen": -0.176996111869812, "rewards/margins": 0.10983872413635254, "rewards/rejected": -0.28683483600616455, "step": 50 }, { "epoch": 0.10900347314987978, "grad_norm": 6.77971339252681, "learning_rate": 7.998209729134013e-07, "logits/chosen": -1.2855724096298218, "logits/rejected": -1.1933931112289429, "logps/chosen": -291.30792236328125, "logps/rejected": -304.0755310058594, "loss": 0.6383, "rewards/accuracies": 0.625, "rewards/chosen": -0.43583106994628906, "rewards/margins": 0.13815590739250183, "rewards/rejected": -0.5739869475364685, "step": 51 }, { "epoch": 0.11114079615281859, "grad_norm": 7.218967742515957, "learning_rate": 7.997202819153594e-07, "logits/chosen": -1.1941241025924683, "logits/rejected": -1.2352594137191772, "logps/chosen": -296.0772399902344, "logps/rejected": -312.9319152832031, "loss": 0.6107, "rewards/accuracies": 0.75, "rewards/chosen": -0.3638754189014435, "rewards/margins": 0.1599450707435608, "rewards/rejected": -0.5238205194473267, "step": 52 }, { "epoch": 0.11327811915575742, "grad_norm": 6.701000620809773, "learning_rate": 7.995972266165259e-07, "logits/chosen": -1.2163739204406738, "logits/rejected": -1.1790796518325806, "logps/chosen": -342.8826599121094, "logps/rejected": -385.2430419921875, "loss": 0.6203, "rewards/accuracies": 0.625, "rewards/chosen": -0.6190586686134338, "rewards/margins": 0.20686545968055725, "rewards/rejected": -0.8259240984916687, "step": 53 }, { "epoch": 0.11541544215869623, "grad_norm": 7.252043517716344, "learning_rate": 7.994518139018295e-07, "logits/chosen": -1.2206201553344727, "logits/rejected": -1.1347448825836182, "logps/chosen": -285.89044189453125, "logps/rejected": -303.69091796875, "loss": 0.6351, "rewards/accuracies": 0.5625, "rewards/chosen": -0.45011359453201294, "rewards/margins": 0.11390627920627594, "rewards/rejected": -0.5640199184417725, "step": 54 }, { "epoch": 0.11755276516163506, "grad_norm": 9.977900301785272, "learning_rate": 7.99284051907094e-07, "logits/chosen": -1.0799576044082642, "logits/rejected": -1.00977623462677, "logps/chosen": -271.39154052734375, "logps/rejected": -310.511474609375, "loss": 0.5842, "rewards/accuracies": 0.75, "rewards/chosen": -0.5156710147857666, "rewards/margins": 0.30485641956329346, "rewards/rejected": -0.8205274343490601, "step": 55 }, { "epoch": 0.11969008816457387, "grad_norm": 6.400760110377084, "learning_rate": 7.990939500185824e-07, "logits/chosen": -1.1088342666625977, "logits/rejected": -1.1030653715133667, "logps/chosen": -364.3983459472656, "logps/rejected": -385.44000244140625, "loss": 0.618, "rewards/accuracies": 0.625, "rewards/chosen": -0.8900222182273865, "rewards/margins": 0.1941274106502533, "rewards/rejected": -1.0841495990753174, "step": 56 }, { "epoch": 0.1218274111675127, "grad_norm": 6.9939690893971695, "learning_rate": 7.98881518872472e-07, "logits/chosen": -1.2345085144042969, "logits/rejected": -1.148017406463623, "logps/chosen": -302.2966003417969, "logps/rejected": -330.4090576171875, "loss": 0.5864, "rewards/accuracies": 0.75, "rewards/chosen": -0.7428092360496521, "rewards/margins": 0.30247360467910767, "rewards/rejected": -1.0452828407287598, "step": 57 }, { "epoch": 0.12396473417045151, "grad_norm": 7.089640899087372, "learning_rate": 7.986467703542595e-07, "logits/chosen": -1.2412291765213013, "logits/rejected": -1.2027617692947388, "logps/chosen": -309.7779235839844, "logps/rejected": -313.6658935546875, "loss": 0.6085, "rewards/accuracies": 0.625, "rewards/chosen": -0.9433287978172302, "rewards/margins": 0.06926216185092926, "rewards/rejected": -1.0125908851623535, "step": 58 }, { "epoch": 0.12610205717339032, "grad_norm": 8.578352434954352, "learning_rate": 7.983897175980956e-07, "logits/chosen": -1.187761902809143, "logits/rejected": -1.1473008394241333, "logps/chosen": -522.899169921875, "logps/rejected": -563.5382080078125, "loss": 0.5888, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4695754051208496, "rewards/margins": 0.39022523164749146, "rewards/rejected": -1.8598008155822754, "step": 59 }, { "epoch": 0.12823938017632916, "grad_norm": 7.479375095677279, "learning_rate": 7.981103749860505e-07, "logits/chosen": -1.1445815563201904, "logits/rejected": -1.1645028591156006, "logps/chosen": -345.0823974609375, "logps/rejected": -342.9745788574219, "loss": 0.5963, "rewards/accuracies": 0.625, "rewards/chosen": -0.995075523853302, "rewards/margins": 0.0031197145581245422, "rewards/rejected": -0.9981951713562012, "step": 60 }, { "epoch": 0.13037670317926797, "grad_norm": 9.767907382058546, "learning_rate": 7.978087581473093e-07, "logits/chosen": -1.1243810653686523, "logits/rejected": -1.064988374710083, "logps/chosen": -456.0157470703125, "logps/rejected": -530.0255737304688, "loss": 0.5985, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5085893869400024, "rewards/margins": 0.4878966808319092, "rewards/rejected": -1.996485948562622, "step": 61 }, { "epoch": 0.13251402618220678, "grad_norm": 8.307089325438865, "learning_rate": 7.97484883957297e-07, "logits/chosen": -1.2906808853149414, "logits/rejected": -1.232366919517517, "logps/chosen": -459.9493103027344, "logps/rejected": -529.822021484375, "loss": 0.5535, "rewards/accuracies": 0.875, "rewards/chosen": -1.7710328102111816, "rewards/margins": 0.7507337331771851, "rewards/rejected": -2.5217666625976562, "step": 62 }, { "epoch": 0.1346513491851456, "grad_norm": 9.937869344696164, "learning_rate": 7.971387705367349e-07, "logits/chosen": -1.0343647003173828, "logits/rejected": -1.009024739265442, "logps/chosen": -344.3237609863281, "logps/rejected": -414.5412292480469, "loss": 0.5707, "rewards/accuracies": 0.8125, "rewards/chosen": -1.04449462890625, "rewards/margins": 0.5895835757255554, "rewards/rejected": -1.6340781450271606, "step": 63 }, { "epoch": 0.13678867218808444, "grad_norm": 8.102078008830782, "learning_rate": 7.967704372506265e-07, "logits/chosen": -1.136821985244751, "logits/rejected": -1.145324468612671, "logps/chosen": -454.86968994140625, "logps/rejected": -522.869140625, "loss": 0.5878, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6470240354537964, "rewards/margins": 0.73206627368927, "rewards/rejected": -2.3790900707244873, "step": 64 }, { "epoch": 0.13892599519102325, "grad_norm": 9.062627671259957, "learning_rate": 7.963799047071739e-07, "logits/chosen": -1.1571276187896729, "logits/rejected": -1.1334136724472046, "logps/chosen": -454.8720397949219, "logps/rejected": -572.2128295898438, "loss": 0.5829, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8390952348709106, "rewards/margins": 1.0353538990020752, "rewards/rejected": -2.8744490146636963, "step": 65 }, { "epoch": 0.14106331819396206, "grad_norm": 9.107794878722252, "learning_rate": 7.959671947566251e-07, "logits/chosen": -1.3037822246551514, "logits/rejected": -1.254806637763977, "logps/chosen": -514.9037475585938, "logps/rejected": -532.9312133789062, "loss": 0.6234, "rewards/accuracies": 0.5625, "rewards/chosen": -2.43453049659729, "rewards/margins": 0.19514000415802002, "rewards/rejected": -2.6296703815460205, "step": 66 }, { "epoch": 0.14320064119690087, "grad_norm": 8.171092519186132, "learning_rate": 7.955323304900513e-07, "logits/chosen": -1.2573140859603882, "logits/rejected": -1.2275471687316895, "logps/chosen": -609.5222778320312, "logps/rejected": -656.034912109375, "loss": 0.5297, "rewards/accuracies": 0.625, "rewards/chosen": -2.932328224182129, "rewards/margins": 0.5183333158493042, "rewards/rejected": -3.4506616592407227, "step": 67 }, { "epoch": 0.14533796419983971, "grad_norm": 8.998419071484543, "learning_rate": 7.95075336238055e-07, "logits/chosen": -1.0774922370910645, "logits/rejected": -1.0322424173355103, "logps/chosen": -358.639892578125, "logps/rejected": -422.7215576171875, "loss": 0.5631, "rewards/accuracies": 0.5, "rewards/chosen": -1.590600609779358, "rewards/margins": 0.5412168502807617, "rewards/rejected": -2.13181734085083, "step": 68 }, { "epoch": 0.14747528720277853, "grad_norm": 9.57704334804126, "learning_rate": 7.945962375694085e-07, "logits/chosen": -1.1822316646575928, "logits/rejected": -1.061968207359314, "logps/chosen": -442.8787536621094, "logps/rejected": -535.091552734375, "loss": 0.6019, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9403026103973389, "rewards/margins": 0.7558570504188538, "rewards/rejected": -2.696159601211548, "step": 69 }, { "epoch": 0.14961261020571734, "grad_norm": 8.335354984340409, "learning_rate": 7.940950612896237e-07, "logits/chosen": -1.1393486261367798, "logits/rejected": -1.148930311203003, "logps/chosen": -565.1220703125, "logps/rejected": -670.720458984375, "loss": 0.562, "rewards/accuracies": 0.5625, "rewards/chosen": -2.9813404083251953, "rewards/margins": 0.9398919343948364, "rewards/rejected": -3.921232223510742, "step": 70 }, { "epoch": 0.15174993320865615, "grad_norm": 9.614295261199452, "learning_rate": 7.935718354394518e-07, "logits/chosen": -1.2340784072875977, "logits/rejected": -1.2443894147872925, "logps/chosen": -579.1088256835938, "logps/rejected": -634.750732421875, "loss": 0.5595, "rewards/accuracies": 0.5, "rewards/chosen": -2.7751572132110596, "rewards/margins": 0.6060917973518372, "rewards/rejected": -3.381248950958252, "step": 71 }, { "epoch": 0.15388725621159496, "grad_norm": 12.368431705912641, "learning_rate": 7.930265892933153e-07, "logits/chosen": -1.2153799533843994, "logits/rejected": -1.24309241771698, "logps/chosen": -538.3513793945312, "logps/rejected": -561.787353515625, "loss": 0.574, "rewards/accuracies": 0.6875, "rewards/chosen": -2.4809069633483887, "rewards/margins": 0.3710068166255951, "rewards/rejected": -2.8519139289855957, "step": 72 }, { "epoch": 0.1560245792145338, "grad_norm": 11.318087514647862, "learning_rate": 7.92459353357669e-07, "logits/chosen": -1.1397067308425903, "logits/rejected": -1.1050128936767578, "logps/chosen": -494.8123779296875, "logps/rejected": -600.887451171875, "loss": 0.607, "rewards/accuracies": 0.75, "rewards/chosen": -2.3441858291625977, "rewards/margins": 0.8333718776702881, "rewards/rejected": -3.177557945251465, "step": 73 }, { "epoch": 0.15816190221747262, "grad_norm": 12.66693529075148, "learning_rate": 7.918701593692942e-07, "logits/chosen": -1.1999236345291138, "logits/rejected": -1.1343518495559692, "logps/chosen": -537.6787719726562, "logps/rejected": -661.78271484375, "loss": 0.6214, "rewards/accuracies": 0.75, "rewards/chosen": -2.2442550659179688, "rewards/margins": 0.8728527426719666, "rewards/rejected": -3.117107629776001, "step": 74 }, { "epoch": 0.16029922522041143, "grad_norm": 18.744343302034743, "learning_rate": 7.912590402935222e-07, "logits/chosen": -1.210806965827942, "logits/rejected": -1.2487571239471436, "logps/chosen": -598.8636474609375, "logps/rejected": -607.9540405273438, "loss": 0.6434, "rewards/accuracies": 0.625, "rewards/chosen": -3.0624806880950928, "rewards/margins": 0.30628111958503723, "rewards/rejected": -3.3687617778778076, "step": 75 }, { "epoch": 0.16243654822335024, "grad_norm": 10.387480961194791, "learning_rate": 7.906260303223906e-07, "logits/chosen": -1.2011109590530396, "logits/rejected": -1.1724956035614014, "logps/chosen": -523.1990356445312, "logps/rejected": -635.5238647460938, "loss": 0.5103, "rewards/accuracies": 0.6875, "rewards/chosen": -2.499415636062622, "rewards/margins": 0.956606388092041, "rewards/rejected": -3.456022024154663, "step": 76 }, { "epoch": 0.16457387122628908, "grad_norm": 8.286082639772292, "learning_rate": 7.899711648727293e-07, "logits/chosen": -1.1222546100616455, "logits/rejected": -1.0313266515731812, "logps/chosen": -422.1160583496094, "logps/rejected": -486.4732971191406, "loss": 0.5272, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8823583126068115, "rewards/margins": 0.557906985282898, "rewards/rejected": -2.44026517868042, "step": 77 }, { "epoch": 0.1667111942292279, "grad_norm": 14.710358539113331, "learning_rate": 7.892944805841805e-07, "logits/chosen": -1.18240487575531, "logits/rejected": -1.1336525678634644, "logps/chosen": -457.12652587890625, "logps/rejected": -520.3361206054688, "loss": 0.5602, "rewards/accuracies": 0.8125, "rewards/chosen": -2.255852222442627, "rewards/margins": 0.5986082553863525, "rewards/rejected": -2.8544602394104004, "step": 78 }, { "epoch": 0.1688485172321667, "grad_norm": 13.126930480373984, "learning_rate": 7.885960153171469e-07, "logits/chosen": -1.0567469596862793, "logits/rejected": -1.0342040061950684, "logps/chosen": -449.7267761230469, "logps/rejected": -539.2164306640625, "loss": 0.535, "rewards/accuracies": 0.75, "rewards/chosen": -2.0118534564971924, "rewards/margins": 0.6608402729034424, "rewards/rejected": -2.672694206237793, "step": 79 }, { "epoch": 0.17098584023510552, "grad_norm": 11.307140941363295, "learning_rate": 7.878758081506747e-07, "logits/chosen": -1.114970326423645, "logits/rejected": -1.1288405656814575, "logps/chosen": -453.8138732910156, "logps/rejected": -491.1424865722656, "loss": 0.5674, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1642098426818848, "rewards/margins": 0.4818393886089325, "rewards/rejected": -2.6460490226745605, "step": 80 }, { "epoch": 0.17312316323804436, "grad_norm": 11.077741078561617, "learning_rate": 7.871338993802666e-07, "logits/chosen": -1.127083659172058, "logits/rejected": -1.0359755754470825, "logps/chosen": -568.3497314453125, "logps/rejected": -610.1666259765625, "loss": 0.5281, "rewards/accuracies": 0.75, "rewards/chosen": -2.4274895191192627, "rewards/margins": 0.5119785070419312, "rewards/rejected": -2.9394679069519043, "step": 81 }, { "epoch": 0.17526048624098317, "grad_norm": 9.584538258710456, "learning_rate": 7.863703305156272e-07, "logits/chosen": -1.2093491554260254, "logits/rejected": -1.2194944620132446, "logps/chosen": -477.71746826171875, "logps/rejected": -516.8004760742188, "loss": 0.5721, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3491222858428955, "rewards/margins": 0.46743592619895935, "rewards/rejected": -2.8165578842163086, "step": 82 }, { "epoch": 0.17739780924392198, "grad_norm": 9.911683367044649, "learning_rate": 7.855851442783412e-07, "logits/chosen": -1.185421109199524, "logits/rejected": -1.1721405982971191, "logps/chosen": -466.0152587890625, "logps/rejected": -567.0381469726562, "loss": 0.5891, "rewards/accuracies": 0.875, "rewards/chosen": -1.8479394912719727, "rewards/margins": 1.1030150651931763, "rewards/rejected": -2.9509544372558594, "step": 83 }, { "epoch": 0.1795351322468608, "grad_norm": 9.609331845196555, "learning_rate": 7.847783845994824e-07, "logits/chosen": -1.2735304832458496, "logits/rejected": -1.3312420845031738, "logps/chosen": -473.52703857421875, "logps/rejected": -493.1807861328125, "loss": 0.554, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2089741230010986, "rewards/margins": 0.2816333770751953, "rewards/rejected": -2.490607500076294, "step": 84 }, { "epoch": 0.18167245524979964, "grad_norm": 9.131968723887285, "learning_rate": 7.839500966171556e-07, "logits/chosen": -1.2005490064620972, "logits/rejected": -1.029706358909607, "logps/chosen": -393.1463928222656, "logps/rejected": -525.7657470703125, "loss": 0.5741, "rewards/accuracies": 0.75, "rewards/chosen": -1.82315993309021, "rewards/margins": 1.128662109375, "rewards/rejected": -2.95182204246521, "step": 85 }, { "epoch": 0.18380977825273845, "grad_norm": 9.252010710109433, "learning_rate": 7.831003266739721e-07, "logits/chosen": -1.2110841274261475, "logits/rejected": -1.1710755825042725, "logps/chosen": -540.1366577148438, "logps/rejected": -684.7119140625, "loss": 0.5182, "rewards/accuracies": 0.875, "rewards/chosen": -2.6053380966186523, "rewards/margins": 1.2827905416488647, "rewards/rejected": -3.8881282806396484, "step": 86 }, { "epoch": 0.18594710125567726, "grad_norm": 9.076549815928802, "learning_rate": 7.822291223144563e-07, "logits/chosen": -1.0235605239868164, "logits/rejected": -0.9067903757095337, "logps/chosen": -615.156982421875, "logps/rejected": -752.937255859375, "loss": 0.5167, "rewards/accuracies": 0.75, "rewards/chosen": -2.8930633068084717, "rewards/margins": 1.34428071975708, "rewards/rejected": -4.237343788146973, "step": 87 }, { "epoch": 0.18808442425861607, "grad_norm": 11.373953224514699, "learning_rate": 7.81336532282385e-07, "logits/chosen": -1.1721646785736084, "logits/rejected": -1.0930900573730469, "logps/chosen": -280.4423828125, "logps/rejected": -336.436279296875, "loss": 0.597, "rewards/accuracies": 0.625, "rewards/chosen": -1.236398458480835, "rewards/margins": 0.5181585550308228, "rewards/rejected": -1.7545570135116577, "step": 88 }, { "epoch": 0.1902217472615549, "grad_norm": 17.85017844805296, "learning_rate": 7.804226065180615e-07, "logits/chosen": -1.1734278202056885, "logits/rejected": -1.1101915836334229, "logps/chosen": -382.6453552246094, "logps/rejected": -466.7173156738281, "loss": 0.5902, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7278035879135132, "rewards/margins": 0.6392509937286377, "rewards/rejected": -2.3670544624328613, "step": 89 }, { "epoch": 0.19235907026449373, "grad_norm": 12.773370813435598, "learning_rate": 7.794873961555198e-07, "logits/chosen": -1.1437866687774658, "logits/rejected": -1.1589542627334595, "logps/chosen": -544.008544921875, "logps/rejected": -615.4200439453125, "loss": 0.5274, "rewards/accuracies": 0.875, "rewards/chosen": -2.523742198944092, "rewards/margins": 0.7184704542160034, "rewards/rejected": -3.2422127723693848, "step": 90 }, { "epoch": 0.19449639326743254, "grad_norm": 9.908778922785883, "learning_rate": 7.785309535196656e-07, "logits/chosen": -1.1318367719650269, "logits/rejected": -1.089951992034912, "logps/chosen": -529.7106323242188, "logps/rejected": -568.3753051757812, "loss": 0.528, "rewards/accuracies": 0.6875, "rewards/chosen": -2.680912494659424, "rewards/margins": 0.3908342123031616, "rewards/rejected": -3.071746826171875, "step": 91 }, { "epoch": 0.19663371627037135, "grad_norm": 10.509283821989197, "learning_rate": 7.77553332123347e-07, "logits/chosen": -1.2806425094604492, "logits/rejected": -1.2847963571548462, "logps/chosen": -492.8918762207031, "logps/rejected": -563.6588134765625, "loss": 0.537, "rewards/accuracies": 0.75, "rewards/chosen": -2.380342483520508, "rewards/margins": 0.6923278570175171, "rewards/rejected": -3.0726704597473145, "step": 92 }, { "epoch": 0.1987710392733102, "grad_norm": 9.899922277845048, "learning_rate": 7.765545866643613e-07, "logits/chosen": -1.0114997625350952, "logits/rejected": -1.0235319137573242, "logps/chosen": -381.5235290527344, "logps/rejected": -387.6109619140625, "loss": 0.5506, "rewards/accuracies": 0.5, "rewards/chosen": -1.6469519138336182, "rewards/margins": 0.23874562978744507, "rewards/rejected": -1.8856974840164185, "step": 93 }, { "epoch": 0.200908362276249, "grad_norm": 11.043390646196134, "learning_rate": 7.755347730223943e-07, "logits/chosen": -1.1114813089370728, "logits/rejected": -1.1337555646896362, "logps/chosen": -550.198486328125, "logps/rejected": -619.2704467773438, "loss": 0.553, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6842494010925293, "rewards/margins": 0.6715414524078369, "rewards/rejected": -3.355790615081787, "step": 94 }, { "epoch": 0.20304568527918782, "grad_norm": 9.638994670369192, "learning_rate": 7.744939482558948e-07, "logits/chosen": -1.1464406251907349, "logits/rejected": -1.1081212759017944, "logps/chosen": -422.6688537597656, "logps/rejected": -483.55584716796875, "loss": 0.5182, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2197909355163574, "rewards/margins": 0.568549633026123, "rewards/rejected": -2.7883403301239014, "step": 95 }, { "epoch": 0.20518300828212663, "grad_norm": 10.66227269593484, "learning_rate": 7.734321705988807e-07, "logits/chosen": -1.2760132551193237, "logits/rejected": -1.2601702213287354, "logps/chosen": -515.2978515625, "logps/rejected": -529.2376098632812, "loss": 0.5024, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9085440635681152, "rewards/margins": 0.5009690523147583, "rewards/rejected": -2.409513235092163, "step": 96 }, { "epoch": 0.20732033128506547, "grad_norm": 11.057889862172901, "learning_rate": 7.723494994576816e-07, "logits/chosen": -1.1963937282562256, "logits/rejected": -1.125280499458313, "logps/chosen": -494.753662109375, "logps/rejected": -545.3011474609375, "loss": 0.5381, "rewards/accuracies": 0.75, "rewards/chosen": -2.26602840423584, "rewards/margins": 0.47012466192245483, "rewards/rejected": -2.7361528873443604, "step": 97 }, { "epoch": 0.20945765428800428, "grad_norm": 11.127088588048807, "learning_rate": 7.712459954076156e-07, "logits/chosen": -1.0651479959487915, "logits/rejected": -1.1355326175689697, "logps/chosen": -647.6420288085938, "logps/rejected": -641.8303833007812, "loss": 0.5441, "rewards/accuracies": 0.75, "rewards/chosen": -3.322498083114624, "rewards/margins": 0.05157832056283951, "rewards/rejected": -3.3740763664245605, "step": 98 }, { "epoch": 0.2115949772909431, "grad_norm": 11.207670370998112, "learning_rate": 7.701217201895986e-07, "logits/chosen": -1.052258014678955, "logits/rejected": -1.1536017656326294, "logps/chosen": -514.69287109375, "logps/rejected": -530.2621459960938, "loss": 0.5424, "rewards/accuracies": 0.6875, "rewards/chosen": -2.5572896003723145, "rewards/margins": 0.346367210149765, "rewards/rejected": -2.9036567211151123, "step": 99 }, { "epoch": 0.2137323002938819, "grad_norm": 10.113988708766959, "learning_rate": 7.689767367066917e-07, "logits/chosen": -0.8341108560562134, "logits/rejected": -0.8912850618362427, "logps/chosen": -377.82342529296875, "logps/rejected": -454.91064453125, "loss": 0.5793, "rewards/accuracies": 0.75, "rewards/chosen": -1.8863927125930786, "rewards/margins": 0.5190551280975342, "rewards/rejected": -2.4054477214813232, "step": 100 }, { "epoch": 0.2137323002938819, "eval_logits/chosen": -1.2699782848358154, "eval_logits/rejected": -1.2844398021697998, "eval_logps/chosen": -544.71923828125, "eval_logps/rejected": -616.7446899414062, "eval_loss": 0.5182811617851257, "eval_rewards/accuracies": 0.7459677457809448, "eval_rewards/chosen": -2.652132511138916, "eval_rewards/margins": 0.7797600030899048, "eval_rewards/rejected": -3.4318928718566895, "eval_runtime": 142.3104, "eval_samples_per_second": 13.78, "eval_steps_per_second": 0.436, "step": 100 }, { "epoch": 0.21586962329682075, "grad_norm": 16.391709000284067, "learning_rate": 7.678111090205803e-07, "logits/chosen": -1.2273062467575073, "logits/rejected": -1.2417653799057007, "logps/chosen": -559.35888671875, "logps/rejected": -655.1536865234375, "loss": 0.6137, "rewards/accuracies": 0.75, "rewards/chosen": -2.6603152751922607, "rewards/margins": 0.9901705384254456, "rewards/rejected": -3.6504859924316406, "step": 101 }, { "epoch": 0.21800694629975956, "grad_norm": 10.524629928454354, "learning_rate": 7.666249023479905e-07, "logits/chosen": -1.253726601600647, "logits/rejected": -1.0882092714309692, "logps/chosen": -479.20758056640625, "logps/rejected": -595.8485107421875, "loss": 0.4448, "rewards/accuracies": 0.75, "rewards/chosen": -2.2103652954101562, "rewards/margins": 0.9676500558853149, "rewards/rejected": -3.1780154705047607, "step": 102 }, { "epoch": 0.22014426930269837, "grad_norm": 9.72211474274192, "learning_rate": 7.654181830570403e-07, "logits/chosen": -1.0320769548416138, "logits/rejected": -0.9883528351783752, "logps/chosen": -515.2326049804688, "logps/rejected": -605.5062255859375, "loss": 0.4677, "rewards/accuracies": 0.8125, "rewards/chosen": -2.22302508354187, "rewards/margins": 0.9804313778877258, "rewards/rejected": -3.203456401824951, "step": 103 }, { "epoch": 0.22228159230563718, "grad_norm": 10.463756107018286, "learning_rate": 7.641910186635264e-07, "logits/chosen": -1.2872788906097412, "logits/rejected": -1.2068657875061035, "logps/chosen": -533.40576171875, "logps/rejected": -672.7639770507812, "loss": 0.4858, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8758130073547363, "rewards/margins": 1.288372278213501, "rewards/rejected": -4.164185523986816, "step": 104 }, { "epoch": 0.224418915308576, "grad_norm": 9.94692219065209, "learning_rate": 7.629434778271459e-07, "logits/chosen": -1.2782255411148071, "logits/rejected": -1.2008510828018188, "logps/chosen": -422.54052734375, "logps/rejected": -536.9180297851562, "loss": 0.5088, "rewards/accuracies": 0.875, "rewards/chosen": -2.0897161960601807, "rewards/margins": 0.9998084306716919, "rewards/rejected": -3.089524269104004, "step": 105 }, { "epoch": 0.22655623831151483, "grad_norm": 15.519123908773544, "learning_rate": 7.616756303476559e-07, "logits/chosen": -1.2410894632339478, "logits/rejected": -1.2540771961212158, "logps/chosen": -524.3763427734375, "logps/rejected": -598.2949829101562, "loss": 0.6234, "rewards/accuracies": 0.6875, "rewards/chosen": -2.4764623641967773, "rewards/margins": 0.8137304186820984, "rewards/rejected": -3.2901928424835205, "step": 106 }, { "epoch": 0.22869356131445365, "grad_norm": 10.169089359472466, "learning_rate": 7.603875471609676e-07, "logits/chosen": -1.0019772052764893, "logits/rejected": -0.9863028526306152, "logps/chosen": -614.7957763671875, "logps/rejected": -719.146728515625, "loss": 0.5029, "rewards/accuracies": 0.8125, "rewards/chosen": -3.299185276031494, "rewards/margins": 0.8879890441894531, "rewards/rejected": -4.187174320220947, "step": 107 }, { "epoch": 0.23083088431739246, "grad_norm": 10.471849086193684, "learning_rate": 7.590793003351773e-07, "logits/chosen": -1.132460355758667, "logits/rejected": -1.115264654159546, "logps/chosen": -640.571044921875, "logps/rejected": -740.5921020507812, "loss": 0.4963, "rewards/accuracies": 0.8125, "rewards/chosen": -3.463143825531006, "rewards/margins": 0.9680495858192444, "rewards/rejected": -4.431193828582764, "step": 108 }, { "epoch": 0.23296820732033127, "grad_norm": 11.823001541225018, "learning_rate": 7.577509630665347e-07, "logits/chosen": -1.1503872871398926, "logits/rejected": -1.0826492309570312, "logps/chosen": -555.7222290039062, "logps/rejected": -684.6669311523438, "loss": 0.5508, "rewards/accuracies": 0.8125, "rewards/chosen": -2.875490427017212, "rewards/margins": 1.2122423648834229, "rewards/rejected": -4.087732315063477, "step": 109 }, { "epoch": 0.2351055303232701, "grad_norm": 9.86238586725783, "learning_rate": 7.564026096753471e-07, "logits/chosen": -1.2278306484222412, "logits/rejected": -1.140399694442749, "logps/chosen": -543.5636596679688, "logps/rejected": -715.5054321289062, "loss": 0.4682, "rewards/accuracies": 0.8125, "rewards/chosen": -2.983625888824463, "rewards/margins": 1.4977326393127441, "rewards/rejected": -4.481358528137207, "step": 110 }, { "epoch": 0.23724285332620892, "grad_norm": 9.440230861402858, "learning_rate": 7.550343156018216e-07, "logits/chosen": -1.2607563734054565, "logits/rejected": -1.253735065460205, "logps/chosen": -500.310546875, "logps/rejected": -607.3995971679688, "loss": 0.5163, "rewards/accuracies": 0.75, "rewards/chosen": -2.420441150665283, "rewards/margins": 1.0024170875549316, "rewards/rejected": -3.422858238220215, "step": 111 }, { "epoch": 0.23938017632914774, "grad_norm": 12.215223201725872, "learning_rate": 7.536461574018439e-07, "logits/chosen": -1.108474850654602, "logits/rejected": -1.146998643875122, "logps/chosen": -474.21197509765625, "logps/rejected": -562.3438110351562, "loss": 0.5669, "rewards/accuracies": 0.6875, "rewards/chosen": -2.460186243057251, "rewards/margins": 0.8443668484687805, "rewards/rejected": -3.3045530319213867, "step": 112 }, { "epoch": 0.24151749933208655, "grad_norm": 12.559970980315386, "learning_rate": 7.522382127426952e-07, "logits/chosen": -1.2318087816238403, "logits/rejected": -1.1735649108886719, "logps/chosen": -432.732421875, "logps/rejected": -507.0394287109375, "loss": 0.4479, "rewards/accuracies": 0.6875, "rewards/chosen": -2.251821756362915, "rewards/margins": 0.5928990840911865, "rewards/rejected": -2.8447208404541016, "step": 113 }, { "epoch": 0.2436548223350254, "grad_norm": 13.07315315212208, "learning_rate": 7.508105603987066e-07, "logits/chosen": -1.0219734907150269, "logits/rejected": -0.9402412176132202, "logps/chosen": -579.5034790039062, "logps/rejected": -664.8965454101562, "loss": 0.5591, "rewards/accuracies": 0.6875, "rewards/chosen": -2.9037399291992188, "rewards/margins": 0.7655159831047058, "rewards/rejected": -3.6692559719085693, "step": 114 }, { "epoch": 0.2457921453379642, "grad_norm": 10.646810060304825, "learning_rate": 7.493632802468518e-07, "logits/chosen": -1.1602777242660522, "logits/rejected": -1.0831139087677002, "logps/chosen": -389.80419921875, "logps/rejected": -458.3884582519531, "loss": 0.5419, "rewards/accuracies": 0.625, "rewards/chosen": -1.8653407096862793, "rewards/margins": 0.5108134746551514, "rewards/rejected": -2.3761539459228516, "step": 115 }, { "epoch": 0.24792946834090301, "grad_norm": 10.71566783907731, "learning_rate": 7.478964532622782e-07, "logits/chosen": -1.1830374002456665, "logits/rejected": -1.1820306777954102, "logps/chosen": -591.6640625, "logps/rejected": -658.7415771484375, "loss": 0.4764, "rewards/accuracies": 0.8125, "rewards/chosen": -2.90578031539917, "rewards/margins": 0.7861341238021851, "rewards/rejected": -3.6919145584106445, "step": 116 }, { "epoch": 0.25006679134384185, "grad_norm": 11.694350723954798, "learning_rate": 7.464101615137755e-07, "logits/chosen": -1.0726900100708008, "logits/rejected": -1.1676019430160522, "logps/chosen": -554.7371215820312, "logps/rejected": -629.0760498046875, "loss": 0.534, "rewards/accuracies": 0.625, "rewards/chosen": -2.7668631076812744, "rewards/margins": 0.9604626297950745, "rewards/rejected": -3.727325916290283, "step": 117 }, { "epoch": 0.25220411434678064, "grad_norm": 11.734514848455067, "learning_rate": 7.44904488159185e-07, "logits/chosen": -1.0949318408966064, "logits/rejected": -1.0768948793411255, "logps/chosen": -397.58837890625, "logps/rejected": -468.90264892578125, "loss": 0.4996, "rewards/accuracies": 1.0, "rewards/chosen": -2.029899835586548, "rewards/margins": 0.6923526525497437, "rewards/rejected": -2.722252607345581, "step": 118 }, { "epoch": 0.2543414373497195, "grad_norm": 10.42794481881483, "learning_rate": 7.433795174407464e-07, "logits/chosen": -1.266892910003662, "logits/rejected": -1.1528326272964478, "logps/chosen": -570.2450561523438, "logps/rejected": -709.3895263671875, "loss": 0.5043, "rewards/accuracies": 0.875, "rewards/chosen": -2.995210647583008, "rewards/margins": 1.1049041748046875, "rewards/rejected": -4.100114822387695, "step": 119 }, { "epoch": 0.2564787603526583, "grad_norm": 10.929127899973288, "learning_rate": 7.418353346803846e-07, "logits/chosen": -1.0802345275878906, "logits/rejected": -1.1326872110366821, "logps/chosen": -526.001708984375, "logps/rejected": -621.600341796875, "loss": 0.4601, "rewards/accuracies": 0.875, "rewards/chosen": -2.6054108142852783, "rewards/margins": 1.0312682390213013, "rewards/rejected": -3.636679172515869, "step": 120 }, { "epoch": 0.2586160833555971, "grad_norm": 11.494420054527998, "learning_rate": 7.402720262749359e-07, "logits/chosen": -1.2067303657531738, "logits/rejected": -1.2558921575546265, "logps/chosen": -564.7210693359375, "logps/rejected": -711.4730224609375, "loss": 0.498, "rewards/accuracies": 0.625, "rewards/chosen": -2.781289577484131, "rewards/margins": 1.5313748121261597, "rewards/rejected": -4.31266450881958, "step": 121 }, { "epoch": 0.26075340635853594, "grad_norm": 11.148239249520412, "learning_rate": 7.386896796913136e-07, "logits/chosen": -1.1556284427642822, "logits/rejected": -1.01291823387146, "logps/chosen": -429.47967529296875, "logps/rejected": -533.7469482421875, "loss": 0.4546, "rewards/accuracies": 0.875, "rewards/chosen": -2.000453233718872, "rewards/margins": 0.8496807813644409, "rewards/rejected": -2.8501341342926025, "step": 122 }, { "epoch": 0.26289072936147473, "grad_norm": 11.713757273838597, "learning_rate": 7.370883834616156e-07, "logits/chosen": -1.1531327962875366, "logits/rejected": -1.1401838064193726, "logps/chosen": -551.8589477539062, "logps/rejected": -632.5919799804688, "loss": 0.5588, "rewards/accuracies": 0.625, "rewards/chosen": -2.5737900733947754, "rewards/margins": 0.8271616101264954, "rewards/rejected": -3.400951623916626, "step": 123 }, { "epoch": 0.26502805236441357, "grad_norm": 10.989454375451436, "learning_rate": 7.354682271781696e-07, "logits/chosen": -1.1755902767181396, "logits/rejected": -1.1291191577911377, "logps/chosen": -571.854736328125, "logps/rejected": -655.4735107421875, "loss": 0.5171, "rewards/accuracies": 0.6875, "rewards/chosen": -2.706859827041626, "rewards/margins": 0.9184459447860718, "rewards/rejected": -3.625305652618408, "step": 124 }, { "epoch": 0.2671653753673524, "grad_norm": 9.756298897089394, "learning_rate": 7.33829301488521e-07, "logits/chosen": -1.1387619972229004, "logits/rejected": -1.1437007188796997, "logps/chosen": -448.17193603515625, "logps/rejected": -520.0236206054688, "loss": 0.4732, "rewards/accuracies": 0.75, "rewards/chosen": -2.275339126586914, "rewards/margins": 0.6221529841423035, "rewards/rejected": -2.8974921703338623, "step": 125 }, { "epoch": 0.2693026983702912, "grad_norm": 11.552300951274553, "learning_rate": 7.321716980903615e-07, "logits/chosen": -1.2335395812988281, "logits/rejected": -1.2084323167800903, "logps/chosen": -494.1284484863281, "logps/rejected": -644.2607421875, "loss": 0.5453, "rewards/accuracies": 0.8125, "rewards/chosen": -2.216590404510498, "rewards/margins": 1.4140255451202393, "rewards/rejected": -3.630615711212158, "step": 126 }, { "epoch": 0.27144002137323003, "grad_norm": 14.030573939890216, "learning_rate": 7.304955097263979e-07, "logits/chosen": -1.0867464542388916, "logits/rejected": -1.1234714984893799, "logps/chosen": -524.39453125, "logps/rejected": -586.8427734375, "loss": 0.5351, "rewards/accuracies": 0.5625, "rewards/chosen": -2.543902635574341, "rewards/margins": 0.9019243717193604, "rewards/rejected": -3.445826530456543, "step": 127 }, { "epoch": 0.2735773443761689, "grad_norm": 11.313049836006805, "learning_rate": 7.288008301791639e-07, "logits/chosen": -1.1349008083343506, "logits/rejected": -1.1048234701156616, "logps/chosen": -461.23516845703125, "logps/rejected": -517.0540771484375, "loss": 0.5452, "rewards/accuracies": 0.8125, "rewards/chosen": -2.255331039428711, "rewards/margins": 0.661991536617279, "rewards/rejected": -2.9173226356506348, "step": 128 }, { "epoch": 0.27571466737910766, "grad_norm": 10.703951432530653, "learning_rate": 7.270877542657725e-07, "logits/chosen": -1.1094213724136353, "logits/rejected": -1.1715359687805176, "logps/chosen": -489.816650390625, "logps/rejected": -586.4013671875, "loss": 0.4797, "rewards/accuracies": 0.75, "rewards/chosen": -2.231893301010132, "rewards/margins": 1.1094894409179688, "rewards/rejected": -3.3413827419281006, "step": 129 }, { "epoch": 0.2778519903820465, "grad_norm": 10.775339737914736, "learning_rate": 7.253563778326111e-07, "logits/chosen": -1.2365562915802002, "logits/rejected": -1.1621637344360352, "logps/chosen": -537.2908325195312, "logps/rejected": -638.2408447265625, "loss": 0.5017, "rewards/accuracies": 0.875, "rewards/chosen": -2.384437322616577, "rewards/margins": 1.190860629081726, "rewards/rejected": -3.5752978324890137, "step": 130 }, { "epoch": 0.2799893133849853, "grad_norm": 11.015612451823644, "learning_rate": 7.23606797749979e-07, "logits/chosen": -1.0056509971618652, "logits/rejected": -1.0789600610733032, "logps/chosen": -605.8868408203125, "logps/rejected": -681.7349243164062, "loss": 0.4895, "rewards/accuracies": 0.875, "rewards/chosen": -3.0001766681671143, "rewards/margins": 0.875503659248352, "rewards/rejected": -3.875680685043335, "step": 131 }, { "epoch": 0.2821266363879241, "grad_norm": 14.664837385616957, "learning_rate": 7.218391119066673e-07, "logits/chosen": -1.0803807973861694, "logits/rejected": -1.0349315404891968, "logps/chosen": -613.3341064453125, "logps/rejected": -670.0115356445312, "loss": 0.5508, "rewards/accuracies": 0.625, "rewards/chosen": -3.3352606296539307, "rewards/margins": 0.5032040476799011, "rewards/rejected": -3.8384647369384766, "step": 132 }, { "epoch": 0.28426395939086296, "grad_norm": 11.315998787147386, "learning_rate": 7.200534192044825e-07, "logits/chosen": -1.0804975032806396, "logits/rejected": -0.9459331035614014, "logps/chosen": -410.5482482910156, "logps/rejected": -552.6442260742188, "loss": 0.5149, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9860866069793701, "rewards/margins": 1.2312902212142944, "rewards/rejected": -3.217376708984375, "step": 133 }, { "epoch": 0.28640128239380175, "grad_norm": 11.906263072694715, "learning_rate": 7.182498195527123e-07, "logits/chosen": -0.9738950133323669, "logits/rejected": -1.0062488317489624, "logps/chosen": -404.84100341796875, "logps/rejected": -420.71380615234375, "loss": 0.4926, "rewards/accuracies": 0.5625, "rewards/chosen": -2.204134464263916, "rewards/margins": 0.22545769810676575, "rewards/rejected": -2.4295921325683594, "step": 134 }, { "epoch": 0.2885386053967406, "grad_norm": 10.372611012110845, "learning_rate": 7.164284138625366e-07, "logits/chosen": -1.0062649250030518, "logits/rejected": -0.8770849704742432, "logps/chosen": -423.56182861328125, "logps/rejected": -553.545166015625, "loss": 0.4668, "rewards/accuracies": 0.8125, "rewards/chosen": -2.139157295227051, "rewards/margins": 1.1258550882339478, "rewards/rejected": -3.265012264251709, "step": 135 }, { "epoch": 0.29067592839967943, "grad_norm": 12.207304324310757, "learning_rate": 7.1458930404138e-07, "logits/chosen": -1.084909439086914, "logits/rejected": -1.0752151012420654, "logps/chosen": -564.194091796875, "logps/rejected": -667.5319213867188, "loss": 0.4906, "rewards/accuracies": 0.75, "rewards/chosen": -2.760105848312378, "rewards/margins": 1.005811333656311, "rewards/rejected": -3.7659173011779785, "step": 136 }, { "epoch": 0.2928132514026182, "grad_norm": 13.61174460411604, "learning_rate": 7.12732592987212e-07, "logits/chosen": -1.1829478740692139, "logits/rejected": -1.0823060274124146, "logps/chosen": -590.635009765625, "logps/rejected": -802.473388671875, "loss": 0.5161, "rewards/accuracies": 0.6875, "rewards/chosen": -2.7893006801605225, "rewards/margins": 1.989109754562378, "rewards/rejected": -4.7784104347229, "step": 137 }, { "epoch": 0.29495057440555705, "grad_norm": 10.128959641227988, "learning_rate": 7.108583845827883e-07, "logits/chosen": -1.1610594987869263, "logits/rejected": -1.1704530715942383, "logps/chosen": -387.8962097167969, "logps/rejected": -480.23797607421875, "loss": 0.4904, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9382271766662598, "rewards/margins": 0.8355373740196228, "rewards/rejected": -2.7737643718719482, "step": 138 }, { "epoch": 0.29708789740849584, "grad_norm": 10.050177230914596, "learning_rate": 7.089667836898397e-07, "logits/chosen": -1.114013910293579, "logits/rejected": -1.110438585281372, "logps/chosen": -525.3073120117188, "logps/rejected": -606.350830078125, "loss": 0.4506, "rewards/accuracies": 0.6875, "rewards/chosen": -2.643073320388794, "rewards/margins": 0.7284079790115356, "rewards/rejected": -3.371481418609619, "step": 139 }, { "epoch": 0.2992252204114347, "grad_norm": 16.443121928276934, "learning_rate": 7.070578961432044e-07, "logits/chosen": -1.006872296333313, "logits/rejected": -0.9997127652168274, "logps/chosen": -532.4028930664062, "logps/rejected": -579.0374755859375, "loss": 0.5552, "rewards/accuracies": 0.6875, "rewards/chosen": -2.788207769393921, "rewards/margins": 0.6314295530319214, "rewards/rejected": -3.4196369647979736, "step": 140 }, { "epoch": 0.3013625434143735, "grad_norm": 10.5536510889646, "learning_rate": 7.051318287449066e-07, "logits/chosen": -1.0748560428619385, "logits/rejected": -1.0380382537841797, "logps/chosen": -439.17962646484375, "logps/rejected": -555.1817626953125, "loss": 0.5109, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2144086360931396, "rewards/margins": 1.0603525638580322, "rewards/rejected": -3.274761199951172, "step": 141 }, { "epoch": 0.3034998664173123, "grad_norm": 12.492156055956597, "learning_rate": 7.031886892581812e-07, "logits/chosen": -1.0383498668670654, "logits/rejected": -1.008802056312561, "logps/chosen": -501.4347229003906, "logps/rejected": -582.19189453125, "loss": 0.5302, "rewards/accuracies": 0.75, "rewards/chosen": -2.4712347984313965, "rewards/margins": 0.7897406816482544, "rewards/rejected": -3.2609758377075195, "step": 142 }, { "epoch": 0.30563718942025114, "grad_norm": 14.048982773810426, "learning_rate": 7.012285864014444e-07, "logits/chosen": -1.0521856546401978, "logits/rejected": -1.0056830644607544, "logps/chosen": -521.499755859375, "logps/rejected": -605.831787109375, "loss": 0.5057, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7067787647247314, "rewards/margins": 0.7501015663146973, "rewards/rejected": -3.4568800926208496, "step": 143 }, { "epoch": 0.3077745124231899, "grad_norm": 10.420316980175429, "learning_rate": 6.992516298422107e-07, "logits/chosen": -1.0325193405151367, "logits/rejected": -1.072160005569458, "logps/chosen": -711.7697143554688, "logps/rejected": -853.23681640625, "loss": 0.426, "rewards/accuracies": 0.875, "rewards/chosen": -3.530585289001465, "rewards/margins": 1.588570237159729, "rewards/rejected": -5.119154930114746, "step": 144 }, { "epoch": 0.30991183542612877, "grad_norm": 20.310072106279815, "learning_rate": 6.972579301909577e-07, "logits/chosen": -1.07466459274292, "logits/rejected": -1.0684446096420288, "logps/chosen": -638.5875244140625, "logps/rejected": -820.4556884765625, "loss": 0.4766, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3279354572296143, "rewards/margins": 1.7255775928497314, "rewards/rejected": -5.0535125732421875, "step": 145 }, { "epoch": 0.3120491584290676, "grad_norm": 12.2030303667386, "learning_rate": 6.952475989949362e-07, "logits/chosen": -1.0073144435882568, "logits/rejected": -1.0062808990478516, "logps/chosen": -554.0878295898438, "logps/rejected": -649.923828125, "loss": 0.5041, "rewards/accuracies": 0.8125, "rewards/chosen": -2.752135992050171, "rewards/margins": 1.0968319177627563, "rewards/rejected": -3.848968029022217, "step": 146 }, { "epoch": 0.3141864814320064, "grad_norm": 16.554518168883288, "learning_rate": 6.932207487319305e-07, "logits/chosen": -1.0791057348251343, "logits/rejected": -0.9959053993225098, "logps/chosen": -520.7820434570312, "logps/rejected": -573.4239501953125, "loss": 0.5661, "rewards/accuracies": 0.8125, "rewards/chosen": -2.844965934753418, "rewards/margins": 0.715225338935852, "rewards/rejected": -3.5601911544799805, "step": 147 }, { "epoch": 0.31632380443494523, "grad_norm": 18.440264006008654, "learning_rate": 6.911774928039643e-07, "logits/chosen": -1.1887952089309692, "logits/rejected": -1.1027898788452148, "logps/chosen": -543.8815307617188, "logps/rejected": -748.257080078125, "loss": 0.524, "rewards/accuracies": 0.75, "rewards/chosen": -2.785444498062134, "rewards/margins": 1.9438941478729248, "rewards/rejected": -4.729338645935059, "step": 148 }, { "epoch": 0.3184611274378841, "grad_norm": 15.107186630332485, "learning_rate": 6.891179455309565e-07, "logits/chosen": -1.1168442964553833, "logits/rejected": -1.0895863771438599, "logps/chosen": -551.2578125, "logps/rejected": -583.7507934570312, "loss": 0.4379, "rewards/accuracies": 0.625, "rewards/chosen": -3.0725977420806885, "rewards/margins": 0.5573362112045288, "rewards/rejected": -3.6299338340759277, "step": 149 }, { "epoch": 0.32059845044082286, "grad_norm": 13.129470938801255, "learning_rate": 6.870422221443247e-07, "logits/chosen": -1.241693139076233, "logits/rejected": -1.2060009241104126, "logps/chosen": -577.2138061523438, "logps/rejected": -712.0526123046875, "loss": 0.5511, "rewards/accuracies": 0.75, "rewards/chosen": -3.028244972229004, "rewards/margins": 1.214041829109192, "rewards/rejected": -4.242286682128906, "step": 150 }, { "epoch": 0.3227357734437617, "grad_norm": 17.950223217812194, "learning_rate": 6.849504387805379e-07, "logits/chosen": -0.9408307075500488, "logits/rejected": -1.0360286235809326, "logps/chosen": -588.658447265625, "logps/rejected": -604.2407836914062, "loss": 0.5004, "rewards/accuracies": 0.625, "rewards/chosen": -3.3046035766601562, "rewards/margins": 0.3405230641365051, "rewards/rejected": -3.645127058029175, "step": 151 }, { "epoch": 0.3248730964467005, "grad_norm": 15.115382544003781, "learning_rate": 6.828427124746189e-07, "logits/chosen": -1.0206228494644165, "logits/rejected": -0.9669525027275085, "logps/chosen": -562.2371215820312, "logps/rejected": -633.0101928710938, "loss": 0.5519, "rewards/accuracies": 0.6875, "rewards/chosen": -3.1128549575805664, "rewards/margins": 0.6220429539680481, "rewards/rejected": -3.7348976135253906, "step": 152 }, { "epoch": 0.3270104194496393, "grad_norm": 12.660384376081153, "learning_rate": 6.807191611535966e-07, "logits/chosen": -1.1048712730407715, "logits/rejected": -1.0781325101852417, "logps/chosen": -564.8250732421875, "logps/rejected": -678.5778198242188, "loss": 0.474, "rewards/accuracies": 0.8125, "rewards/chosen": -2.999598264694214, "rewards/margins": 1.3427116870880127, "rewards/rejected": -4.342309474945068, "step": 153 }, { "epoch": 0.32914774245257816, "grad_norm": 15.357120824304575, "learning_rate": 6.785799036299067e-07, "logits/chosen": -1.0824536085128784, "logits/rejected": -1.0418331623077393, "logps/chosen": -486.5535888671875, "logps/rejected": -583.5165405273438, "loss": 0.5399, "rewards/accuracies": 0.8125, "rewards/chosen": -2.476439952850342, "rewards/margins": 0.8414661884307861, "rewards/rejected": -3.317906379699707, "step": 154 }, { "epoch": 0.33128506545551695, "grad_norm": 14.128652205737456, "learning_rate": 6.764250595947458e-07, "logits/chosen": -1.059680700302124, "logits/rejected": -1.131382703781128, "logps/chosen": -486.99127197265625, "logps/rejected": -562.620361328125, "loss": 0.4737, "rewards/accuracies": 0.75, "rewards/chosen": -2.4245104789733887, "rewards/margins": 0.9044227004051208, "rewards/rejected": -3.328933000564575, "step": 155 }, { "epoch": 0.3334223884584558, "grad_norm": 12.201086119166204, "learning_rate": 6.742547496113737e-07, "logits/chosen": -1.0942405462265015, "logits/rejected": -1.1031134128570557, "logps/chosen": -506.2625732421875, "logps/rejected": -582.8713989257812, "loss": 0.5059, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9856767654418945, "rewards/margins": 0.6826351284980774, "rewards/rejected": -3.6683120727539062, "step": 156 }, { "epoch": 0.3355597114613946, "grad_norm": 12.433943055598153, "learning_rate": 6.720690951083677e-07, "logits/chosen": -0.9618053436279297, "logits/rejected": -0.948087215423584, "logps/chosen": -620.4253540039062, "logps/rejected": -752.8404541015625, "loss": 0.4782, "rewards/accuracies": 0.9375, "rewards/chosen": -3.224658489227295, "rewards/margins": 1.47016441822052, "rewards/rejected": -4.694822311401367, "step": 157 }, { "epoch": 0.3376970344643334, "grad_norm": 14.86954063543414, "learning_rate": 6.698682183728298e-07, "logits/chosen": -1.1300894021987915, "logits/rejected": -1.039293646812439, "logps/chosen": -464.7246398925781, "logps/rejected": -609.8098754882812, "loss": 0.4946, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3228816986083984, "rewards/margins": 1.281423807144165, "rewards/rejected": -3.6043052673339844, "step": 158 }, { "epoch": 0.33983435746727225, "grad_norm": 11.83277304769165, "learning_rate": 6.676522425435432e-07, "logits/chosen": -1.131430745124817, "logits/rejected": -1.0974528789520264, "logps/chosen": -469.0117492675781, "logps/rejected": -556.792236328125, "loss": 0.4858, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6056880950927734, "rewards/margins": 0.8633173704147339, "rewards/rejected": -3.4690053462982178, "step": 159 }, { "epoch": 0.34197168047021104, "grad_norm": 11.767019931678893, "learning_rate": 6.654212916040845e-07, "logits/chosen": -1.1052802801132202, "logits/rejected": -1.058582067489624, "logps/chosen": -557.347412109375, "logps/rejected": -626.2900390625, "loss": 0.4836, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7133045196533203, "rewards/margins": 0.6999992728233337, "rewards/rejected": -3.413303852081299, "step": 160 }, { "epoch": 0.3441090034731499, "grad_norm": 10.178256055891817, "learning_rate": 6.63175490375885e-07, "logits/chosen": -1.1756691932678223, "logits/rejected": -1.103055715560913, "logps/chosen": -481.7811584472656, "logps/rejected": -647.7122192382812, "loss": 0.4671, "rewards/accuracies": 0.9375, "rewards/chosen": -2.503532648086548, "rewards/margins": 1.39969003200531, "rewards/rejected": -3.9032225608825684, "step": 161 }, { "epoch": 0.3462463264760887, "grad_norm": 11.090255106567847, "learning_rate": 6.609149645112484e-07, "logits/chosen": -1.0091067552566528, "logits/rejected": -0.9871212840080261, "logps/chosen": -494.93609619140625, "logps/rejected": -592.403564453125, "loss": 0.4872, "rewards/accuracies": 0.6875, "rewards/chosen": -2.7299017906188965, "rewards/margins": 0.8612344861030579, "rewards/rejected": -3.5911364555358887, "step": 162 }, { "epoch": 0.3483836494790275, "grad_norm": 11.3289959517762, "learning_rate": 6.586398404863198e-07, "logits/chosen": -1.0473204851150513, "logits/rejected": -1.0162488222122192, "logps/chosen": -468.8031311035156, "logps/rejected": -638.3246459960938, "loss": 0.4592, "rewards/accuracies": 0.9375, "rewards/chosen": -2.419950485229492, "rewards/margins": 1.7530534267425537, "rewards/rejected": -4.173004150390625, "step": 163 }, { "epoch": 0.35052097248196634, "grad_norm": 12.136113240636318, "learning_rate": 6.563502455940096e-07, "logits/chosen": -0.9164242148399353, "logits/rejected": -0.9073900580406189, "logps/chosen": -547.6764526367188, "logps/rejected": -694.9613037109375, "loss": 0.5092, "rewards/accuracies": 0.9375, "rewards/chosen": -2.788818597793579, "rewards/margins": 1.4330434799194336, "rewards/rejected": -4.221862316131592, "step": 164 }, { "epoch": 0.3526582954849052, "grad_norm": 13.204127745290974, "learning_rate": 6.540463079368714e-07, "logits/chosen": -1.156178593635559, "logits/rejected": -1.2097373008728027, "logps/chosen": -669.5219116210938, "logps/rejected": -734.2588500976562, "loss": 0.4965, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4145455360412598, "rewards/margins": 0.7804650068283081, "rewards/rejected": -4.195010662078857, "step": 165 }, { "epoch": 0.35479561848784397, "grad_norm": 12.906217141464444, "learning_rate": 6.51728156419935e-07, "logits/chosen": -1.1758369207382202, "logits/rejected": -1.1248809099197388, "logps/chosen": -432.7223815917969, "logps/rejected": -522.7244262695312, "loss": 0.5121, "rewards/accuracies": 0.6875, "rewards/chosen": -2.4266903400421143, "rewards/margins": 0.7254734635353088, "rewards/rejected": -3.1521639823913574, "step": 166 }, { "epoch": 0.3569329414907828, "grad_norm": 12.171039393472755, "learning_rate": 6.493959207434934e-07, "logits/chosen": -0.9444956183433533, "logits/rejected": -1.0015169382095337, "logps/chosen": -607.41650390625, "logps/rejected": -728.7015380859375, "loss": 0.4482, "rewards/accuracies": 0.8125, "rewards/chosen": -3.5008718967437744, "rewards/margins": 1.2940125465393066, "rewards/rejected": -4.794884204864502, "step": 167 }, { "epoch": 0.3590702644937216, "grad_norm": 12.676709989313622, "learning_rate": 6.470497313958472e-07, "logits/chosen": -1.0418492555618286, "logits/rejected": -0.9932193160057068, "logps/chosen": -495.92938232421875, "logps/rejected": -625.4636840820312, "loss": 0.5167, "rewards/accuracies": 0.875, "rewards/chosen": -2.6624858379364014, "rewards/margins": 1.1294273138046265, "rewards/rejected": -3.791912794113159, "step": 168 }, { "epoch": 0.36120758749666043, "grad_norm": 12.231651860351334, "learning_rate": 6.446897196460025e-07, "logits/chosen": -1.0079303979873657, "logits/rejected": -0.9732705950737, "logps/chosen": -458.8643493652344, "logps/rejected": -558.783447265625, "loss": 0.4617, "rewards/accuracies": 0.75, "rewards/chosen": -2.6645169258117676, "rewards/margins": 0.922903299331665, "rewards/rejected": -3.5874204635620117, "step": 169 }, { "epoch": 0.36334491049959927, "grad_norm": 13.830086184878045, "learning_rate": 6.42316017536328e-07, "logits/chosen": -1.1729865074157715, "logits/rejected": -1.1441071033477783, "logps/chosen": -493.1350402832031, "logps/rejected": -565.9179077148438, "loss": 0.4823, "rewards/accuracies": 0.8125, "rewards/chosen": -2.748018741607666, "rewards/margins": 0.6935210227966309, "rewards/rejected": -3.4415395259857178, "step": 170 }, { "epoch": 0.36548223350253806, "grad_norm": 13.82995654030433, "learning_rate": 6.399287578751656e-07, "logits/chosen": -1.0457535982131958, "logits/rejected": -0.9317151308059692, "logps/chosen": -545.84375, "logps/rejected": -692.236328125, "loss": 0.4293, "rewards/accuracies": 0.875, "rewards/chosen": -3.1345977783203125, "rewards/margins": 1.223220705986023, "rewards/rejected": -4.357818603515625, "step": 171 }, { "epoch": 0.3676195565054769, "grad_norm": 13.523650685418982, "learning_rate": 6.375280742294006e-07, "logits/chosen": -0.807671070098877, "logits/rejected": -0.8415660858154297, "logps/chosen": -581.8911743164062, "logps/rejected": -669.5233764648438, "loss": 0.5499, "rewards/accuracies": 0.8125, "rewards/chosen": -3.200932264328003, "rewards/margins": 0.9946154356002808, "rewards/rejected": -4.195547580718994, "step": 172 }, { "epoch": 0.36975687950841574, "grad_norm": 12.874802080841212, "learning_rate": 6.351141009169893e-07, "logits/chosen": -1.1410176753997803, "logits/rejected": -1.10181725025177, "logps/chosen": -447.8720703125, "logps/rejected": -564.0751953125, "loss": 0.4725, "rewards/accuracies": 0.875, "rewards/chosen": -2.5582220554351807, "rewards/margins": 1.1303775310516357, "rewards/rejected": -3.6885998249053955, "step": 173 }, { "epoch": 0.3718942025113545, "grad_norm": 12.769700171240247, "learning_rate": 6.326869729994423e-07, "logits/chosen": -1.0553710460662842, "logits/rejected": -1.0439517498016357, "logps/chosen": -494.44775390625, "logps/rejected": -636.4324951171875, "loss": 0.4471, "rewards/accuracies": 0.875, "rewards/chosen": -2.5291152000427246, "rewards/margins": 1.314854383468628, "rewards/rejected": -3.8439698219299316, "step": 174 }, { "epoch": 0.37403152551429336, "grad_norm": 11.212264761027647, "learning_rate": 6.302468262742694e-07, "logits/chosen": -0.950715959072113, "logits/rejected": -0.884972333908081, "logps/chosen": -521.4841918945312, "logps/rejected": -649.075927734375, "loss": 0.4173, "rewards/accuracies": 0.8125, "rewards/chosen": -3.026653289794922, "rewards/margins": 1.0717153549194336, "rewards/rejected": -4.098368167877197, "step": 175 }, { "epoch": 0.37616884851723215, "grad_norm": 11.18847853926289, "learning_rate": 6.277937972673808e-07, "logits/chosen": -1.1451900005340576, "logits/rejected": -1.1354702711105347, "logps/chosen": -544.9923095703125, "logps/rejected": -680.9166259765625, "loss": 0.4452, "rewards/accuracies": 0.75, "rewards/chosen": -3.016019582748413, "rewards/margins": 1.4868509769439697, "rewards/rejected": -4.502870559692383, "step": 176 }, { "epoch": 0.378306171520171, "grad_norm": 12.264246636952317, "learning_rate": 6.253280232254487e-07, "logits/chosen": -1.1703343391418457, "logits/rejected": -1.1669647693634033, "logps/chosen": -511.00799560546875, "logps/rejected": -609.4747314453125, "loss": 0.4253, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0230469703674316, "rewards/margins": 0.9169848561286926, "rewards/rejected": -3.9400315284729004, "step": 177 }, { "epoch": 0.3804434945231098, "grad_norm": 14.813100911794251, "learning_rate": 6.228496421082289e-07, "logits/chosen": -1.015668511390686, "logits/rejected": -1.1096816062927246, "logps/chosen": -636.0406494140625, "logps/rejected": -721.57666015625, "loss": 0.4504, "rewards/accuracies": 0.8125, "rewards/chosen": -3.538027286529541, "rewards/margins": 1.1062912940979004, "rewards/rejected": -4.6443190574646, "step": 178 }, { "epoch": 0.3825808175260486, "grad_norm": 12.280974148700997, "learning_rate": 6.20358792580841e-07, "logits/chosen": -0.9031931757926941, "logits/rejected": -0.9428293704986572, "logps/chosen": -572.744384765625, "logps/rejected": -719.7515869140625, "loss": 0.4117, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4223711490631104, "rewards/margins": 1.4378111362457275, "rewards/rejected": -4.860182285308838, "step": 179 }, { "epoch": 0.38471814052898745, "grad_norm": 11.985112172297303, "learning_rate": 6.178556140060108e-07, "logits/chosen": -0.9790340662002563, "logits/rejected": -1.0062155723571777, "logps/chosen": -667.6234130859375, "logps/rejected": -789.353759765625, "loss": 0.4774, "rewards/accuracies": 0.8125, "rewards/chosen": -4.024725914001465, "rewards/margins": 1.263332724571228, "rewards/rejected": -5.288058757781982, "step": 180 }, { "epoch": 0.38685546353192624, "grad_norm": 15.173768084164333, "learning_rate": 6.153402464362729e-07, "logits/chosen": -1.1106728315353394, "logits/rejected": -1.1724138259887695, "logps/chosen": -580.4508056640625, "logps/rejected": -633.2486572265625, "loss": 0.5298, "rewards/accuracies": 0.6875, "rewards/chosen": -3.401585340499878, "rewards/margins": 0.4253509044647217, "rewards/rejected": -3.8269362449645996, "step": 181 }, { "epoch": 0.3889927865348651, "grad_norm": 14.282404953550225, "learning_rate": 6.128128306061346e-07, "logits/chosen": -1.279030203819275, "logits/rejected": -1.2068696022033691, "logps/chosen": -563.7296142578125, "logps/rejected": -805.1969604492188, "loss": 0.4137, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0469908714294434, "rewards/margins": 2.2822344303131104, "rewards/rejected": -5.329225063323975, "step": 182 }, { "epoch": 0.3911301095378039, "grad_norm": 12.48256287123551, "learning_rate": 6.102735079242018e-07, "logits/chosen": -1.091313123703003, "logits/rejected": -1.0286011695861816, "logps/chosen": -562.3391723632812, "logps/rejected": -746.7239379882812, "loss": 0.4182, "rewards/accuracies": 0.8125, "rewards/chosen": -3.107255697250366, "rewards/margins": 1.946906328201294, "rewards/rejected": -5.054161548614502, "step": 183 }, { "epoch": 0.3932674325407427, "grad_norm": 12.225189138276614, "learning_rate": 6.077224204652676e-07, "logits/chosen": -1.2094743251800537, "logits/rejected": -1.2332773208618164, "logps/chosen": -541.5833740234375, "logps/rejected": -624.4737548828125, "loss": 0.439, "rewards/accuracies": 0.875, "rewards/chosen": -3.0276694297790527, "rewards/margins": 0.8412066698074341, "rewards/rejected": -3.868875741958618, "step": 184 }, { "epoch": 0.39540475554368154, "grad_norm": 14.957361436447231, "learning_rate": 6.051597109623624e-07, "logits/chosen": -1.028442144393921, "logits/rejected": -1.0131793022155762, "logps/chosen": -600.3471069335938, "logps/rejected": -733.6820678710938, "loss": 0.4948, "rewards/accuracies": 0.6875, "rewards/chosen": -3.375563859939575, "rewards/margins": 1.3205593824386597, "rewards/rejected": -4.6961236000061035, "step": 185 }, { "epoch": 0.3975420785466204, "grad_norm": 13.214401719298836, "learning_rate": 6.025855227987691e-07, "logits/chosen": -1.1182265281677246, "logits/rejected": -1.0665013790130615, "logps/chosen": -583.1017456054688, "logps/rejected": -770.8402709960938, "loss": 0.4751, "rewards/accuracies": 0.75, "rewards/chosen": -3.495480537414551, "rewards/margins": 1.8332180976867676, "rewards/rejected": -5.328698635101318, "step": 186 }, { "epoch": 0.39967940154955917, "grad_norm": 13.595163023407325, "learning_rate": 6e-07, "logits/chosen": -1.070106863975525, "logits/rejected": -1.0235223770141602, "logps/chosen": -578.7149047851562, "logps/rejected": -709.0172729492188, "loss": 0.4339, "rewards/accuracies": 0.8125, "rewards/chosen": -3.577277183532715, "rewards/margins": 1.1270668506622314, "rewards/rejected": -4.704343795776367, "step": 187 }, { "epoch": 0.401816724552498, "grad_norm": 16.48310592089836, "learning_rate": 5.974032872257387e-07, "logits/chosen": -1.1554845571517944, "logits/rejected": -1.1422069072723389, "logps/chosen": -742.5419921875, "logps/rejected": -898.2552490234375, "loss": 0.5009, "rewards/accuracies": 0.75, "rewards/chosen": -4.372076988220215, "rewards/margins": 1.6709426641464233, "rewards/rejected": -6.043020248413086, "step": 188 }, { "epoch": 0.4039540475554368, "grad_norm": 14.86949064709501, "learning_rate": 5.947955297617469e-07, "logits/chosen": -1.0644134283065796, "logits/rejected": -1.0610729455947876, "logps/chosen": -604.2228393554688, "logps/rejected": -724.7699584960938, "loss": 0.4967, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4057650566101074, "rewards/margins": 1.34103262424469, "rewards/rejected": -4.746798038482666, "step": 189 }, { "epoch": 0.40609137055837563, "grad_norm": 12.688678020056997, "learning_rate": 5.92176873511735e-07, "logits/chosen": -1.015690803527832, "logits/rejected": -0.9889406561851501, "logps/chosen": -544.1987915039062, "logps/rejected": -630.72705078125, "loss": 0.4642, "rewards/accuracies": 0.6875, "rewards/chosen": -3.410604953765869, "rewards/margins": 0.8672491908073425, "rewards/rejected": -4.277853965759277, "step": 190 }, { "epoch": 0.40822869356131447, "grad_norm": 16.497268225970018, "learning_rate": 5.895474649891994e-07, "logits/chosen": -1.1754518747329712, "logits/rejected": -1.1124107837677002, "logps/chosen": -664.4409790039062, "logps/rejected": -850.3493041992188, "loss": 0.4583, "rewards/accuracies": 0.8125, "rewards/chosen": -3.937302350997925, "rewards/margins": 1.6396393775939941, "rewards/rejected": -5.576941967010498, "step": 191 }, { "epoch": 0.41036601656425326, "grad_norm": 16.209730758284643, "learning_rate": 5.869074513092248e-07, "logits/chosen": -1.1026970148086548, "logits/rejected": -1.0514788627624512, "logps/chosen": -518.1434936523438, "logps/rejected": -543.8851318359375, "loss": 0.5024, "rewards/accuracies": 0.75, "rewards/chosen": -3.1491613388061523, "rewards/margins": 0.30625391006469727, "rewards/rejected": -3.4554154872894287, "step": 192 }, { "epoch": 0.4125033395671921, "grad_norm": 14.339086717238697, "learning_rate": 5.842569801802529e-07, "logits/chosen": -0.9835341572761536, "logits/rejected": -0.9959213137626648, "logps/chosen": -591.5449829101562, "logps/rejected": -795.7198486328125, "loss": 0.4859, "rewards/accuracies": 0.875, "rewards/chosen": -3.573066473007202, "rewards/margins": 1.810349464416504, "rewards/rejected": -5.383416175842285, "step": 193 }, { "epoch": 0.41464066257013094, "grad_norm": 14.717147219609627, "learning_rate": 5.815961998958187e-07, "logits/chosen": -1.2837733030319214, "logits/rejected": -1.2531107664108276, "logps/chosen": -598.62841796875, "logps/rejected": -655.9907836914062, "loss": 0.5017, "rewards/accuracies": 0.5625, "rewards/chosen": -3.0715248584747314, "rewards/margins": 0.7950742244720459, "rewards/rejected": -3.8665990829467773, "step": 194 }, { "epoch": 0.4167779855730697, "grad_norm": 13.661497341383276, "learning_rate": 5.78925259326253e-07, "logits/chosen": -1.2736696004867554, "logits/rejected": -1.2564581632614136, "logps/chosen": -678.5160522460938, "logps/rejected": -836.7633666992188, "loss": 0.4046, "rewards/accuracies": 0.9375, "rewards/chosen": -3.7295868396759033, "rewards/margins": 1.5906658172607422, "rewards/rejected": -5.320252418518066, "step": 195 }, { "epoch": 0.41891530857600856, "grad_norm": 17.358547092004624, "learning_rate": 5.762443079103534e-07, "logits/chosen": -1.1058768033981323, "logits/rejected": -1.1880860328674316, "logps/chosen": -632.157470703125, "logps/rejected": -689.973876953125, "loss": 0.6262, "rewards/accuracies": 0.6875, "rewards/chosen": -3.6461615562438965, "rewards/margins": 0.6898024082183838, "rewards/rejected": -4.335964202880859, "step": 196 }, { "epoch": 0.42105263157894735, "grad_norm": 11.719355560175847, "learning_rate": 5.735534956470232e-07, "logits/chosen": -1.1999759674072266, "logits/rejected": -1.228690505027771, "logps/chosen": -673.6649169921875, "logps/rejected": -844.5328369140625, "loss": 0.4206, "rewards/accuracies": 0.9375, "rewards/chosen": -3.677556037902832, "rewards/margins": 1.6347838640213013, "rewards/rejected": -5.312339782714844, "step": 197 }, { "epoch": 0.4231899545818862, "grad_norm": 10.672786717708536, "learning_rate": 5.708529730868787e-07, "logits/chosen": -1.203027606010437, "logits/rejected": -1.1142572164535522, "logps/chosen": -541.4075317382812, "logps/rejected": -631.228759765625, "loss": 0.4455, "rewards/accuracies": 0.75, "rewards/chosen": -3.15077805519104, "rewards/margins": 0.7978792190551758, "rewards/rejected": -3.9486567974090576, "step": 198 }, { "epoch": 0.425327277584825, "grad_norm": 11.74204286722898, "learning_rate": 5.681428913238262e-07, "logits/chosen": -1.0974667072296143, "logits/rejected": -1.1039204597473145, "logps/chosen": -562.8824462890625, "logps/rejected": -656.3973388671875, "loss": 0.4138, "rewards/accuracies": 0.8125, "rewards/chosen": -3.360422134399414, "rewards/margins": 1.0035967826843262, "rewards/rejected": -4.36401891708374, "step": 199 }, { "epoch": 0.4274646005877638, "grad_norm": 12.842353670192367, "learning_rate": 5.654234019866083e-07, "logits/chosen": -1.1496554613113403, "logits/rejected": -1.1662920713424683, "logps/chosen": -515.6434326171875, "logps/rejected": -629.1980590820312, "loss": 0.5011, "rewards/accuracies": 0.625, "rewards/chosen": -3.022861957550049, "rewards/margins": 0.9866237640380859, "rewards/rejected": -4.009485244750977, "step": 200 }, { "epoch": 0.4274646005877638, "eval_logits/chosen": -1.2976744174957275, "eval_logits/rejected": -1.3147474527359009, "eval_logps/chosen": -629.65478515625, "eval_logps/rejected": -736.8406372070312, "eval_loss": 0.4427746832370758, "eval_rewards/accuracies": 0.7903226017951965, "eval_rewards/chosen": -3.5014870166778564, "eval_rewards/margins": 1.131365180015564, "eval_rewards/rejected": -4.632852554321289, "eval_runtime": 127.7594, "eval_samples_per_second": 15.349, "eval_steps_per_second": 0.485, "step": 200 }, { "epoch": 0.42960192359070265, "grad_norm": 14.704888625045939, "learning_rate": 5.626946572303201e-07, "logits/chosen": -1.217873454093933, "logits/rejected": -1.1787673234939575, "logps/chosen": -640.8902587890625, "logps/rejected": -832.7512817382812, "loss": 0.438, "rewards/accuracies": 0.9375, "rewards/chosen": -3.6388046741485596, "rewards/margins": 2.023084878921509, "rewards/rejected": -5.661889553070068, "step": 201 }, { "epoch": 0.4317392465936415, "grad_norm": 12.551626779621763, "learning_rate": 5.599568097278963e-07, "logits/chosen": -1.1497160196304321, "logits/rejected": -1.150457501411438, "logps/chosen": -672.3145141601562, "logps/rejected": -819.0764770507812, "loss": 0.4017, "rewards/accuracies": 0.75, "rewards/chosen": -3.58868145942688, "rewards/margins": 1.576560616493225, "rewards/rejected": -5.165241718292236, "step": 202 }, { "epoch": 0.4338765695965803, "grad_norm": 13.096747763619007, "learning_rate": 5.572100126615694e-07, "logits/chosen": -1.1709344387054443, "logits/rejected": -1.1770344972610474, "logps/chosen": -610.950439453125, "logps/rejected": -775.8607788085938, "loss": 0.4092, "rewards/accuracies": 0.875, "rewards/chosen": -3.6900205612182617, "rewards/margins": 1.358374834060669, "rewards/rejected": -5.048395156860352, "step": 203 }, { "epoch": 0.4360138925995191, "grad_norm": 16.19055766301351, "learning_rate": 5.544544197142984e-07, "logits/chosen": -1.1622633934020996, "logits/rejected": -1.0976923704147339, "logps/chosen": -718.2122802734375, "logps/rejected": -918.7157592773438, "loss": 0.4207, "rewards/accuracies": 0.875, "rewards/chosen": -4.543436050415039, "rewards/margins": 1.6670641899108887, "rewards/rejected": -6.210500717163086, "step": 204 }, { "epoch": 0.4381512156024579, "grad_norm": 13.688669492121473, "learning_rate": 5.516901850611714e-07, "logits/chosen": -1.1796329021453857, "logits/rejected": -1.1708921194076538, "logps/chosen": -581.9102172851562, "logps/rejected": -728.2744750976562, "loss": 0.4615, "rewards/accuracies": 0.6875, "rewards/chosen": -3.4006991386413574, "rewards/margins": 1.4271738529205322, "rewards/rejected": -4.827873229980469, "step": 205 }, { "epoch": 0.44028853860539674, "grad_norm": 20.309923000737594, "learning_rate": 5.489174633607781e-07, "logits/chosen": -0.9230062961578369, "logits/rejected": -1.0179659128189087, "logps/chosen": -700.2884521484375, "logps/rejected": -722.5196533203125, "loss": 0.5692, "rewards/accuracies": 0.75, "rewards/chosen": -4.400253772735596, "rewards/margins": 0.43511492013931274, "rewards/rejected": -4.835369110107422, "step": 206 }, { "epoch": 0.4424258616083356, "grad_norm": 20.545691501495256, "learning_rate": 5.461364097465581e-07, "logits/chosen": -1.1641998291015625, "logits/rejected": -1.1823948621749878, "logps/chosen": -649.5260009765625, "logps/rejected": -736.3587036132812, "loss": 0.5013, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8688461780548096, "rewards/margins": 0.9938517808914185, "rewards/rejected": -4.862697601318359, "step": 207 }, { "epoch": 0.44456318461127436, "grad_norm": 13.254346937801454, "learning_rate": 5.4334717981812e-07, "logits/chosen": -1.212602972984314, "logits/rejected": -1.293573260307312, "logps/chosen": -660.5288696289062, "logps/rejected": -745.7517700195312, "loss": 0.4214, "rewards/accuracies": 0.6875, "rewards/chosen": -4.027283668518066, "rewards/margins": 0.8199220895767212, "rewards/rejected": -4.8472065925598145, "step": 208 }, { "epoch": 0.4467005076142132, "grad_norm": 13.832436176790107, "learning_rate": 5.405499296325371e-07, "logits/chosen": -1.112457513809204, "logits/rejected": -1.1108170747756958, "logps/chosen": -542.69140625, "logps/rejected": -657.5791015625, "loss": 0.4021, "rewards/accuracies": 0.75, "rewards/chosen": -3.3648681640625, "rewards/margins": 1.0020387172698975, "rewards/rejected": -4.366907119750977, "step": 209 }, { "epoch": 0.448837830617152, "grad_norm": 16.15754914343496, "learning_rate": 5.377448156956139e-07, "logits/chosen": -1.2227798700332642, "logits/rejected": -1.1601148843765259, "logps/chosen": -826.6924438476562, "logps/rejected": -967.8865966796875, "loss": 0.4399, "rewards/accuracies": 0.8125, "rewards/chosen": -5.01950740814209, "rewards/margins": 1.297555923461914, "rewards/rejected": -6.317063331604004, "step": 210 }, { "epoch": 0.45097515362009083, "grad_norm": 14.676898780340569, "learning_rate": 5.34931994953132e-07, "logits/chosen": -1.2065531015396118, "logits/rejected": -1.2652815580368042, "logps/chosen": -682.3113403320312, "logps/rejected": -843.3710327148438, "loss": 0.3938, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9389686584472656, "rewards/margins": 1.772455096244812, "rewards/rejected": -5.711423397064209, "step": 211 }, { "epoch": 0.45311247662302967, "grad_norm": 14.111414144546993, "learning_rate": 5.321116247820669e-07, "logits/chosen": -1.1172921657562256, "logits/rejected": -1.1278425455093384, "logps/chosen": -585.2348022460938, "logps/rejected": -633.3766479492188, "loss": 0.4125, "rewards/accuracies": 0.625, "rewards/chosen": -3.4476540088653564, "rewards/margins": 0.7052488327026367, "rewards/rejected": -4.152902603149414, "step": 212 }, { "epoch": 0.45524979962596845, "grad_norm": 16.54663848569769, "learning_rate": 5.29283862981784e-07, "logits/chosen": -1.1069424152374268, "logits/rejected": -1.0816022157669067, "logps/chosen": -623.0894775390625, "logps/rejected": -707.4627685546875, "loss": 0.5382, "rewards/accuracies": 0.6875, "rewards/chosen": -3.8370444774627686, "rewards/margins": 0.6275334358215332, "rewards/rejected": -4.464578151702881, "step": 213 }, { "epoch": 0.4573871226289073, "grad_norm": 15.2093168956623, "learning_rate": 5.264488677652097e-07, "logits/chosen": -1.2023216485977173, "logits/rejected": -1.1567494869232178, "logps/chosen": -548.3652954101562, "logps/rejected": -656.767578125, "loss": 0.3832, "rewards/accuracies": 0.75, "rewards/chosen": -3.367328643798828, "rewards/margins": 0.951412558555603, "rewards/rejected": -4.318741321563721, "step": 214 }, { "epoch": 0.45952444563184613, "grad_norm": 19.223112891790837, "learning_rate": 5.23606797749979e-07, "logits/chosen": -1.100671410560608, "logits/rejected": -1.0285398960113525, "logps/chosen": -586.305419921875, "logps/rejected": -665.7771606445312, "loss": 0.5119, "rewards/accuracies": 0.625, "rewards/chosen": -3.6576530933380127, "rewards/margins": 0.7183351516723633, "rewards/rejected": -4.375988483428955, "step": 215 }, { "epoch": 0.4616617686347849, "grad_norm": 12.33986287311908, "learning_rate": 5.207578119495608e-07, "logits/chosen": -1.2366998195648193, "logits/rejected": -1.2117341756820679, "logps/chosen": -629.59716796875, "logps/rejected": -807.658935546875, "loss": 0.4152, "rewards/accuracies": 0.9375, "rewards/chosen": -3.4987330436706543, "rewards/margins": 1.8127856254577637, "rewards/rejected": -5.311518669128418, "step": 216 }, { "epoch": 0.46379909163772376, "grad_norm": 12.965958762194157, "learning_rate": 5.179020697643617e-07, "logits/chosen": -1.2369261980056763, "logits/rejected": -1.2110192775726318, "logps/chosen": -524.5565795898438, "logps/rejected": -678.670654296875, "loss": 0.4466, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8585205078125, "rewards/margins": 1.3874884843826294, "rewards/rejected": -4.246009349822998, "step": 217 }, { "epoch": 0.46593641464066254, "grad_norm": 16.584181179977357, "learning_rate": 5.150397309728068e-07, "logits/chosen": -1.3417975902557373, "logits/rejected": -1.3138681650161743, "logps/chosen": -658.9564208984375, "logps/rejected": -744.6614990234375, "loss": 0.5277, "rewards/accuracies": 0.75, "rewards/chosen": -3.620779275894165, "rewards/margins": 0.9568281769752502, "rewards/rejected": -4.577607154846191, "step": 218 }, { "epoch": 0.4680737376436014, "grad_norm": 14.322701228154292, "learning_rate": 5.121709557224011e-07, "logits/chosen": -1.0924996137619019, "logits/rejected": -1.0888468027114868, "logps/chosen": -669.5311279296875, "logps/rejected": -813.560546875, "loss": 0.4291, "rewards/accuracies": 0.875, "rewards/chosen": -3.6334965229034424, "rewards/margins": 1.4997690916061401, "rewards/rejected": -5.133265495300293, "step": 219 }, { "epoch": 0.4702110606465402, "grad_norm": 17.034367007461572, "learning_rate": 5.09295904520768e-07, "logits/chosen": -1.1301909685134888, "logits/rejected": -1.140136480331421, "logps/chosen": -595.55126953125, "logps/rejected": -707.5346069335938, "loss": 0.4488, "rewards/accuracies": 0.8125, "rewards/chosen": -3.697270154953003, "rewards/margins": 1.1222831010818481, "rewards/rejected": -4.819552898406982, "step": 220 }, { "epoch": 0.472348383649479, "grad_norm": 18.144002765573468, "learning_rate": 5.0641473822667e-07, "logits/chosen": -1.1868654489517212, "logits/rejected": -1.1960182189941406, "logps/chosen": -459.4822692871094, "logps/rejected": -537.1638793945312, "loss": 0.395, "rewards/accuracies": 0.75, "rewards/chosen": -2.789658546447754, "rewards/margins": 0.8773064017295837, "rewards/rejected": -3.6669647693634033, "step": 221 }, { "epoch": 0.47448570665241785, "grad_norm": 14.14471098052339, "learning_rate": 5.035276180410083e-07, "logits/chosen": -1.1389260292053223, "logits/rejected": -1.1852952241897583, "logps/chosen": -611.3282470703125, "logps/rejected": -692.434326171875, "loss": 0.426, "rewards/accuracies": 0.75, "rewards/chosen": -3.461778402328491, "rewards/margins": 0.9153075814247131, "rewards/rejected": -4.3770856857299805, "step": 222 }, { "epoch": 0.4766230296553567, "grad_norm": 13.32254209502914, "learning_rate": 5.006347054978034e-07, "logits/chosen": -1.1993439197540283, "logits/rejected": -1.1267905235290527, "logps/chosen": -672.0899658203125, "logps/rejected": -908.1744384765625, "loss": 0.3713, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8041088581085205, "rewards/margins": 2.0054619312286377, "rewards/rejected": -5.809570789337158, "step": 223 }, { "epoch": 0.4787603526582955, "grad_norm": 18.83052351653402, "learning_rate": 4.977361624551576e-07, "logits/chosen": -1.265091896057129, "logits/rejected": -1.2223862409591675, "logps/chosen": -536.5931396484375, "logps/rejected": -670.3464965820312, "loss": 0.4369, "rewards/accuracies": 0.75, "rewards/chosen": -3.259114980697632, "rewards/margins": 1.1988860368728638, "rewards/rejected": -4.458001136779785, "step": 224 }, { "epoch": 0.4808976756612343, "grad_norm": 17.558770227833907, "learning_rate": 4.94832151086199e-07, "logits/chosen": -1.2254117727279663, "logits/rejected": -1.198996663093567, "logps/chosen": -693.514404296875, "logps/rejected": -909.830322265625, "loss": 0.4987, "rewards/accuracies": 0.875, "rewards/chosen": -3.9128432273864746, "rewards/margins": 1.8729009628295898, "rewards/rejected": -5.7857441902160645, "step": 225 }, { "epoch": 0.4830349986641731, "grad_norm": 16.275797205270916, "learning_rate": 4.919228338700078e-07, "logits/chosen": -1.158968210220337, "logits/rejected": -1.1183545589447021, "logps/chosen": -637.254150390625, "logps/rejected": -791.1282958984375, "loss": 0.4417, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6901886463165283, "rewards/margins": 1.393813967704773, "rewards/rejected": -5.084002494812012, "step": 226 }, { "epoch": 0.48517232166711194, "grad_norm": 15.28807242884776, "learning_rate": 4.890083735825257e-07, "logits/chosen": -1.1542084217071533, "logits/rejected": -1.1580474376678467, "logps/chosen": -645.986572265625, "logps/rejected": -748.2597045898438, "loss": 0.4419, "rewards/accuracies": 0.875, "rewards/chosen": -3.941164016723633, "rewards/margins": 1.0907334089279175, "rewards/rejected": -5.03189754486084, "step": 227 }, { "epoch": 0.4873096446700508, "grad_norm": 21.17554543529619, "learning_rate": 4.860889332874488e-07, "logits/chosen": -1.1863421201705933, "logits/rejected": -1.2254602909088135, "logps/chosen": -577.6887817382812, "logps/rejected": -713.466796875, "loss": 0.4874, "rewards/accuracies": 0.6875, "rewards/chosen": -3.2465620040893555, "rewards/margins": 1.4701895713806152, "rewards/rejected": -4.7167510986328125, "step": 228 }, { "epoch": 0.48944696767298956, "grad_norm": 16.327068285344875, "learning_rate": 4.831646763271037e-07, "logits/chosen": -1.1210421323776245, "logits/rejected": -1.0809756517410278, "logps/chosen": -746.8450927734375, "logps/rejected": -985.5284423828125, "loss": 0.368, "rewards/accuracies": 0.875, "rewards/chosen": -4.552522659301758, "rewards/margins": 2.365595579147339, "rewards/rejected": -6.918118000030518, "step": 229 }, { "epoch": 0.4915842906759284, "grad_norm": 17.677696465125493, "learning_rate": 4.802357663133091e-07, "logits/chosen": -1.232605218887329, "logits/rejected": -1.1589381694793701, "logps/chosen": -744.6641235351562, "logps/rejected": -832.0184326171875, "loss": 0.4541, "rewards/accuracies": 0.6875, "rewards/chosen": -4.563028335571289, "rewards/margins": 0.9094908833503723, "rewards/rejected": -5.472519874572754, "step": 230 }, { "epoch": 0.49372161367886724, "grad_norm": 18.66106887116432, "learning_rate": 4.773023671182213e-07, "logits/chosen": -1.2479596138000488, "logits/rejected": -1.1852896213531494, "logps/chosen": -643.7618408203125, "logps/rejected": -795.2108764648438, "loss": 0.428, "rewards/accuracies": 0.875, "rewards/chosen": -3.981501579284668, "rewards/margins": 1.6247977018356323, "rewards/rejected": -5.60629940032959, "step": 231 }, { "epoch": 0.49585893668180603, "grad_norm": 18.591705682457018, "learning_rate": 4.743646428651659e-07, "logits/chosen": -1.1132831573486328, "logits/rejected": -1.1119723320007324, "logps/chosen": -640.3826904296875, "logps/rejected": -761.2876586914062, "loss": 0.4324, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8130741119384766, "rewards/margins": 1.1951515674591064, "rewards/rejected": -5.008225440979004, "step": 232 }, { "epoch": 0.49799625968474487, "grad_norm": 16.36807630859828, "learning_rate": 4.7142275791945465e-07, "logits/chosen": -1.1380739212036133, "logits/rejected": -1.056833028793335, "logps/chosen": -574.717529296875, "logps/rejected": -770.259033203125, "loss": 0.4061, "rewards/accuracies": 0.8125, "rewards/chosen": -3.5353829860687256, "rewards/margins": 1.7000670433044434, "rewards/rejected": -5.23544979095459, "step": 233 }, { "epoch": 0.5001335826876837, "grad_norm": 20.982879510548624, "learning_rate": 4.684768768791901e-07, "logits/chosen": -1.2214264869689941, "logits/rejected": -1.2726728916168213, "logps/chosen": -561.918701171875, "logps/rejected": -653.3917236328125, "loss": 0.5084, "rewards/accuracies": 0.6875, "rewards/chosen": -3.465195655822754, "rewards/margins": 0.9199050664901733, "rewards/rejected": -4.385100841522217, "step": 234 }, { "epoch": 0.5022709056906225, "grad_norm": 15.699299624363588, "learning_rate": 4.6552716456605507e-07, "logits/chosen": -1.0644322633743286, "logits/rejected": -1.093167781829834, "logps/chosen": -685.47021484375, "logps/rejected": -693.9439697265625, "loss": 0.3964, "rewards/accuracies": 0.5625, "rewards/chosen": -4.20876932144165, "rewards/margins": 0.20507556200027466, "rewards/rejected": -4.413845062255859, "step": 235 }, { "epoch": 0.5044082286935613, "grad_norm": 15.274480085299626, "learning_rate": 4.6257378601609237e-07, "logits/chosen": -1.2189481258392334, "logits/rejected": -1.183915376663208, "logps/chosen": -632.9229736328125, "logps/rejected": -755.5510864257812, "loss": 0.4118, "rewards/accuracies": 0.8125, "rewards/chosen": -3.502497434616089, "rewards/margins": 1.3661162853240967, "rewards/rejected": -4.8686137199401855, "step": 236 }, { "epoch": 0.5065455516965002, "grad_norm": 18.902872440282195, "learning_rate": 4.596169064704697e-07, "logits/chosen": -1.134831428527832, "logits/rejected": -1.0569604635238647, "logps/chosen": -603.5774536132812, "logps/rejected": -1207.822998046875, "loss": 0.4843, "rewards/accuracies": 0.625, "rewards/chosen": -3.4753661155700684, "rewards/margins": 0.5529882907867432, "rewards/rejected": -4.028354644775391, "step": 237 }, { "epoch": 0.508682874699439, "grad_norm": 14.821420949979046, "learning_rate": 4.5665669136623563e-07, "logits/chosen": -1.0976612567901611, "logits/rejected": -1.0576744079589844, "logps/chosen": -658.7794189453125, "logps/rejected": -707.6447143554688, "loss": 0.468, "rewards/accuracies": 0.6875, "rewards/chosen": -3.8431081771850586, "rewards/margins": 0.5369940996170044, "rewards/rejected": -4.380102157592773, "step": 238 }, { "epoch": 0.5108201977023777, "grad_norm": 13.617631823910862, "learning_rate": 4.536933063270622e-07, "logits/chosen": -1.2480061054229736, "logits/rejected": -1.1350898742675781, "logps/chosen": -675.2293090820312, "logps/rejected": -976.1577758789062, "loss": 0.3874, "rewards/accuracies": 0.875, "rewards/chosen": -4.117259979248047, "rewards/margins": 2.769744634628296, "rewards/rejected": -6.887004852294922, "step": 239 }, { "epoch": 0.5129575207053166, "grad_norm": 14.571710656068836, "learning_rate": 4.5072691715397937e-07, "logits/chosen": -1.1107938289642334, "logits/rejected": -1.0369246006011963, "logps/chosen": -516.5548095703125, "logps/rejected": -683.7294921875, "loss": 0.4148, "rewards/accuracies": 0.9375, "rewards/chosen": -3.2809367179870605, "rewards/margins": 1.5373725891113281, "rewards/rejected": -4.818309307098389, "step": 240 }, { "epoch": 0.5150948437082554, "grad_norm": 18.45094154637716, "learning_rate": 4.477576898160976e-07, "logits/chosen": -1.1420975923538208, "logits/rejected": -1.1292245388031006, "logps/chosen": -711.1873779296875, "logps/rejected": -837.0042114257812, "loss": 0.3439, "rewards/accuracies": 0.8125, "rewards/chosen": -4.162815570831299, "rewards/margins": 1.0814441442489624, "rewards/rejected": -5.244259834289551, "step": 241 }, { "epoch": 0.5172321667111942, "grad_norm": 16.29215056433998, "learning_rate": 4.447857904413231e-07, "logits/chosen": -1.145432710647583, "logits/rejected": -1.0742875337600708, "logps/chosen": -512.1181030273438, "logps/rejected": -708.3970947265625, "loss": 0.3978, "rewards/accuracies": 0.875, "rewards/chosen": -2.8924291133880615, "rewards/margins": 1.711695909500122, "rewards/rejected": -4.604125022888184, "step": 242 }, { "epoch": 0.5193694897141331, "grad_norm": 16.775923990492764, "learning_rate": 4.418113853070614e-07, "logits/chosen": -1.1763932704925537, "logits/rejected": -1.1778051853179932, "logps/chosen": -601.1099853515625, "logps/rejected": -744.1261596679688, "loss": 0.4368, "rewards/accuracies": 0.75, "rewards/chosen": -3.8077492713928223, "rewards/margins": 1.3345444202423096, "rewards/rejected": -5.1422929763793945, "step": 243 }, { "epoch": 0.5215068127170719, "grad_norm": 15.43010582256506, "learning_rate": 4.388346408309153e-07, "logits/chosen": -1.029261589050293, "logits/rejected": -0.904821515083313, "logps/chosen": -537.7769775390625, "logps/rejected": -740.9544677734375, "loss": 0.4082, "rewards/accuracies": 0.875, "rewards/chosen": -3.179456949234009, "rewards/margins": 1.8058176040649414, "rewards/rejected": -4.985274791717529, "step": 244 }, { "epoch": 0.5236441357200107, "grad_norm": 16.998724541332553, "learning_rate": 4.358557235613734e-07, "logits/chosen": -1.211896300315857, "logits/rejected": -1.17788565158844, "logps/chosen": -609.4110107421875, "logps/rejected": -725.498779296875, "loss": 0.4229, "rewards/accuracies": 0.8125, "rewards/chosen": -3.361295223236084, "rewards/margins": 1.2065562009811401, "rewards/rejected": -4.5678510665893555, "step": 245 }, { "epoch": 0.5257814587229495, "grad_norm": 15.563821028699236, "learning_rate": 4.3287480016849214e-07, "logits/chosen": -1.0585182905197144, "logits/rejected": -1.056843876838684, "logps/chosen": -631.421630859375, "logps/rejected": -787.7451782226562, "loss": 0.4121, "rewards/accuracies": 0.6875, "rewards/chosen": -3.867764472961426, "rewards/margins": 1.3012657165527344, "rewards/rejected": -5.169030666351318, "step": 246 }, { "epoch": 0.5279187817258884, "grad_norm": 15.0812037854319, "learning_rate": 4.298920374345697e-07, "logits/chosen": -1.1332510709762573, "logits/rejected": -1.1722335815429688, "logps/chosen": -611.515625, "logps/rejected": -733.9335327148438, "loss": 0.4044, "rewards/accuracies": 0.75, "rewards/chosen": -3.460312843322754, "rewards/margins": 1.2320371866226196, "rewards/rejected": -4.692350387573242, "step": 247 }, { "epoch": 0.5300561047288271, "grad_norm": 19.373390619498192, "learning_rate": 4.2690760224481585e-07, "logits/chosen": -1.25339674949646, "logits/rejected": -1.2282085418701172, "logps/chosen": -507.8250732421875, "logps/rejected": -594.3472290039062, "loss": 0.5068, "rewards/accuracies": 0.625, "rewards/chosen": -2.9802823066711426, "rewards/margins": 0.9198769330978394, "rewards/rejected": -3.9001595973968506, "step": 248 }, { "epoch": 0.5321934277317659, "grad_norm": 18.365430353743545, "learning_rate": 4.239216615780137e-07, "logits/chosen": -1.0999282598495483, "logits/rejected": -1.1215919256210327, "logps/chosen": -714.6709594726562, "logps/rejected": -876.0394897460938, "loss": 0.4225, "rewards/accuracies": 0.875, "rewards/chosen": -4.1298370361328125, "rewards/margins": 1.5860515832901, "rewards/rejected": -5.715888977050781, "step": 249 }, { "epoch": 0.5343307507347048, "grad_norm": 20.578219385486992, "learning_rate": 4.2093438249717754e-07, "logits/chosen": -1.3405359983444214, "logits/rejected": -1.3141045570373535, "logps/chosen": -621.981201171875, "logps/rejected": -748.8320922851562, "loss": 0.5387, "rewards/accuracies": 0.8125, "rewards/chosen": -3.775174140930176, "rewards/margins": 1.1837245225906372, "rewards/rejected": -4.958899021148682, "step": 250 }, { "epoch": 0.5364680737376436, "grad_norm": 13.688542006328998, "learning_rate": 4.17945932140206e-07, "logits/chosen": -1.0330238342285156, "logits/rejected": -0.9786776900291443, "logps/chosen": -804.1392822265625, "logps/rejected": -984.1054077148438, "loss": 0.3583, "rewards/accuracies": 0.8125, "rewards/chosen": -4.976142406463623, "rewards/margins": 1.77129065990448, "rewards/rejected": -6.747433662414551, "step": 251 }, { "epoch": 0.5386053967405824, "grad_norm": 23.49388537702244, "learning_rate": 4.1495647771053034e-07, "logits/chosen": -0.9389646649360657, "logits/rejected": -1.0423595905303955, "logps/chosen": -592.633544921875, "logps/rejected": -688.3473510742188, "loss": 0.4508, "rewards/accuracies": 0.625, "rewards/chosen": -3.8458104133605957, "rewards/margins": 0.9122557640075684, "rewards/rejected": -4.758066654205322, "step": 252 }, { "epoch": 0.5407427197435213, "grad_norm": 22.3306378383826, "learning_rate": 4.119661864677594e-07, "logits/chosen": -1.3090245723724365, "logits/rejected": -1.293576955795288, "logps/chosen": -752.04296875, "logps/rejected": -861.8148193359375, "loss": 0.4564, "rewards/accuracies": 0.875, "rewards/chosen": -4.430123805999756, "rewards/margins": 1.2242711782455444, "rewards/rejected": -5.65439510345459, "step": 253 }, { "epoch": 0.5428800427464601, "grad_norm": 14.971016637567855, "learning_rate": 4.08975225718322e-07, "logits/chosen": -1.172256350517273, "logits/rejected": -1.1566345691680908, "logps/chosen": -750.8662719726562, "logps/rejected": -982.0726318359375, "loss": 0.4086, "rewards/accuracies": 0.9375, "rewards/chosen": -4.331751823425293, "rewards/margins": 2.4119136333465576, "rewards/rejected": -6.74366569519043, "step": 254 }, { "epoch": 0.5450173657493989, "grad_norm": 16.74852123402871, "learning_rate": 4.0598376280610545e-07, "logits/chosen": -1.1496988534927368, "logits/rejected": -1.1922340393066406, "logps/chosen": -729.1366577148438, "logps/rejected": -863.9114990234375, "loss": 0.4343, "rewards/accuracies": 0.9375, "rewards/chosen": -3.9464735984802246, "rewards/margins": 1.6140131950378418, "rewards/rejected": -5.560486793518066, "step": 255 }, { "epoch": 0.5471546887523377, "grad_norm": 21.036518560558875, "learning_rate": 4.029919651030932e-07, "logits/chosen": -1.2442394495010376, "logits/rejected": -1.224663496017456, "logps/chosen": -604.781005859375, "logps/rejected": -661.07763671875, "loss": 0.4658, "rewards/accuracies": 0.75, "rewards/chosen": -3.6011922359466553, "rewards/margins": 0.4141632914543152, "rewards/rejected": -4.015355587005615, "step": 256 }, { "epoch": 0.5492920117552765, "grad_norm": 21.112582633149223, "learning_rate": 4e-07, "logits/chosen": -1.2898122072219849, "logits/rejected": -1.235668659210205, "logps/chosen": -665.9544677734375, "logps/rejected": -848.035400390625, "loss": 0.4932, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9917383193969727, "rewards/margins": 1.5706775188446045, "rewards/rejected": -5.562415599822998, "step": 257 }, { "epoch": 0.5514293347582153, "grad_norm": 17.267156100092052, "learning_rate": 3.9700803489690677e-07, "logits/chosen": -1.2663730382919312, "logits/rejected": -1.2143464088439941, "logps/chosen": -638.0128173828125, "logps/rejected": -827.65966796875, "loss": 0.3859, "rewards/accuracies": 0.9375, "rewards/chosen": -3.7429115772247314, "rewards/margins": 1.5405060052871704, "rewards/rejected": -5.283417701721191, "step": 258 }, { "epoch": 0.5535666577611541, "grad_norm": 14.88755803006458, "learning_rate": 3.940162371938946e-07, "logits/chosen": -1.1717605590820312, "logits/rejected": -1.1811717748641968, "logps/chosen": -575.023681640625, "logps/rejected": -674.9644165039062, "loss": 0.385, "rewards/accuracies": 0.6875, "rewards/chosen": -3.434030771255493, "rewards/margins": 0.9599059224128723, "rewards/rejected": -4.393936634063721, "step": 259 }, { "epoch": 0.555703980764093, "grad_norm": 15.467774533794707, "learning_rate": 3.91024774281678e-07, "logits/chosen": -1.231236457824707, "logits/rejected": -1.3122860193252563, "logps/chosen": -754.0248413085938, "logps/rejected": -851.9044799804688, "loss": 0.4214, "rewards/accuracies": 0.75, "rewards/chosen": -4.264395713806152, "rewards/margins": 1.528761625289917, "rewards/rejected": -5.79315710067749, "step": 260 }, { "epoch": 0.5578413037670318, "grad_norm": 14.748190228015728, "learning_rate": 3.880338135322407e-07, "logits/chosen": -0.9601567983627319, "logits/rejected": -0.9594780206680298, "logps/chosen": -532.7817993164062, "logps/rejected": -641.3465576171875, "loss": 0.4361, "rewards/accuracies": 0.625, "rewards/chosen": -3.3210856914520264, "rewards/margins": 1.0201408863067627, "rewards/rejected": -4.341227054595947, "step": 261 }, { "epoch": 0.5599786267699706, "grad_norm": 18.112744314127806, "learning_rate": 3.850435222894698e-07, "logits/chosen": -1.0205025672912598, "logits/rejected": -1.062712550163269, "logps/chosen": -724.7273559570312, "logps/rejected": -883.1221923828125, "loss": 0.4313, "rewards/accuracies": 0.8125, "rewards/chosen": -4.406254291534424, "rewards/margins": 1.7283343076705933, "rewards/rejected": -6.134588718414307, "step": 262 }, { "epoch": 0.5621159497729095, "grad_norm": 18.973955414256828, "learning_rate": 3.8205406785979413e-07, "logits/chosen": -1.1698824167251587, "logits/rejected": -1.245084524154663, "logps/chosen": -756.6258544921875, "logps/rejected": -888.836669921875, "loss": 0.4413, "rewards/accuracies": 0.8125, "rewards/chosen": -4.1967387199401855, "rewards/margins": 1.4100241661071777, "rewards/rejected": -5.606762886047363, "step": 263 }, { "epoch": 0.5642532727758482, "grad_norm": 18.24827242372365, "learning_rate": 3.790656175028224e-07, "logits/chosen": -1.0585461854934692, "logits/rejected": -1.0999590158462524, "logps/chosen": -639.7376708984375, "logps/rejected": -804.4436645507812, "loss": 0.4411, "rewards/accuracies": 0.75, "rewards/chosen": -3.9402523040771484, "rewards/margins": 1.6609243154525757, "rewards/rejected": -5.601176738739014, "step": 264 }, { "epoch": 0.566390595778787, "grad_norm": 24.668076315985935, "learning_rate": 3.760783384219863e-07, "logits/chosen": -1.0429140329360962, "logits/rejected": -1.0325287580490112, "logps/chosen": -712.508544921875, "logps/rejected": -817.262451171875, "loss": 0.5696, "rewards/accuracies": 0.625, "rewards/chosen": -4.626794815063477, "rewards/margins": 1.1756452322006226, "rewards/rejected": -5.802440166473389, "step": 265 }, { "epoch": 0.5685279187817259, "grad_norm": 20.420005320382554, "learning_rate": 3.730923977551841e-07, "logits/chosen": -1.0772805213928223, "logits/rejected": -1.1414085626602173, "logps/chosen": -486.30316162109375, "logps/rejected": -534.7408447265625, "loss": 0.4484, "rewards/accuracies": 0.6875, "rewards/chosen": -3.166795492172241, "rewards/margins": 0.49554774165153503, "rewards/rejected": -3.6623430252075195, "step": 266 }, { "epoch": 0.5706652417846647, "grad_norm": 16.033046055991687, "learning_rate": 3.7010796256543027e-07, "logits/chosen": -1.0211303234100342, "logits/rejected": -1.028752088546753, "logps/chosen": -680.7797241210938, "logps/rejected": -796.4379272460938, "loss": 0.4204, "rewards/accuracies": 0.9375, "rewards/chosen": -4.561736583709717, "rewards/margins": 1.135624647140503, "rewards/rejected": -5.697361469268799, "step": 267 }, { "epoch": 0.5728025647876035, "grad_norm": 13.506751879581227, "learning_rate": 3.671251998315079e-07, "logits/chosen": -1.143643856048584, "logits/rejected": -1.1033873558044434, "logps/chosen": -630.0643920898438, "logps/rejected": -847.8607177734375, "loss": 0.3377, "rewards/accuracies": 1.0, "rewards/chosen": -3.909426689147949, "rewards/margins": 2.0678324699401855, "rewards/rejected": -5.977258682250977, "step": 268 }, { "epoch": 0.5749398877905424, "grad_norm": 23.078993663211875, "learning_rate": 3.641442764386266e-07, "logits/chosen": -1.2684578895568848, "logits/rejected": -1.2815937995910645, "logps/chosen": -547.9085693359375, "logps/rejected": -619.15673828125, "loss": 0.5744, "rewards/accuracies": 0.5625, "rewards/chosen": -3.3864290714263916, "rewards/margins": 0.6473042964935303, "rewards/rejected": -4.033733367919922, "step": 269 }, { "epoch": 0.5770772107934812, "grad_norm": 16.148785025450067, "learning_rate": 3.611653591690847e-07, "logits/chosen": -1.0950298309326172, "logits/rejected": -1.1259742975234985, "logps/chosen": -606.2462158203125, "logps/rejected": -697.3421020507812, "loss": 0.4529, "rewards/accuracies": 0.6875, "rewards/chosen": -3.8272175788879395, "rewards/margins": 0.8501311540603638, "rewards/rejected": -4.677348613739014, "step": 270 }, { "epoch": 0.57921453379642, "grad_norm": 14.989894099730115, "learning_rate": 3.5818861469293866e-07, "logits/chosen": -1.1437263488769531, "logits/rejected": -1.1254221200942993, "logps/chosen": -622.5841674804688, "logps/rejected": -733.7385864257812, "loss": 0.3795, "rewards/accuracies": 0.75, "rewards/chosen": -3.9022326469421387, "rewards/margins": 1.0598499774932861, "rewards/rejected": -4.962083339691162, "step": 271 }, { "epoch": 0.5813518567993589, "grad_norm": 16.156205336224875, "learning_rate": 3.5521420955867687e-07, "logits/chosen": -1.0730040073394775, "logits/rejected": -1.0788663625717163, "logps/chosen": -613.4228515625, "logps/rejected": -743.5269165039062, "loss": 0.392, "rewards/accuracies": 0.75, "rewards/chosen": -3.553554058074951, "rewards/margins": 1.3978602886199951, "rewards/rejected": -4.951415061950684, "step": 272 }, { "epoch": 0.5834891798022976, "grad_norm": 16.15777168809771, "learning_rate": 3.5224231018390234e-07, "logits/chosen": -1.2778537273406982, "logits/rejected": -1.1532944440841675, "logps/chosen": -565.4224853515625, "logps/rejected": -756.1430053710938, "loss": 0.4587, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2680768966674805, "rewards/margins": 1.7718642950057983, "rewards/rejected": -5.039941787719727, "step": 273 }, { "epoch": 0.5856265028052364, "grad_norm": 14.465499992590448, "learning_rate": 3.492730828460207e-07, "logits/chosen": -1.1964144706726074, "logits/rejected": -1.0841807126998901, "logps/chosen": -606.995849609375, "logps/rejected": -809.55224609375, "loss": 0.4038, "rewards/accuracies": 0.9375, "rewards/chosen": -3.509310245513916, "rewards/margins": 1.7641215324401855, "rewards/rejected": -5.273431777954102, "step": 274 }, { "epoch": 0.5877638258081752, "grad_norm": 18.62595847705199, "learning_rate": 3.4630669367293793e-07, "logits/chosen": -1.322818398475647, "logits/rejected": -1.2504830360412598, "logps/chosen": -557.3622436523438, "logps/rejected": -669.48974609375, "loss": 0.4694, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3967947959899902, "rewards/margins": 0.9858072996139526, "rewards/rejected": -4.382602214813232, "step": 275 }, { "epoch": 0.5899011488111141, "grad_norm": 14.846250971855795, "learning_rate": 3.4334330863376444e-07, "logits/chosen": -1.0945661067962646, "logits/rejected": -1.0822480916976929, "logps/chosen": -538.973388671875, "logps/rejected": -668.257568359375, "loss": 0.3706, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1467766761779785, "rewards/margins": 1.3210586309432983, "rewards/rejected": -4.467835903167725, "step": 276 }, { "epoch": 0.5920384718140529, "grad_norm": 15.039517764071677, "learning_rate": 3.4038309352953014e-07, "logits/chosen": -1.1040856838226318, "logits/rejected": -1.1212596893310547, "logps/chosen": -417.47808837890625, "logps/rejected": -508.0076599121094, "loss": 0.3713, "rewards/accuracies": 0.6875, "rewards/chosen": -2.398800849914551, "rewards/margins": 0.8262351751327515, "rewards/rejected": -3.225036382675171, "step": 277 }, { "epoch": 0.5941757948169917, "grad_norm": 13.350623045669444, "learning_rate": 3.3742621398390754e-07, "logits/chosen": -1.0822662115097046, "logits/rejected": -1.0757577419281006, "logps/chosen": -532.8475341796875, "logps/rejected": -611.3041381835938, "loss": 0.3868, "rewards/accuracies": 0.625, "rewards/chosen": -3.1749584674835205, "rewards/margins": 0.6507718563079834, "rewards/rejected": -3.8257298469543457, "step": 278 }, { "epoch": 0.5963131178199306, "grad_norm": 15.78335853431774, "learning_rate": 3.344728354339449e-07, "logits/chosen": -1.2665464878082275, "logits/rejected": -1.2659828662872314, "logps/chosen": -554.4942016601562, "logps/rejected": -625.689208984375, "loss": 0.3992, "rewards/accuracies": 0.625, "rewards/chosen": -3.3500773906707764, "rewards/margins": 0.8534866571426392, "rewards/rejected": -4.203563690185547, "step": 279 }, { "epoch": 0.5984504408228694, "grad_norm": 18.33665050808218, "learning_rate": 3.315231231208099e-07, "logits/chosen": -1.3685458898544312, "logits/rejected": -1.233042597770691, "logps/chosen": -728.152099609375, "logps/rejected": -849.6878662109375, "loss": 0.4596, "rewards/accuracies": 0.75, "rewards/chosen": -4.0944743156433105, "rewards/margins": 1.1558749675750732, "rewards/rejected": -5.250349998474121, "step": 280 }, { "epoch": 0.6005877638258081, "grad_norm": 18.245751326222873, "learning_rate": 3.2857724208054536e-07, "logits/chosen": -1.3227458000183105, "logits/rejected": -1.2658473253250122, "logps/chosen": -644.74755859375, "logps/rejected": -807.0831909179688, "loss": 0.4902, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4471018314361572, "rewards/margins": 1.4783265590667725, "rewards/rejected": -4.92542839050293, "step": 281 }, { "epoch": 0.602725086828747, "grad_norm": 15.332405651097067, "learning_rate": 3.256353571348342e-07, "logits/chosen": -1.2925728559494019, "logits/rejected": -1.2592719793319702, "logps/chosen": -593.9208984375, "logps/rejected": -715.414306640625, "loss": 0.4138, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4808356761932373, "rewards/margins": 1.198320984840393, "rewards/rejected": -4.679156303405762, "step": 282 }, { "epoch": 0.6048624098316858, "grad_norm": 20.021176403122144, "learning_rate": 3.2269763288177875e-07, "logits/chosen": -1.150704026222229, "logits/rejected": -1.1762549877166748, "logps/chosen": -602.2947387695312, "logps/rejected": -703.0159912109375, "loss": 0.3747, "rewards/accuracies": 0.875, "rewards/chosen": -3.3265883922576904, "rewards/margins": 1.0499697923660278, "rewards/rejected": -4.376558303833008, "step": 283 }, { "epoch": 0.6069997328346246, "grad_norm": 19.739728018611924, "learning_rate": 3.197642336866909e-07, "logits/chosen": -1.1246461868286133, "logits/rejected": -1.1267664432525635, "logps/chosen": -591.5923461914062, "logps/rejected": -673.4954833984375, "loss": 0.4876, "rewards/accuracies": 0.625, "rewards/chosen": -3.6883199214935303, "rewards/margins": 0.7502920627593994, "rewards/rejected": -4.43861198425293, "step": 284 }, { "epoch": 0.6091370558375635, "grad_norm": 23.876230746394956, "learning_rate": 3.168353236728963e-07, "logits/chosen": -1.2377270460128784, "logits/rejected": -1.242301106452942, "logps/chosen": -601.439697265625, "logps/rejected": -642.472412109375, "loss": 0.5947, "rewards/accuracies": 0.625, "rewards/chosen": -3.569728374481201, "rewards/margins": 0.6284288167953491, "rewards/rejected": -4.19815731048584, "step": 285 }, { "epoch": 0.6112743788405023, "grad_norm": 22.209485552755492, "learning_rate": 3.139110667125512e-07, "logits/chosen": -0.9830377697944641, "logits/rejected": -0.9108029007911682, "logps/chosen": -562.501708984375, "logps/rejected": -705.535400390625, "loss": 0.4883, "rewards/accuracies": 0.75, "rewards/chosen": -3.516263961791992, "rewards/margins": 1.286893367767334, "rewards/rejected": -4.803157806396484, "step": 286 }, { "epoch": 0.6134117018434411, "grad_norm": 19.053259171815892, "learning_rate": 3.1099162641747425e-07, "logits/chosen": -0.9869133830070496, "logits/rejected": -1.0198793411254883, "logps/chosen": -587.336669921875, "logps/rejected": -673.4232788085938, "loss": 0.4925, "rewards/accuracies": 0.6875, "rewards/chosen": -3.481856107711792, "rewards/margins": 0.7810866832733154, "rewards/rejected": -4.262942314147949, "step": 287 }, { "epoch": 0.6155490248463799, "grad_norm": 20.90972346201698, "learning_rate": 3.0807716612999227e-07, "logits/chosen": -1.2177681922912598, "logits/rejected": -1.2541393041610718, "logps/chosen": -608.8695068359375, "logps/rejected": -738.279052734375, "loss": 0.4164, "rewards/accuracies": 0.75, "rewards/chosen": -3.7167322635650635, "rewards/margins": 1.185416340827942, "rewards/rejected": -4.902148723602295, "step": 288 }, { "epoch": 0.6176863478493188, "grad_norm": 17.392772948265563, "learning_rate": 3.05167848913801e-07, "logits/chosen": -1.1927785873413086, "logits/rejected": -1.101136326789856, "logps/chosen": -615.4705810546875, "logps/rejected": -807.6544799804688, "loss": 0.4141, "rewards/accuracies": 0.9375, "rewards/chosen": -3.5115911960601807, "rewards/margins": 1.9201233386993408, "rewards/rejected": -5.4317145347595215, "step": 289 }, { "epoch": 0.6198236708522575, "grad_norm": 39.464356706878, "learning_rate": 3.022638375448423e-07, "logits/chosen": -1.198436975479126, "logits/rejected": -1.1444064378738403, "logps/chosen": -669.1489868164062, "logps/rejected": -752.60009765625, "loss": 0.4269, "rewards/accuracies": 0.625, "rewards/chosen": -4.052773952484131, "rewards/margins": 0.8161685466766357, "rewards/rejected": -4.868942737579346, "step": 290 }, { "epoch": 0.6219609938551963, "grad_norm": 14.841377854168094, "learning_rate": 2.993652945021966e-07, "logits/chosen": -1.2626550197601318, "logits/rejected": -1.2754631042480469, "logps/chosen": -603.2761840820312, "logps/rejected": -811.0333251953125, "loss": 0.3819, "rewards/accuracies": 0.875, "rewards/chosen": -3.527158737182617, "rewards/margins": 1.902860403060913, "rewards/rejected": -5.430018424987793, "step": 291 }, { "epoch": 0.6240983168581352, "grad_norm": 19.276604227233815, "learning_rate": 2.9647238195899166e-07, "logits/chosen": -1.097237467765808, "logits/rejected": -1.037533164024353, "logps/chosen": -494.391845703125, "logps/rejected": -708.1276245117188, "loss": 0.3576, "rewards/accuracies": 0.875, "rewards/chosen": -2.6569738388061523, "rewards/margins": 2.002774238586426, "rewards/rejected": -4.659748077392578, "step": 292 }, { "epoch": 0.626235639861074, "grad_norm": 18.95704234795035, "learning_rate": 2.9358526177332995e-07, "logits/chosen": -1.1904467344284058, "logits/rejected": -1.1917550563812256, "logps/chosen": -709.7570190429688, "logps/rejected": -801.443359375, "loss": 0.4692, "rewards/accuracies": 0.75, "rewards/chosen": -4.291625022888184, "rewards/margins": 0.9095453023910522, "rewards/rejected": -5.201169967651367, "step": 293 }, { "epoch": 0.6283729628640128, "grad_norm": 18.243219081118724, "learning_rate": 2.90704095479232e-07, "logits/chosen": -1.002289056777954, "logits/rejected": -1.0251970291137695, "logps/chosen": -596.549560546875, "logps/rejected": -699.5868530273438, "loss": 0.5095, "rewards/accuracies": 0.6875, "rewards/chosen": -3.7809488773345947, "rewards/margins": 0.9690990447998047, "rewards/rejected": -4.7500481605529785, "step": 294 }, { "epoch": 0.6305102858669517, "grad_norm": 17.459161625382947, "learning_rate": 2.8782904427759893e-07, "logits/chosen": -1.3322062492370605, "logits/rejected": -1.3123010396957397, "logps/chosen": -506.9045715332031, "logps/rejected": -793.8304443359375, "loss": 0.4079, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7689664363861084, "rewards/margins": 2.5288219451904297, "rewards/rejected": -5.297788619995117, "step": 295 }, { "epoch": 0.6326476088698905, "grad_norm": 16.674107473408924, "learning_rate": 2.849602690271931e-07, "logits/chosen": -1.151752233505249, "logits/rejected": -1.1323444843292236, "logps/chosen": -626.8087768554688, "logps/rejected": -798.4351806640625, "loss": 0.3672, "rewards/accuracies": 1.0, "rewards/chosen": -3.600851058959961, "rewards/margins": 1.5587782859802246, "rewards/rejected": -5.1596293449401855, "step": 296 }, { "epoch": 0.6347849318728293, "grad_norm": 17.154415301310323, "learning_rate": 2.820979302356383e-07, "logits/chosen": -1.213531494140625, "logits/rejected": -1.1952483654022217, "logps/chosen": -578.425537109375, "logps/rejected": -722.8228759765625, "loss": 0.3553, "rewards/accuracies": 0.875, "rewards/chosen": -3.1612937450408936, "rewards/margins": 1.449401617050171, "rewards/rejected": -4.6106953620910645, "step": 297 }, { "epoch": 0.6369222548757681, "grad_norm": 18.96014182785911, "learning_rate": 2.792421880504392e-07, "logits/chosen": -1.2606010437011719, "logits/rejected": -1.236906886100769, "logps/chosen": -544.4500122070312, "logps/rejected": -641.5679931640625, "loss": 0.3772, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2772369384765625, "rewards/margins": 0.9185816049575806, "rewards/rejected": -4.195818901062012, "step": 298 }, { "epoch": 0.6390595778787069, "grad_norm": 23.422220984702424, "learning_rate": 2.7639320225002107e-07, "logits/chosen": -1.2836658954620361, "logits/rejected": -1.2603038549423218, "logps/chosen": -521.1630859375, "logps/rejected": -686.9148559570312, "loss": 0.4465, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0996432304382324, "rewards/margins": 1.5820202827453613, "rewards/rejected": -4.681663990020752, "step": 299 }, { "epoch": 0.6411969008816457, "grad_norm": 17.358355183824074, "learning_rate": 2.735511322347904e-07, "logits/chosen": -1.08761727809906, "logits/rejected": -1.0740941762924194, "logps/chosen": -684.2798461914062, "logps/rejected": -888.6907958984375, "loss": 0.3663, "rewards/accuracies": 0.8125, "rewards/chosen": -4.057728290557861, "rewards/margins": 2.032811164855957, "rewards/rejected": -6.090539455413818, "step": 300 }, { "epoch": 0.6411969008816457, "eval_logits/chosen": -1.3138450384140015, "eval_logits/rejected": -1.3326525688171387, "eval_logps/chosen": -668.366943359375, "eval_logps/rejected": -808.5509033203125, "eval_loss": 0.40117064118385315, "eval_rewards/accuracies": 0.8387096524238586, "eval_rewards/chosen": -3.888610601425171, "eval_rewards/margins": 1.4613444805145264, "eval_rewards/rejected": -5.349954605102539, "eval_runtime": 127.6244, "eval_samples_per_second": 15.365, "eval_steps_per_second": 0.486, "step": 300 }, { "epoch": 0.6433342238845846, "grad_norm": 20.781279404805886, "learning_rate": 2.7071613701821607e-07, "logits/chosen": -1.1530544757843018, "logits/rejected": -1.2095261812210083, "logps/chosen": -620.2159423828125, "logps/rejected": -744.0250854492188, "loss": 0.4255, "rewards/accuracies": 0.8125, "rewards/chosen": -3.827709436416626, "rewards/margins": 1.2618365287780762, "rewards/rejected": -5.089546203613281, "step": 301 }, { "epoch": 0.6454715468875234, "grad_norm": 20.821176231506904, "learning_rate": 2.678883752179333e-07, "logits/chosen": -1.2675704956054688, "logits/rejected": -1.3029217720031738, "logps/chosen": -689.5173950195312, "logps/rejected": -833.730712890625, "loss": 0.4655, "rewards/accuracies": 0.75, "rewards/chosen": -4.1408796310424805, "rewards/margins": 1.3784571886062622, "rewards/rejected": -5.519336223602295, "step": 302 }, { "epoch": 0.6476088698904622, "grad_norm": 20.312995975775525, "learning_rate": 2.65068005046868e-07, "logits/chosen": -1.3904988765716553, "logits/rejected": -1.4191157817840576, "logps/chosen": -761.3741455078125, "logps/rejected": -1012.2886962890625, "loss": 0.4047, "rewards/accuracies": 0.6875, "rewards/chosen": -4.49802303314209, "rewards/margins": 2.586608409881592, "rewards/rejected": -7.08463191986084, "step": 303 }, { "epoch": 0.649746192893401, "grad_norm": 18.605444040069756, "learning_rate": 2.622551843043859e-07, "logits/chosen": -1.2163584232330322, "logits/rejected": -1.2040042877197266, "logps/chosen": -582.2776489257812, "logps/rejected": -685.1724853515625, "loss": 0.4572, "rewards/accuracies": 0.6875, "rewards/chosen": -3.6959495544433594, "rewards/margins": 0.9104414582252502, "rewards/rejected": -4.606390953063965, "step": 304 }, { "epoch": 0.6518835158963399, "grad_norm": 16.54860058502452, "learning_rate": 2.5945007036746285e-07, "logits/chosen": -1.2848327159881592, "logits/rejected": -1.2738027572631836, "logps/chosen": -659.4707641601562, "logps/rejected": -836.122802734375, "loss": 0.4086, "rewards/accuracies": 0.75, "rewards/chosen": -3.9217731952667236, "rewards/margins": 1.8700674772262573, "rewards/rejected": -5.791840553283691, "step": 305 }, { "epoch": 0.6540208388992786, "grad_norm": 18.88108514365039, "learning_rate": 2.5665282018187986e-07, "logits/chosen": -1.2504708766937256, "logits/rejected": -1.2493597269058228, "logps/chosen": -616.8862915039062, "logps/rejected": -682.9422607421875, "loss": 0.4277, "rewards/accuracies": 0.75, "rewards/chosen": -3.915069818496704, "rewards/margins": 0.7035645246505737, "rewards/rejected": -4.618634223937988, "step": 306 }, { "epoch": 0.6561581619022174, "grad_norm": 26.06295428477828, "learning_rate": 2.5386359025344197e-07, "logits/chosen": -1.2800426483154297, "logits/rejected": -1.2985005378723145, "logps/chosen": -688.5096435546875, "logps/rejected": -803.2349243164062, "loss": 0.4146, "rewards/accuracies": 0.75, "rewards/chosen": -4.173797130584717, "rewards/margins": 1.3077186346054077, "rewards/rejected": -5.481515884399414, "step": 307 }, { "epoch": 0.6582954849051563, "grad_norm": 14.94353300613803, "learning_rate": 2.510825366392219e-07, "logits/chosen": -1.27797269821167, "logits/rejected": -1.263561725616455, "logps/chosen": -560.7982788085938, "logps/rejected": -704.850341796875, "loss": 0.3413, "rewards/accuracies": 0.875, "rewards/chosen": -3.493176221847534, "rewards/margins": 1.3737064599990845, "rewards/rejected": -4.86688232421875, "step": 308 }, { "epoch": 0.6604328079080951, "grad_norm": 20.334800907510225, "learning_rate": 2.4830981493882865e-07, "logits/chosen": -1.2430211305618286, "logits/rejected": -1.275850534439087, "logps/chosen": -584.9478759765625, "logps/rejected": -739.5841674804688, "loss": 0.464, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6128716468811035, "rewards/margins": 1.7369210720062256, "rewards/rejected": -5.34979248046875, "step": 309 }, { "epoch": 0.6625701309110339, "grad_norm": 17.83873419658186, "learning_rate": 2.455455802857015e-07, "logits/chosen": -1.1412709951400757, "logits/rejected": -1.2062087059020996, "logps/chosen": -765.4309692382812, "logps/rejected": -982.59765625, "loss": 0.3761, "rewards/accuracies": 0.875, "rewards/chosen": -4.830318927764893, "rewards/margins": 1.9539070129394531, "rewards/rejected": -6.784224987030029, "step": 310 }, { "epoch": 0.6647074539139728, "grad_norm": 16.263277573360906, "learning_rate": 2.4278998733843056e-07, "logits/chosen": -1.149855136871338, "logits/rejected": -1.1428004503250122, "logps/chosen": -699.1431274414062, "logps/rejected": -828.4837036132812, "loss": 0.428, "rewards/accuracies": 0.75, "rewards/chosen": -4.370693206787109, "rewards/margins": 1.2527273893356323, "rewards/rejected": -5.623420715332031, "step": 311 }, { "epoch": 0.6668447769169116, "grad_norm": 21.249199073124323, "learning_rate": 2.4004319027210367e-07, "logits/chosen": -1.0749962329864502, "logits/rejected": -1.0791423320770264, "logps/chosen": -673.4476318359375, "logps/rejected": -822.49169921875, "loss": 0.3787, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9068007469177246, "rewards/margins": 1.6602363586425781, "rewards/rejected": -5.567037105560303, "step": 312 }, { "epoch": 0.6689820999198504, "grad_norm": 19.210337957305978, "learning_rate": 2.3730534276967998e-07, "logits/chosen": -1.2630716562271118, "logits/rejected": -1.2258106470108032, "logps/chosen": -718.524658203125, "logps/rejected": -831.8648681640625, "loss": 0.4599, "rewards/accuracies": 0.8125, "rewards/chosen": -4.357652187347412, "rewards/margins": 1.134694218635559, "rewards/rejected": -5.492346286773682, "step": 313 }, { "epoch": 0.6711194229227893, "grad_norm": 19.848471724389256, "learning_rate": 2.345765980133918e-07, "logits/chosen": -1.2012803554534912, "logits/rejected": -1.1929750442504883, "logps/chosen": -779.7426147460938, "logps/rejected": -1021.65625, "loss": 0.4261, "rewards/accuracies": 0.75, "rewards/chosen": -4.710476875305176, "rewards/margins": 2.3127245903015137, "rewards/rejected": -7.023200988769531, "step": 314 }, { "epoch": 0.673256745925728, "grad_norm": 16.96894863528391, "learning_rate": 2.3185710867617386e-07, "logits/chosen": -1.13607656955719, "logits/rejected": -1.1218336820602417, "logps/chosen": -734.45751953125, "logps/rejected": -819.6937866210938, "loss": 0.4107, "rewards/accuracies": 0.875, "rewards/chosen": -4.487407684326172, "rewards/margins": 0.7835354804992676, "rewards/rejected": -5.2709431648254395, "step": 315 }, { "epoch": 0.6753940689286668, "grad_norm": 23.53945549697492, "learning_rate": 2.2914702691312144e-07, "logits/chosen": -1.2212194204330444, "logits/rejected": -1.2105563879013062, "logps/chosen": -656.0445556640625, "logps/rejected": -824.4144287109375, "loss": 0.4747, "rewards/accuracies": 0.875, "rewards/chosen": -3.8364951610565186, "rewards/margins": 1.6471456289291382, "rewards/rejected": -5.483640670776367, "step": 316 }, { "epoch": 0.6775313919316056, "grad_norm": 22.00641078072797, "learning_rate": 2.2644650435297675e-07, "logits/chosen": -0.9918270707130432, "logits/rejected": -1.0709521770477295, "logps/chosen": -635.0587768554688, "logps/rejected": -808.44140625, "loss": 0.4351, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9043281078338623, "rewards/margins": 1.8593040704727173, "rewards/rejected": -5.763632297515869, "step": 317 }, { "epoch": 0.6796687149345445, "grad_norm": 18.51407185653388, "learning_rate": 2.2375569208964645e-07, "logits/chosen": -1.2991502285003662, "logits/rejected": -1.2963578701019287, "logps/chosen": -552.6873779296875, "logps/rejected": -702.3699951171875, "loss": 0.4147, "rewards/accuracies": 0.9375, "rewards/chosen": -3.1100082397460938, "rewards/margins": 1.3412725925445557, "rewards/rejected": -4.45128059387207, "step": 318 }, { "epoch": 0.6818060379374833, "grad_norm": 21.46578371453758, "learning_rate": 2.210747406737469e-07, "logits/chosen": -1.1598680019378662, "logits/rejected": -1.0896382331848145, "logps/chosen": -679.51171875, "logps/rejected": -972.2852783203125, "loss": 0.3608, "rewards/accuracies": 0.9375, "rewards/chosen": -4.203690052032471, "rewards/margins": 2.7421700954437256, "rewards/rejected": -6.945860862731934, "step": 319 }, { "epoch": 0.6839433609404221, "grad_norm": 15.531933254870697, "learning_rate": 2.1840380010418128e-07, "logits/chosen": -1.2312465906143188, "logits/rejected": -1.2830405235290527, "logps/chosen": -637.8733520507812, "logps/rejected": -837.0244140625, "loss": 0.4024, "rewards/accuracies": 0.9375, "rewards/chosen": -3.5754833221435547, "rewards/margins": 1.8828397989273071, "rewards/rejected": -5.458322525024414, "step": 320 }, { "epoch": 0.686080683943361, "grad_norm": 21.09966403146541, "learning_rate": 2.1574301981974715e-07, "logits/chosen": -1.0287922620773315, "logits/rejected": -1.1071122884750366, "logps/chosen": -648.6826171875, "logps/rejected": -769.0919799804688, "loss": 0.4074, "rewards/accuracies": 0.75, "rewards/chosen": -3.858161211013794, "rewards/margins": 1.2575337886810303, "rewards/rejected": -5.115694999694824, "step": 321 }, { "epoch": 0.6882180069462998, "grad_norm": 18.679551175728633, "learning_rate": 2.1309254869077518e-07, "logits/chosen": -1.123620629310608, "logits/rejected": -1.142130732536316, "logps/chosen": -643.7474365234375, "logps/rejected": -805.8037719726562, "loss": 0.3653, "rewards/accuracies": 0.875, "rewards/chosen": -3.7344915866851807, "rewards/margins": 1.7917336225509644, "rewards/rejected": -5.526225566864014, "step": 322 }, { "epoch": 0.6903553299492385, "grad_norm": 20.66758893901287, "learning_rate": 2.1045253501080054e-07, "logits/chosen": -1.1394745111465454, "logits/rejected": -1.1683499813079834, "logps/chosen": -561.175537109375, "logps/rejected": -684.5661010742188, "loss": 0.401, "rewards/accuracies": 0.75, "rewards/chosen": -3.2739458084106445, "rewards/margins": 1.2987473011016846, "rewards/rejected": -4.57269287109375, "step": 323 }, { "epoch": 0.6924926529521774, "grad_norm": 18.77702538784566, "learning_rate": 2.0782312648826505e-07, "logits/chosen": -1.2171223163604736, "logits/rejected": -1.2107038497924805, "logps/chosen": -609.377197265625, "logps/rejected": -789.9899291992188, "loss": 0.3615, "rewards/accuracies": 0.9375, "rewards/chosen": -3.611778736114502, "rewards/margins": 1.7298355102539062, "rewards/rejected": -5.34161376953125, "step": 324 }, { "epoch": 0.6946299759551162, "grad_norm": 17.599881691021295, "learning_rate": 2.0520447023825306e-07, "logits/chosen": -1.1523199081420898, "logits/rejected": -1.1357042789459229, "logps/chosen": -502.8484802246094, "logps/rejected": -595.9307250976562, "loss": 0.3407, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2537741661071777, "rewards/margins": 0.8426163792610168, "rewards/rejected": -4.096390247344971, "step": 325 }, { "epoch": 0.696767298958055, "grad_norm": 15.783181757197172, "learning_rate": 2.0259671277426123e-07, "logits/chosen": -1.0857826471328735, "logits/rejected": -1.0804615020751953, "logps/chosen": -541.2989501953125, "logps/rejected": -728.1956176757812, "loss": 0.3597, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1643340587615967, "rewards/margins": 1.8594626188278198, "rewards/rejected": -5.023797035217285, "step": 326 }, { "epoch": 0.6989046219609939, "grad_norm": 18.584132337966185, "learning_rate": 2.0000000000000007e-07, "logits/chosen": -1.1965287923812866, "logits/rejected": -1.1642347574234009, "logps/chosen": -604.3222045898438, "logps/rejected": -733.7069091796875, "loss": 0.443, "rewards/accuracies": 0.75, "rewards/chosen": -3.5162549018859863, "rewards/margins": 1.2821638584136963, "rewards/rejected": -4.7984185218811035, "step": 327 }, { "epoch": 0.7010419449639327, "grad_norm": 20.799891467119032, "learning_rate": 1.9741447720123095e-07, "logits/chosen": -1.184598684310913, "logits/rejected": -1.1836549043655396, "logps/chosen": -607.91748046875, "logps/rejected": -766.5301513671875, "loss": 0.4253, "rewards/accuracies": 0.75, "rewards/chosen": -3.7808310985565186, "rewards/margins": 1.377992033958435, "rewards/rejected": -5.158823013305664, "step": 328 }, { "epoch": 0.7031792679668715, "grad_norm": 17.876290923345074, "learning_rate": 1.9484028903763754e-07, "logits/chosen": -1.3433024883270264, "logits/rejected": -1.3774957656860352, "logps/chosen": -657.9730224609375, "logps/rejected": -829.743896484375, "loss": 0.4266, "rewards/accuracies": 0.875, "rewards/chosen": -3.6865885257720947, "rewards/margins": 1.7604457139968872, "rewards/rejected": -5.4470343589782715, "step": 329 }, { "epoch": 0.7053165909698104, "grad_norm": 19.62276873807506, "learning_rate": 1.9227757953473233e-07, "logits/chosen": -1.1646440029144287, "logits/rejected": -1.1711232662200928, "logps/chosen": -685.2410888671875, "logps/rejected": -812.1082763671875, "loss": 0.3925, "rewards/accuracies": 0.875, "rewards/chosen": -3.885378360748291, "rewards/margins": 1.3792006969451904, "rewards/rejected": -5.2645792961120605, "step": 330 }, { "epoch": 0.7074539139727491, "grad_norm": 20.713783128170522, "learning_rate": 1.8972649207579807e-07, "logits/chosen": -1.1773666143417358, "logits/rejected": -1.183628797531128, "logps/chosen": -571.4490356445312, "logps/rejected": -727.8609619140625, "loss": 0.3542, "rewards/accuracies": 0.875, "rewards/chosen": -3.21671462059021, "rewards/margins": 1.5842726230621338, "rewards/rejected": -4.800987720489502, "step": 331 }, { "epoch": 0.7095912369756879, "grad_norm": 14.323520360594955, "learning_rate": 1.871871693938654e-07, "logits/chosen": -1.2549258470535278, "logits/rejected": -1.218357801437378, "logps/chosen": -706.3805541992188, "logps/rejected": -884.6824951171875, "loss": 0.3146, "rewards/accuracies": 0.75, "rewards/chosen": -4.322238922119141, "rewards/margins": 1.728010892868042, "rewards/rejected": -6.050250053405762, "step": 332 }, { "epoch": 0.7117285599786267, "grad_norm": 15.481665401011659, "learning_rate": 1.8465975356372713e-07, "logits/chosen": -1.2304145097732544, "logits/rejected": -1.216366171836853, "logps/chosen": -635.7239379882812, "logps/rejected": -711.3768920898438, "loss": 0.4016, "rewards/accuracies": 0.75, "rewards/chosen": -3.948883056640625, "rewards/margins": 0.8189201951026917, "rewards/rejected": -4.767803192138672, "step": 333 }, { "epoch": 0.7138658829815656, "grad_norm": 17.737870919716684, "learning_rate": 1.8214438599398915e-07, "logits/chosen": -1.159693956375122, "logits/rejected": -1.2034831047058105, "logps/chosen": -673.4019775390625, "logps/rejected": -883.9022216796875, "loss": 0.4045, "rewards/accuracies": 0.8125, "rewards/chosen": -4.365835666656494, "rewards/margins": 1.9925627708435059, "rewards/rejected": -6.358397960662842, "step": 334 }, { "epoch": 0.7160032059845044, "grad_norm": 25.705664070692066, "learning_rate": 1.7964120741915902e-07, "logits/chosen": -1.2710121870040894, "logits/rejected": -1.1944791078567505, "logps/chosen": -608.1783447265625, "logps/rejected": -811.2279052734375, "loss": 0.4087, "rewards/accuracies": 0.875, "rewards/chosen": -3.5328927040100098, "rewards/margins": 1.9209730625152588, "rewards/rejected": -5.453866004943848, "step": 335 }, { "epoch": 0.7181405289874432, "grad_norm": 18.1742169248362, "learning_rate": 1.771503578917711e-07, "logits/chosen": -1.1713533401489258, "logits/rejected": -1.240103006362915, "logps/chosen": -669.6802368164062, "logps/rejected": -841.9739990234375, "loss": 0.427, "rewards/accuracies": 0.875, "rewards/chosen": -4.10801362991333, "rewards/margins": 1.8291915655136108, "rewards/rejected": -5.9372053146362305, "step": 336 }, { "epoch": 0.7202778519903821, "grad_norm": 18.465855653993856, "learning_rate": 1.7467197677455116e-07, "logits/chosen": -1.1570916175842285, "logits/rejected": -1.1773279905319214, "logps/chosen": -621.7543334960938, "logps/rejected": -863.5875854492188, "loss": 0.318, "rewards/accuracies": 0.875, "rewards/chosen": -3.662381172180176, "rewards/margins": 2.4044947624206543, "rewards/rejected": -6.066876411437988, "step": 337 }, { "epoch": 0.7224151749933209, "grad_norm": 18.086374212134224, "learning_rate": 1.7220620273261922e-07, "logits/chosen": -1.2305190563201904, "logits/rejected": -1.1802289485931396, "logps/chosen": -574.2893676757812, "logps/rejected": -654.89501953125, "loss": 0.3703, "rewards/accuracies": 0.8125, "rewards/chosen": -3.308969020843506, "rewards/margins": 0.8964172601699829, "rewards/rejected": -4.205386161804199, "step": 338 }, { "epoch": 0.7245524979962596, "grad_norm": 21.686278920021145, "learning_rate": 1.6975317372573065e-07, "logits/chosen": -0.9897362589836121, "logits/rejected": -1.0513758659362793, "logps/chosen": -718.6741943359375, "logps/rejected": -755.384765625, "loss": 0.392, "rewards/accuracies": 0.625, "rewards/chosen": -4.776792526245117, "rewards/margins": 0.4688657820224762, "rewards/rejected": -5.2456583976745605, "step": 339 }, { "epoch": 0.7266898209991985, "grad_norm": 17.6310482182181, "learning_rate": 1.6731302700055777e-07, "logits/chosen": -1.2428513765335083, "logits/rejected": -1.2316035032272339, "logps/chosen": -669.8923950195312, "logps/rejected": -796.8135986328125, "loss": 0.4199, "rewards/accuracies": 0.75, "rewards/chosen": -4.199666976928711, "rewards/margins": 1.1926888227462769, "rewards/rejected": -5.392355442047119, "step": 340 }, { "epoch": 0.7288271440021373, "grad_norm": 15.542714019081663, "learning_rate": 1.6488589908301078e-07, "logits/chosen": -1.090733289718628, "logits/rejected": -1.0847265720367432, "logps/chosen": -755.2330932617188, "logps/rejected": -932.0684204101562, "loss": 0.3997, "rewards/accuracies": 0.8125, "rewards/chosen": -5.0632171630859375, "rewards/margins": 1.7293668985366821, "rewards/rejected": -6.792583465576172, "step": 341 }, { "epoch": 0.7309644670050761, "grad_norm": 21.28545495220758, "learning_rate": 1.6247192577059941e-07, "logits/chosen": -1.232560634613037, "logits/rejected": -1.2369352579116821, "logps/chosen": -646.4221801757812, "logps/rejected": -811.7963256835938, "loss": 0.3937, "rewards/accuracies": 0.625, "rewards/chosen": -4.2266082763671875, "rewards/margins": 1.3389954566955566, "rewards/rejected": -5.565604209899902, "step": 342 }, { "epoch": 0.733101790008015, "grad_norm": 19.319434903860703, "learning_rate": 1.600712421248345e-07, "logits/chosen": -1.1512730121612549, "logits/rejected": -1.1143797636032104, "logps/chosen": -672.3272705078125, "logps/rejected": -789.5181274414062, "loss": 0.4251, "rewards/accuracies": 0.75, "rewards/chosen": -4.444333076477051, "rewards/margins": 1.1563409566879272, "rewards/rejected": -5.600674629211426, "step": 343 }, { "epoch": 0.7352391130109538, "grad_norm": 22.15460617482266, "learning_rate": 1.5768398246367207e-07, "logits/chosen": -1.113842248916626, "logits/rejected": -1.1331862211227417, "logps/chosen": -731.4851684570312, "logps/rejected": -965.7476196289062, "loss": 0.3713, "rewards/accuracies": 0.9375, "rewards/chosen": -4.364319801330566, "rewards/margins": 2.2552897930145264, "rewards/rejected": -6.6196088790893555, "step": 344 }, { "epoch": 0.7373764360138926, "grad_norm": 16.990002139173328, "learning_rate": 1.553102803539974e-07, "logits/chosen": -1.2558772563934326, "logits/rejected": -1.2412264347076416, "logps/chosen": -637.6004028320312, "logps/rejected": -920.36865234375, "loss": 0.3857, "rewards/accuracies": 0.8125, "rewards/chosen": -4.0147528648376465, "rewards/margins": 2.530723810195923, "rewards/rejected": -6.54547643661499, "step": 345 }, { "epoch": 0.7395137590168315, "grad_norm": 23.667341921272293, "learning_rate": 1.529502686041529e-07, "logits/chosen": -1.15803861618042, "logits/rejected": -1.2394546270370483, "logps/chosen": -798.1978759765625, "logps/rejected": -771.4423217773438, "loss": 0.4965, "rewards/accuracies": 0.4375, "rewards/chosen": -5.245590686798096, "rewards/margins": -0.053717248141765594, "rewards/rejected": -5.191873073577881, "step": 346 }, { "epoch": 0.7416510820197703, "grad_norm": 17.49003980577173, "learning_rate": 1.506040792565066e-07, "logits/chosen": -1.2386159896850586, "logits/rejected": -1.2004777193069458, "logps/chosen": -696.9265747070312, "logps/rejected": -820.576171875, "loss": 0.3834, "rewards/accuracies": 0.9375, "rewards/chosen": -3.919370651245117, "rewards/margins": 1.2635618448257446, "rewards/rejected": -5.1829328536987305, "step": 347 }, { "epoch": 0.743788405022709, "grad_norm": 26.239386727495823, "learning_rate": 1.4827184358006508e-07, "logits/chosen": -1.0444031953811646, "logits/rejected": -1.0797080993652344, "logps/chosen": -661.2929077148438, "logps/rejected": -858.2059936523438, "loss": 0.416, "rewards/accuracies": 0.875, "rewards/chosen": -4.284615993499756, "rewards/margins": 1.9141457080841064, "rewards/rejected": -6.198761940002441, "step": 348 }, { "epoch": 0.7459257280256478, "grad_norm": 22.633565158197506, "learning_rate": 1.459536920631285e-07, "logits/chosen": -1.2120310068130493, "logits/rejected": -1.2144323587417603, "logps/chosen": -793.6173095703125, "logps/rejected": -964.0809326171875, "loss": 0.3976, "rewards/accuracies": 0.875, "rewards/chosen": -5.077376842498779, "rewards/margins": 1.8791519403457642, "rewards/rejected": -6.956528663635254, "step": 349 }, { "epoch": 0.7480630510285867, "grad_norm": 21.615429147306983, "learning_rate": 1.4364975440599033e-07, "logits/chosen": -1.2455774545669556, "logits/rejected": -1.2984986305236816, "logps/chosen": -720.0133056640625, "logps/rejected": -948.4733276367188, "loss": 0.412, "rewards/accuracies": 0.8125, "rewards/chosen": -4.872766494750977, "rewards/margins": 2.1328089237213135, "rewards/rejected": -7.005575180053711, "step": 350 }, { "epoch": 0.7502003740315255, "grad_norm": 17.381380445589752, "learning_rate": 1.4136015951368018e-07, "logits/chosen": -1.2067402601242065, "logits/rejected": -1.2139760255813599, "logps/chosen": -676.5394897460938, "logps/rejected": -823.6031494140625, "loss": 0.3261, "rewards/accuracies": 0.9375, "rewards/chosen": -4.276626110076904, "rewards/margins": 1.5800349712371826, "rewards/rejected": -5.856661796569824, "step": 351 }, { "epoch": 0.7523376970344643, "grad_norm": 21.729828693203185, "learning_rate": 1.3908503548875167e-07, "logits/chosen": -1.2517892122268677, "logits/rejected": -1.1772764921188354, "logps/chosen": -711.4560546875, "logps/rejected": -869.8018798828125, "loss": 0.3411, "rewards/accuracies": 0.75, "rewards/chosen": -4.649675369262695, "rewards/margins": 1.567895531654358, "rewards/rejected": -6.21757173538208, "step": 352 }, { "epoch": 0.7544750200374032, "grad_norm": 18.37581288755564, "learning_rate": 1.3682450962411495e-07, "logits/chosen": -1.202965259552002, "logits/rejected": -1.1767388582229614, "logps/chosen": -735.4443359375, "logps/rejected": -899.7939453125, "loss": 0.4046, "rewards/accuracies": 0.875, "rewards/chosen": -4.456427097320557, "rewards/margins": 1.780933141708374, "rewards/rejected": -6.23736047744751, "step": 353 }, { "epoch": 0.756612343040342, "grad_norm": 25.687813258400997, "learning_rate": 1.3457870839591556e-07, "logits/chosen": -1.1969988346099854, "logits/rejected": -1.190890908241272, "logps/chosen": -668.3444213867188, "logps/rejected": -812.265625, "loss": 0.3904, "rewards/accuracies": 0.6875, "rewards/chosen": -4.523505687713623, "rewards/margins": 1.2760380506515503, "rewards/rejected": -5.799543857574463, "step": 354 }, { "epoch": 0.7587496660432808, "grad_norm": 19.70525548630737, "learning_rate": 1.3234775745645682e-07, "logits/chosen": -1.298694133758545, "logits/rejected": -1.2099275588989258, "logps/chosen": -629.9318237304688, "logps/rejected": -793.7557373046875, "loss": 0.3677, "rewards/accuracies": 0.75, "rewards/chosen": -4.134130001068115, "rewards/margins": 1.51906418800354, "rewards/rejected": -5.653193950653076, "step": 355 }, { "epoch": 0.7608869890462197, "grad_norm": 20.97352305735656, "learning_rate": 1.3013178162717026e-07, "logits/chosen": -1.1704061031341553, "logits/rejected": -1.1709686517715454, "logps/chosen": -619.7793579101562, "logps/rejected": -770.8734130859375, "loss": 0.3622, "rewards/accuracies": 0.875, "rewards/chosen": -3.893303632736206, "rewards/margins": 1.5143392086029053, "rewards/rejected": -5.407642841339111, "step": 356 }, { "epoch": 0.7630243120491584, "grad_norm": 19.640911151476274, "learning_rate": 1.2793090489163218e-07, "logits/chosen": -1.1160629987716675, "logits/rejected": -1.2110121250152588, "logps/chosen": -786.0536499023438, "logps/rejected": -1036.768310546875, "loss": 0.4379, "rewards/accuracies": 0.875, "rewards/chosen": -4.929718017578125, "rewards/margins": 2.5798377990722656, "rewards/rejected": -7.509555816650391, "step": 357 }, { "epoch": 0.7651616350520972, "grad_norm": 18.712384438599756, "learning_rate": 1.257452503886263e-07, "logits/chosen": -1.2526037693023682, "logits/rejected": -1.232513666152954, "logps/chosen": -785.4493408203125, "logps/rejected": -969.44140625, "loss": 0.3292, "rewards/accuracies": 0.875, "rewards/chosen": -5.247096538543701, "rewards/margins": 1.8201085329055786, "rewards/rejected": -7.067205429077148, "step": 358 }, { "epoch": 0.7672989580550361, "grad_norm": 22.206627398708065, "learning_rate": 1.2357494040525414e-07, "logits/chosen": -1.0485000610351562, "logits/rejected": -1.0508546829223633, "logps/chosen": -662.2064819335938, "logps/rejected": -915.0498046875, "loss": 0.4551, "rewards/accuracies": 0.8125, "rewards/chosen": -4.379815101623535, "rewards/margins": 2.327033758163452, "rewards/rejected": -6.706849575042725, "step": 359 }, { "epoch": 0.7694362810579749, "grad_norm": 23.59419747338851, "learning_rate": 1.2142009637009333e-07, "logits/chosen": -1.1591897010803223, "logits/rejected": -1.090525507926941, "logps/chosen": -831.9130859375, "logps/rejected": -980.3768920898438, "loss": 0.4204, "rewards/accuracies": 0.875, "rewards/chosen": -5.667198181152344, "rewards/margins": 1.3969204425811768, "rewards/rejected": -7.0641188621521, "step": 360 }, { "epoch": 0.7715736040609137, "grad_norm": 20.325203409090374, "learning_rate": 1.192808388464034e-07, "logits/chosen": -1.1007847785949707, "logits/rejected": -0.9627750515937805, "logps/chosen": -626.9066162109375, "logps/rejected": -841.2676391601562, "loss": 0.3843, "rewards/accuracies": 0.875, "rewards/chosen": -4.2317962646484375, "rewards/margins": 1.9466259479522705, "rewards/rejected": -6.178422451019287, "step": 361 }, { "epoch": 0.7737109270638525, "grad_norm": 18.105397272000737, "learning_rate": 1.17157287525381e-07, "logits/chosen": -1.155585765838623, "logits/rejected": -1.0961203575134277, "logps/chosen": -684.6275024414062, "logps/rejected": -876.870849609375, "loss": 0.3921, "rewards/accuracies": 0.875, "rewards/chosen": -4.383594512939453, "rewards/margins": 1.803823471069336, "rewards/rejected": -6.187417507171631, "step": 362 }, { "epoch": 0.7758482500667914, "grad_norm": 26.052189317294125, "learning_rate": 1.1504956121946214e-07, "logits/chosen": -1.0707626342773438, "logits/rejected": -1.1878434419631958, "logps/chosen": -714.69384765625, "logps/rejected": -863.8623046875, "loss": 0.5443, "rewards/accuracies": 0.8125, "rewards/chosen": -4.7078094482421875, "rewards/margins": 1.6244311332702637, "rewards/rejected": -6.332241058349609, "step": 363 }, { "epoch": 0.7779855730697302, "grad_norm": 19.3332723154738, "learning_rate": 1.1295777785567522e-07, "logits/chosen": -1.2466063499450684, "logits/rejected": -1.3214343786239624, "logps/chosen": -628.520751953125, "logps/rejected": -765.861083984375, "loss": 0.3409, "rewards/accuracies": 0.75, "rewards/chosen": -4.307971477508545, "rewards/margins": 1.5903899669647217, "rewards/rejected": -5.8983612060546875, "step": 364 }, { "epoch": 0.7801228960726689, "grad_norm": 16.65631148968948, "learning_rate": 1.1088205446904342e-07, "logits/chosen": -1.2889045476913452, "logits/rejected": -1.2453293800354004, "logps/chosen": -632.6788940429688, "logps/rejected": -860.2509155273438, "loss": 0.3204, "rewards/accuracies": 0.875, "rewards/chosen": -3.8525776863098145, "rewards/margins": 2.0945329666137695, "rewards/rejected": -5.947110652923584, "step": 365 }, { "epoch": 0.7822602190756078, "grad_norm": 25.617856733947676, "learning_rate": 1.0882250719603567e-07, "logits/chosen": -1.1964340209960938, "logits/rejected": -1.23922598361969, "logps/chosen": -930.8063354492188, "logps/rejected": -1066.240478515625, "loss": 0.4511, "rewards/accuracies": 0.6875, "rewards/chosen": -5.721828937530518, "rewards/margins": 1.5414330959320068, "rewards/rejected": -7.263261795043945, "step": 366 }, { "epoch": 0.7843975420785466, "grad_norm": 24.227338760419844, "learning_rate": 1.0677925126806955e-07, "logits/chosen": -1.169182300567627, "logits/rejected": -1.2425594329833984, "logps/chosen": -638.6636962890625, "logps/rejected": -791.4046020507812, "loss": 0.3979, "rewards/accuracies": 0.8125, "rewards/chosen": -4.070646286010742, "rewards/margins": 1.7463229894638062, "rewards/rejected": -5.816969394683838, "step": 367 }, { "epoch": 0.7865348650814854, "grad_norm": 16.35871784445141, "learning_rate": 1.0475240100506377e-07, "logits/chosen": -1.0509607791900635, "logits/rejected": -1.0016944408416748, "logps/chosen": -731.5370483398438, "logps/rejected": -868.427978515625, "loss": 0.2992, "rewards/accuracies": 0.8125, "rewards/chosen": -4.792571067810059, "rewards/margins": 1.3596065044403076, "rewards/rejected": -6.152177333831787, "step": 368 }, { "epoch": 0.7886721880844243, "grad_norm": 20.50758693822622, "learning_rate": 1.0274206980904226e-07, "logits/chosen": -1.0662883520126343, "logits/rejected": -1.0809623003005981, "logps/chosen": -701.0050048828125, "logps/rejected": -845.4842529296875, "loss": 0.3898, "rewards/accuracies": 0.75, "rewards/chosen": -4.612939834594727, "rewards/margins": 1.4648727178573608, "rewards/rejected": -6.077812194824219, "step": 369 }, { "epoch": 0.7908095110873631, "grad_norm": 21.295312469199402, "learning_rate": 1.0074837015778919e-07, "logits/chosen": -1.1983091831207275, "logits/rejected": -1.2084715366363525, "logps/chosen": -812.4208374023438, "logps/rejected": -999.982177734375, "loss": 0.4268, "rewards/accuracies": 0.8125, "rewards/chosen": -5.570307731628418, "rewards/margins": 1.8499504327774048, "rewards/rejected": -7.420258522033691, "step": 370 }, { "epoch": 0.7929468340903019, "grad_norm": 23.822573821786133, "learning_rate": 9.877141359855565e-08, "logits/chosen": -1.206276774406433, "logits/rejected": -1.2356822490692139, "logps/chosen": -775.7623901367188, "logps/rejected": -1005.5828247070312, "loss": 0.4341, "rewards/accuracies": 0.6875, "rewards/chosen": -4.983241558074951, "rewards/margins": 2.1637260913848877, "rewards/rejected": -7.14696741104126, "step": 371 }, { "epoch": 0.7950841570932408, "grad_norm": 17.234697083614194, "learning_rate": 9.681131074181874e-08, "logits/chosen": -1.3584322929382324, "logits/rejected": -1.299919605255127, "logps/chosen": -613.3756103515625, "logps/rejected": -737.3634033203125, "loss": 0.3485, "rewards/accuracies": 0.75, "rewards/chosen": -3.7099380493164062, "rewards/margins": 1.276181697845459, "rewards/rejected": -4.986119747161865, "step": 372 }, { "epoch": 0.7972214800961795, "grad_norm": 17.48652928766399, "learning_rate": 9.486817125509339e-08, "logits/chosen": -1.1015208959579468, "logits/rejected": -1.0562043190002441, "logps/chosen": -806.1483764648438, "logps/rejected": -1005.850341796875, "loss": 0.3802, "rewards/accuracies": 0.875, "rewards/chosen": -5.184823036193848, "rewards/margins": 1.9502785205841064, "rewards/rejected": -7.135101795196533, "step": 373 }, { "epoch": 0.7993588030991183, "grad_norm": 23.159224310703724, "learning_rate": 9.294210385679555e-08, "logits/chosen": -1.20077645778656, "logits/rejected": -1.168664574623108, "logps/chosen": -732.3262329101562, "logps/rejected": -890.5650634765625, "loss": 0.4008, "rewards/accuracies": 0.75, "rewards/chosen": -4.771186828613281, "rewards/margins": 1.5593047142028809, "rewards/rejected": -6.33049201965332, "step": 374 }, { "epoch": 0.8014961261020572, "grad_norm": 18.102274669415063, "learning_rate": 9.103321631016023e-08, "logits/chosen": -1.157024621963501, "logits/rejected": -1.1546276807785034, "logps/chosen": -793.4083251953125, "logps/rejected": -937.8966674804688, "loss": 0.4136, "rewards/accuracies": 0.75, "rewards/chosen": -4.976907253265381, "rewards/margins": 1.4234803915023804, "rewards/rejected": -6.400387763977051, "step": 375 }, { "epoch": 0.803633449104996, "grad_norm": 22.18700340122903, "learning_rate": 8.914161541721163e-08, "logits/chosen": -1.2537552118301392, "logits/rejected": -1.237257480621338, "logps/chosen": -648.7177124023438, "logps/rejected": -805.5570678710938, "loss": 0.4648, "rewards/accuracies": 0.75, "rewards/chosen": -4.536306381225586, "rewards/margins": 1.4047529697418213, "rewards/rejected": -5.941059112548828, "step": 376 }, { "epoch": 0.8057707721079348, "grad_norm": 17.300986839334925, "learning_rate": 8.726740701278807e-08, "logits/chosen": -1.3532252311706543, "logits/rejected": -1.290065884590149, "logps/chosen": -593.5990600585938, "logps/rejected": -786.1945190429688, "loss": 0.377, "rewards/accuracies": 0.6875, "rewards/chosen": -3.6258466243743896, "rewards/margins": 1.8245813846588135, "rewards/rejected": -5.450427532196045, "step": 377 }, { "epoch": 0.8079080951108736, "grad_norm": 20.165978429403715, "learning_rate": 8.541069595862009e-08, "logits/chosen": -1.2440727949142456, "logits/rejected": -1.14741849899292, "logps/chosen": -717.24951171875, "logps/rejected": -860.7356567382812, "loss": 0.4126, "rewards/accuracies": 0.8125, "rewards/chosen": -4.245814800262451, "rewards/margins": 1.338247537612915, "rewards/rejected": -5.584062576293945, "step": 378 }, { "epoch": 0.8100454181138125, "grad_norm": 16.719380649306434, "learning_rate": 8.35715861374636e-08, "logits/chosen": -1.1465423107147217, "logits/rejected": -1.1313426494598389, "logps/chosen": -563.7252197265625, "logps/rejected": -776.3715209960938, "loss": 0.363, "rewards/accuracies": 1.0, "rewards/chosen": -3.422579288482666, "rewards/margins": 1.9151452779769897, "rewards/rejected": -5.337724208831787, "step": 379 }, { "epoch": 0.8121827411167513, "grad_norm": 21.079445701366463, "learning_rate": 8.175018044728762e-08, "logits/chosen": -1.2566273212432861, "logits/rejected": -1.2379119396209717, "logps/chosen": -588.2570190429688, "logps/rejected": -726.7328491210938, "loss": 0.4063, "rewards/accuracies": 0.9375, "rewards/chosen": -3.6017940044403076, "rewards/margins": 1.2587138414382935, "rewards/rejected": -4.860507965087891, "step": 380 }, { "epoch": 0.81432006411969, "grad_norm": 15.615292244997176, "learning_rate": 7.994658079551752e-08, "logits/chosen": -0.9509527683258057, "logits/rejected": -0.928947925567627, "logps/chosen": -749.95556640625, "logps/rejected": -985.079345703125, "loss": 0.3506, "rewards/accuracies": 0.8125, "rewards/chosen": -4.5622477531433105, "rewards/margins": 2.3395752906799316, "rewards/rejected": -6.901823043823242, "step": 381 }, { "epoch": 0.8164573871226289, "grad_norm": 21.72824821202356, "learning_rate": 7.816088809333265e-08, "logits/chosen": -1.112214207649231, "logits/rejected": -1.0684149265289307, "logps/chosen": -764.0916137695312, "logps/rejected": -841.7941284179688, "loss": 0.4301, "rewards/accuracies": 0.8125, "rewards/chosen": -4.703119277954102, "rewards/margins": 0.840286374092102, "rewards/rejected": -5.543406009674072, "step": 382 }, { "epoch": 0.8185947101255677, "grad_norm": 17.6012054275616, "learning_rate": 7.639320225002106e-08, "logits/chosen": -1.123322606086731, "logits/rejected": -1.0792958736419678, "logps/chosen": -656.8797607421875, "logps/rejected": -841.3156127929688, "loss": 0.4329, "rewards/accuracies": 0.9375, "rewards/chosen": -4.335733890533447, "rewards/margins": 1.6224541664123535, "rewards/rejected": -5.958188056945801, "step": 383 }, { "epoch": 0.8207320331285065, "grad_norm": 20.681275510903472, "learning_rate": 7.464362216738882e-08, "logits/chosen": -1.1937505006790161, "logits/rejected": -1.172591209411621, "logps/chosen": -640.5665283203125, "logps/rejected": -749.8336181640625, "loss": 0.4055, "rewards/accuracies": 0.75, "rewards/chosen": -4.004388809204102, "rewards/margins": 1.1090167760849, "rewards/rejected": -5.113406181335449, "step": 384 }, { "epoch": 0.8228693561314454, "grad_norm": 20.783011075052453, "learning_rate": 7.291224573422746e-08, "logits/chosen": -1.0488053560256958, "logits/rejected": -1.0833758115768433, "logps/chosen": -651.447998046875, "logps/rejected": -798.273681640625, "loss": 0.3371, "rewards/accuracies": 0.875, "rewards/chosen": -4.223069190979004, "rewards/margins": 1.4724674224853516, "rewards/rejected": -5.695535659790039, "step": 385 }, { "epoch": 0.8250066791343842, "grad_norm": 19.706489967433907, "learning_rate": 7.119916982083606e-08, "logits/chosen": -1.1488205194473267, "logits/rejected": -1.1003917455673218, "logps/chosen": -594.006591796875, "logps/rejected": -760.5867309570312, "loss": 0.3778, "rewards/accuracies": 0.9375, "rewards/chosen": -3.641071319580078, "rewards/margins": 1.4457651376724243, "rewards/rejected": -5.086835861206055, "step": 386 }, { "epoch": 0.827144002137323, "grad_norm": 21.521620560904775, "learning_rate": 6.950449027360213e-08, "logits/chosen": -1.1488525867462158, "logits/rejected": -1.1080577373504639, "logps/chosen": -554.5031127929688, "logps/rejected": -725.948974609375, "loss": 0.3668, "rewards/accuracies": 1.0, "rewards/chosen": -3.534379005432129, "rewards/margins": 1.5944782495498657, "rewards/rejected": -5.128857135772705, "step": 387 }, { "epoch": 0.8292813251402619, "grad_norm": 18.836097701043112, "learning_rate": 6.782830190963849e-08, "logits/chosen": -1.0808215141296387, "logits/rejected": -1.1298860311508179, "logps/chosen": -559.855224609375, "logps/rejected": -634.6371459960938, "loss": 0.4358, "rewards/accuracies": 0.75, "rewards/chosen": -3.5141537189483643, "rewards/margins": 0.8853537440299988, "rewards/rejected": -4.399507522583008, "step": 388 }, { "epoch": 0.8314186481432007, "grad_norm": 17.005948701459808, "learning_rate": 6.617069851147893e-08, "logits/chosen": -1.1958553791046143, "logits/rejected": -1.1517484188079834, "logps/chosen": -775.6495971679688, "logps/rejected": -891.900390625, "loss": 0.3866, "rewards/accuracies": 0.625, "rewards/chosen": -4.379716873168945, "rewards/margins": 1.2400314807891846, "rewards/rejected": -5.619749069213867, "step": 389 }, { "epoch": 0.8335559711461394, "grad_norm": 18.782389894036523, "learning_rate": 6.453177282183043e-08, "logits/chosen": -1.0847554206848145, "logits/rejected": -1.1218299865722656, "logps/chosen": -553.1158447265625, "logps/rejected": -640.9627075195312, "loss": 0.395, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2679800987243652, "rewards/margins": 1.0876916646957397, "rewards/rejected": -4.3556718826293945, "step": 390 }, { "epoch": 0.8356932941490782, "grad_norm": 20.020475926403833, "learning_rate": 6.291161653838433e-08, "logits/chosen": -1.1652432680130005, "logits/rejected": -1.1427745819091797, "logps/chosen": -619.0631713867188, "logps/rejected": -852.297607421875, "loss": 0.347, "rewards/accuracies": 0.8125, "rewards/chosen": -3.763627052307129, "rewards/margins": 2.3782565593719482, "rewards/rejected": -6.141883850097656, "step": 391 }, { "epoch": 0.8378306171520171, "grad_norm": 17.938841226698447, "learning_rate": 6.131032030868634e-08, "logits/chosen": -1.3010265827178955, "logits/rejected": -1.2798787355422974, "logps/chosen": -648.106201171875, "logps/rejected": -922.129638671875, "loss": 0.3221, "rewards/accuracies": 0.75, "rewards/chosen": -3.8143765926361084, "rewards/margins": 2.5941810607910156, "rewards/rejected": -6.408557415008545, "step": 392 }, { "epoch": 0.8399679401549559, "grad_norm": 18.079175693306084, "learning_rate": 5.972797372506421e-08, "logits/chosen": -1.3528358936309814, "logits/rejected": -1.3354519605636597, "logps/chosen": -690.3780517578125, "logps/rejected": -912.6945190429688, "loss": 0.3283, "rewards/accuracies": 0.875, "rewards/chosen": -3.9751172065734863, "rewards/margins": 1.9700416326522827, "rewards/rejected": -5.945158958435059, "step": 393 }, { "epoch": 0.8421052631578947, "grad_norm": 16.39904375972069, "learning_rate": 5.8164665319615416e-08, "logits/chosen": -1.3244935274124146, "logits/rejected": -1.250441074371338, "logps/chosen": -579.123779296875, "logps/rejected": -744.0313720703125, "loss": 0.3762, "rewards/accuracies": 0.9375, "rewards/chosen": -3.270462989807129, "rewards/margins": 1.4447557926177979, "rewards/rejected": -4.715218544006348, "step": 394 }, { "epoch": 0.8442425861608336, "grad_norm": 21.900567022185207, "learning_rate": 5.662048255925356e-08, "logits/chosen": -1.1036981344223022, "logits/rejected": -1.073646068572998, "logps/chosen": -705.447509765625, "logps/rejected": -859.1611938476562, "loss": 0.4035, "rewards/accuracies": 0.6875, "rewards/chosen": -4.186145782470703, "rewards/margins": 1.4756170511245728, "rewards/rejected": -5.661762714385986, "step": 395 }, { "epoch": 0.8463799091637724, "grad_norm": 24.440062318949717, "learning_rate": 5.5095511840814957e-08, "logits/chosen": -1.2547982931137085, "logits/rejected": -1.2235480546951294, "logps/chosen": -632.1664428710938, "logps/rejected": -789.1854858398438, "loss": 0.4916, "rewards/accuracies": 0.8125, "rewards/chosen": -3.523899555206299, "rewards/margins": 1.5776091814041138, "rewards/rejected": -5.101509094238281, "step": 396 }, { "epoch": 0.8485172321667112, "grad_norm": 17.709481773760306, "learning_rate": 5.358983848622451e-08, "logits/chosen": -1.1620187759399414, "logits/rejected": -1.1197460889816284, "logps/chosen": -677.14697265625, "logps/rejected": -884.9602661132812, "loss": 0.3381, "rewards/accuracies": 0.875, "rewards/chosen": -4.283136367797852, "rewards/margins": 2.155815601348877, "rewards/rejected": -6.43895149230957, "step": 397 }, { "epoch": 0.85065455516965, "grad_norm": 21.95309780747416, "learning_rate": 5.210354673772186e-08, "logits/chosen": -1.1960110664367676, "logits/rejected": -1.1809446811676025, "logps/chosen": -591.931640625, "logps/rejected": -729.0454711914062, "loss": 0.3959, "rewards/accuracies": 0.75, "rewards/chosen": -3.7348904609680176, "rewards/margins": 1.1620283126831055, "rewards/rejected": -4.896918296813965, "step": 398 }, { "epoch": 0.8527918781725888, "grad_norm": 18.231544991301615, "learning_rate": 5.063671975314814e-08, "logits/chosen": -1.1651999950408936, "logits/rejected": -1.1458827257156372, "logps/chosen": -544.5778198242188, "logps/rejected": -645.9469604492188, "loss": 0.4343, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4847066402435303, "rewards/margins": 0.870603084564209, "rewards/rejected": -4.355309963226318, "step": 399 }, { "epoch": 0.8549292011755276, "grad_norm": 16.85803441290228, "learning_rate": 4.918943960129338e-08, "logits/chosen": -1.356669306755066, "logits/rejected": -1.3354345560073853, "logps/chosen": -568.3810424804688, "logps/rejected": -700.6995239257812, "loss": 0.3856, "rewards/accuracies": 0.875, "rewards/chosen": -3.211237907409668, "rewards/margins": 1.343750238418579, "rewards/rejected": -4.554987907409668, "step": 400 }, { "epoch": 0.8549292011755276, "eval_logits/chosen": -1.3413100242614746, "eval_logits/rejected": -1.3605502843856812, "eval_logps/chosen": -735.5136108398438, "eval_logps/rejected": -898.4371337890625, "eval_loss": 0.38407519459724426, "eval_rewards/accuracies": 0.8588709831237793, "eval_rewards/chosen": -4.560075759887695, "eval_rewards/margins": 1.6887413263320923, "eval_rewards/rejected": -6.2488179206848145, "eval_runtime": 127.6843, "eval_samples_per_second": 15.358, "eval_steps_per_second": 0.486, "step": 400 }, { "epoch": 0.8570665241784665, "grad_norm": 19.09808858732997, "learning_rate": 4.7761787257304796e-08, "logits/chosen": -1.1470953226089478, "logits/rejected": -1.167830467224121, "logps/chosen": -727.25341796875, "logps/rejected": -1019.1134643554688, "loss": 0.3153, "rewards/accuracies": 1.0, "rewards/chosen": -4.293584823608398, "rewards/margins": 2.9339499473571777, "rewards/rejected": -7.227534294128418, "step": 401 }, { "epoch": 0.8592038471814053, "grad_norm": 20.031183507985155, "learning_rate": 4.6353842598156134e-08, "logits/chosen": -1.2042540311813354, "logits/rejected": -1.2306770086288452, "logps/chosen": -832.0037841796875, "logps/rejected": -949.708251953125, "loss": 0.4025, "rewards/accuracies": 0.75, "rewards/chosen": -5.143182277679443, "rewards/margins": 1.2332004308700562, "rewards/rejected": -6.376382827758789, "step": 402 }, { "epoch": 0.8613411701843441, "grad_norm": 20.27761411702487, "learning_rate": 4.496568439817836e-08, "logits/chosen": -1.0823955535888672, "logits/rejected": -1.0055339336395264, "logps/chosen": -647.8933715820312, "logps/rejected": -846.7052001953125, "loss": 0.4049, "rewards/accuracies": 0.9375, "rewards/chosen": -4.089991092681885, "rewards/margins": 1.9131741523742676, "rewards/rejected": -6.003165245056152, "step": 403 }, { "epoch": 0.863478493187283, "grad_norm": 20.06260268066064, "learning_rate": 4.359739032465288e-08, "logits/chosen": -1.238147258758545, "logits/rejected": -1.2503734827041626, "logps/chosen": -628.3074340820312, "logps/rejected": -690.2806396484375, "loss": 0.4391, "rewards/accuracies": 0.6875, "rewards/chosen": -3.6560134887695312, "rewards/margins": 0.6559101343154907, "rewards/rejected": -4.311923503875732, "step": 404 }, { "epoch": 0.8656158161902218, "grad_norm": 24.04539917692329, "learning_rate": 4.224903693346533e-08, "logits/chosen": -1.1764211654663086, "logits/rejected": -1.1727336645126343, "logps/chosen": -837.36767578125, "logps/rejected": -985.0379638671875, "loss": 0.4175, "rewards/accuracies": 0.75, "rewards/chosen": -5.274961471557617, "rewards/margins": 1.683783769607544, "rewards/rejected": -6.95874547958374, "step": 405 }, { "epoch": 0.8677531391931605, "grad_norm": 16.471866635220206, "learning_rate": 4.092069966482272e-08, "logits/chosen": -1.0595924854278564, "logits/rejected": -1.1007946729660034, "logps/chosen": -696.4332275390625, "logps/rejected": -854.4127197265625, "loss": 0.3502, "rewards/accuracies": 0.75, "rewards/chosen": -4.544949531555176, "rewards/margins": 1.6854453086853027, "rewards/rejected": -6.23039436340332, "step": 406 }, { "epoch": 0.8698904621960993, "grad_norm": 18.880730560681034, "learning_rate": 3.9612452839032386e-08, "logits/chosen": -1.1478521823883057, "logits/rejected": -1.1282038688659668, "logps/chosen": -612.4228515625, "logps/rejected": -871.350341796875, "loss": 0.3887, "rewards/accuracies": 0.9375, "rewards/chosen": -4.034952163696289, "rewards/margins": 2.548003911972046, "rewards/rejected": -6.582956314086914, "step": 407 }, { "epoch": 0.8720277851990382, "grad_norm": 24.18209624562828, "learning_rate": 3.8324369652343954e-08, "logits/chosen": -1.2262625694274902, "logits/rejected": -1.1821433305740356, "logps/chosen": -582.0433959960938, "logps/rejected": -733.720458984375, "loss": 0.4053, "rewards/accuracies": 0.75, "rewards/chosen": -3.865800142288208, "rewards/margins": 1.6720855236053467, "rewards/rejected": -5.537886142730713, "step": 408 }, { "epoch": 0.874165108201977, "grad_norm": 18.382867885499415, "learning_rate": 3.7056522172854045e-08, "logits/chosen": -1.2061108350753784, "logits/rejected": -1.2035906314849854, "logps/chosen": -787.7473754882812, "logps/rejected": -904.2359008789062, "loss": 0.3439, "rewards/accuracies": 0.625, "rewards/chosen": -5.003305435180664, "rewards/margins": 1.1130136251449585, "rewards/rejected": -6.116319179534912, "step": 409 }, { "epoch": 0.8763024312049158, "grad_norm": 23.170208730820974, "learning_rate": 3.580898133647365e-08, "logits/chosen": -1.1570967435836792, "logits/rejected": -1.1725834608078003, "logps/chosen": -491.8000793457031, "logps/rejected": -648.202880859375, "loss": 0.438, "rewards/accuracies": 0.875, "rewards/chosen": -3.1505000591278076, "rewards/margins": 1.3966213464736938, "rewards/rejected": -4.547121524810791, "step": 410 }, { "epoch": 0.8784397542078547, "grad_norm": 13.414426080076618, "learning_rate": 3.4581816942959606e-08, "logits/chosen": -1.2170116901397705, "logits/rejected": -1.1329779624938965, "logps/chosen": -799.8605346679688, "logps/rejected": -1069.72412109375, "loss": 0.3279, "rewards/accuracies": 1.0, "rewards/chosen": -4.736441612243652, "rewards/margins": 2.4817426204681396, "rewards/rejected": -7.218183994293213, "step": 411 }, { "epoch": 0.8805770772107935, "grad_norm": 25.164272834377723, "learning_rate": 3.337509765200952e-08, "logits/chosen": -1.263645887374878, "logits/rejected": -1.2011065483093262, "logps/chosen": -733.5821533203125, "logps/rejected": -922.4378662109375, "loss": 0.4522, "rewards/accuracies": 0.75, "rewards/chosen": -4.364438533782959, "rewards/margins": 1.6717685461044312, "rewards/rejected": -6.0362067222595215, "step": 412 }, { "epoch": 0.8827144002137323, "grad_norm": 21.972539663439804, "learning_rate": 3.218889097941977e-08, "logits/chosen": -1.1184589862823486, "logits/rejected": -1.0591604709625244, "logps/chosen": -735.7826538085938, "logps/rejected": -919.8989868164062, "loss": 0.4028, "rewards/accuracies": 0.75, "rewards/chosen": -4.92265510559082, "rewards/margins": 1.8032639026641846, "rewards/rejected": -6.725918769836426, "step": 413 }, { "epoch": 0.8848517232166712, "grad_norm": 19.5992524976479, "learning_rate": 3.102326329330833e-08, "logits/chosen": -1.180734395980835, "logits/rejected": -1.1732540130615234, "logps/chosen": -568.0210571289062, "logps/rejected": -707.9583740234375, "loss": 0.3788, "rewards/accuracies": 0.625, "rewards/chosen": -3.657954454421997, "rewards/margins": 1.3774219751358032, "rewards/rejected": -5.03537654876709, "step": 414 }, { "epoch": 0.88698904621961, "grad_norm": 19.912191558357517, "learning_rate": 2.987827981040132e-08, "logits/chosen": -1.0525703430175781, "logits/rejected": -1.0452289581298828, "logps/chosen": -732.814453125, "logps/rejected": -772.8712158203125, "loss": 0.3975, "rewards/accuracies": 0.625, "rewards/chosen": -4.660562515258789, "rewards/margins": 0.4499680995941162, "rewards/rejected": -5.110530853271484, "step": 415 }, { "epoch": 0.8891263692225487, "grad_norm": 26.727536724022972, "learning_rate": 2.875400459238446e-08, "logits/chosen": -1.2559760808944702, "logits/rejected": -1.2618238925933838, "logps/chosen": -765.4171142578125, "logps/rejected": -1030.8968505859375, "loss": 0.453, "rewards/accuracies": 0.8125, "rewards/chosen": -4.885884761810303, "rewards/margins": 2.478846549987793, "rewards/rejected": -7.364731788635254, "step": 416 }, { "epoch": 0.8912636922254876, "grad_norm": 22.005485483200033, "learning_rate": 2.7650500542318344e-08, "logits/chosen": -1.2461367845535278, "logits/rejected": -1.2258386611938477, "logps/chosen": -687.5975952148438, "logps/rejected": -837.7894287109375, "loss": 0.3724, "rewards/accuracies": 0.75, "rewards/chosen": -4.603135108947754, "rewards/margins": 1.5509103536605835, "rewards/rejected": -6.154045104980469, "step": 417 }, { "epoch": 0.8934010152284264, "grad_norm": 20.686911824626822, "learning_rate": 2.6567829401119346e-08, "logits/chosen": -1.2208616733551025, "logits/rejected": -1.208647608757019, "logps/chosen": -694.425537109375, "logps/rejected": -811.75927734375, "loss": 0.3835, "rewards/accuracies": 0.8125, "rewards/chosen": -4.29609489440918, "rewards/margins": 1.2039227485656738, "rewards/rejected": -5.500017166137695, "step": 418 }, { "epoch": 0.8955383382313652, "grad_norm": 21.443800944172835, "learning_rate": 2.5506051744105117e-08, "logits/chosen": -1.1272634267807007, "logits/rejected": -1.1442179679870605, "logps/chosen": -639.4913330078125, "logps/rejected": -763.7130737304688, "loss": 0.4385, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8574483394622803, "rewards/margins": 1.4068204164505005, "rewards/rejected": -5.26426887512207, "step": 419 }, { "epoch": 0.897675661234304, "grad_norm": 25.09735847833568, "learning_rate": 2.4465226977605558e-08, "logits/chosen": -1.192622423171997, "logits/rejected": -1.1234114170074463, "logps/chosen": -763.729248046875, "logps/rejected": -975.58935546875, "loss": 0.4638, "rewards/accuracies": 0.6875, "rewards/chosen": -4.789132118225098, "rewards/margins": 1.976550579071045, "rewards/rejected": -6.765683174133301, "step": 420 }, { "epoch": 0.8998129842372429, "grad_norm": 19.301280601090053, "learning_rate": 2.3445413335638763e-08, "logits/chosen": -1.3491016626358032, "logits/rejected": -1.3572145700454712, "logps/chosen": -705.7742919921875, "logps/rejected": -887.14013671875, "loss": 0.3918, "rewards/accuracies": 0.75, "rewards/chosen": -4.3721818923950195, "rewards/margins": 1.478896975517273, "rewards/rejected": -5.851078510284424, "step": 421 }, { "epoch": 0.9019503072401817, "grad_norm": 23.607839241733036, "learning_rate": 2.244666787665297e-08, "logits/chosen": -1.2374908924102783, "logits/rejected": -1.2936606407165527, "logps/chosen": -756.2825927734375, "logps/rejected": -903.6665649414062, "loss": 0.4241, "rewards/accuracies": 0.875, "rewards/chosen": -4.921727180480957, "rewards/margins": 1.6834334135055542, "rewards/rejected": -6.605160713195801, "step": 422 }, { "epoch": 0.9040876302431204, "grad_norm": 16.860532940493467, "learning_rate": 2.1469046480334296e-08, "logits/chosen": -1.228078842163086, "logits/rejected": -1.1509662866592407, "logps/chosen": -617.8320922851562, "logps/rejected": -804.2755737304688, "loss": 0.3241, "rewards/accuracies": 0.9375, "rewards/chosen": -4.25843620300293, "rewards/margins": 1.6883177757263184, "rewards/rejected": -5.946753978729248, "step": 423 }, { "epoch": 0.9062249532460593, "grad_norm": 19.717442168166148, "learning_rate": 2.05126038444801e-08, "logits/chosen": -1.047011137008667, "logits/rejected": -1.0633037090301514, "logps/chosen": -587.2730712890625, "logps/rejected": -760.9138793945312, "loss": 0.411, "rewards/accuracies": 0.9375, "rewards/chosen": -3.5641701221466064, "rewards/margins": 1.6567331552505493, "rewards/rejected": -5.220903396606445, "step": 424 }, { "epoch": 0.9083622762489981, "grad_norm": 19.401646167891172, "learning_rate": 1.9577393481938586e-08, "logits/chosen": -1.1300030946731567, "logits/rejected": -1.1926459074020386, "logps/chosen": -668.7816772460938, "logps/rejected": -773.44677734375, "loss": 0.371, "rewards/accuracies": 0.75, "rewards/chosen": -3.893263578414917, "rewards/margins": 1.1991028785705566, "rewards/rejected": -5.0923662185668945, "step": 425 }, { "epoch": 0.9104995992519369, "grad_norm": 19.48240593398746, "learning_rate": 1.8663467717614955e-08, "logits/chosen": -1.2391318082809448, "logits/rejected": -1.218917727470398, "logps/chosen": -715.0926513671875, "logps/rejected": -927.725830078125, "loss": 0.337, "rewards/accuracies": 0.8125, "rewards/chosen": -4.81939697265625, "rewards/margins": 2.132520914077759, "rewards/rejected": -6.95191764831543, "step": 426 }, { "epoch": 0.9126369222548758, "grad_norm": 18.414154188749755, "learning_rate": 1.7770877685543685e-08, "logits/chosen": -0.9984086155891418, "logits/rejected": -0.9543211460113525, "logps/chosen": -630.6248168945312, "logps/rejected": -776.2060546875, "loss": 0.3846, "rewards/accuracies": 0.8125, "rewards/chosen": -4.3436994552612305, "rewards/margins": 1.370991587638855, "rewards/rejected": -5.714690685272217, "step": 427 }, { "epoch": 0.9147742452578146, "grad_norm": 28.47516460996172, "learning_rate": 1.689967332602782e-08, "logits/chosen": -1.2434695959091187, "logits/rejected": -1.2322406768798828, "logps/chosen": -648.59375, "logps/rejected": -814.1904907226562, "loss": 0.4201, "rewards/accuracies": 0.75, "rewards/chosen": -4.107604026794434, "rewards/margins": 1.6489994525909424, "rewards/rejected": -5.756603717803955, "step": 428 }, { "epoch": 0.9169115682607534, "grad_norm": 20.787778312181622, "learning_rate": 1.604990338284442e-08, "logits/chosen": -1.2705280780792236, "logits/rejected": -1.1843757629394531, "logps/chosen": -623.3156127929688, "logps/rejected": -877.883544921875, "loss": 0.3917, "rewards/accuracies": 0.875, "rewards/chosen": -4.032170295715332, "rewards/margins": 2.4661622047424316, "rewards/rejected": -6.498332500457764, "step": 429 }, { "epoch": 0.9190488912636923, "grad_norm": 23.557666816240015, "learning_rate": 1.522161540051763e-08, "logits/chosen": -1.318623423576355, "logits/rejected": -1.2060602903366089, "logps/chosen": -841.9174194335938, "logps/rejected": -1067.8670654296875, "loss": 0.3666, "rewards/accuracies": 0.75, "rewards/chosen": -5.210690975189209, "rewards/margins": 2.1968445777893066, "rewards/rejected": -7.407536029815674, "step": 430 }, { "epoch": 0.921186214266631, "grad_norm": 19.13785898561301, "learning_rate": 1.4414855721658703e-08, "logits/chosen": -1.153996467590332, "logits/rejected": -1.1101250648498535, "logps/chosen": -573.8594970703125, "logps/rejected": -784.9390869140625, "loss": 0.3734, "rewards/accuracies": 0.875, "rewards/chosen": -3.4458227157592773, "rewards/margins": 2.165468692779541, "rewards/rejected": -5.611291408538818, "step": 431 }, { "epoch": 0.9233235372695698, "grad_norm": 24.984469469230163, "learning_rate": 1.362966948437272e-08, "logits/chosen": -1.161950707435608, "logits/rejected": -1.1117969751358032, "logps/chosen": -742.8123779296875, "logps/rejected": -1053.392578125, "loss": 0.4537, "rewards/accuracies": 0.9375, "rewards/chosen": -4.41086483001709, "rewards/margins": 2.8615894317626953, "rewards/rejected": -7.272454261779785, "step": 432 }, { "epoch": 0.9254608602725087, "grad_norm": 17.14897188522608, "learning_rate": 1.2866100619733434e-08, "logits/chosen": -1.1124742031097412, "logits/rejected": -1.0683679580688477, "logps/chosen": -692.58056640625, "logps/rejected": -865.1836547851562, "loss": 0.4003, "rewards/accuracies": 0.8125, "rewards/chosen": -4.52462911605835, "rewards/margins": 1.5883965492248535, "rewards/rejected": -6.113025665283203, "step": 433 }, { "epoch": 0.9275981832754475, "grad_norm": 18.976385617816387, "learning_rate": 1.212419184932525e-08, "logits/chosen": -1.283325433731079, "logits/rejected": -1.2627527713775635, "logps/chosen": -848.5104370117188, "logps/rejected": -1140.6717529296875, "loss": 0.323, "rewards/accuracies": 0.9375, "rewards/chosen": -5.1217360496521, "rewards/margins": 2.514495611190796, "rewards/rejected": -7.636231899261475, "step": 434 }, { "epoch": 0.9297355062783863, "grad_norm": 18.525625944510605, "learning_rate": 1.1403984682852997e-08, "logits/chosen": -1.1664470434188843, "logits/rejected": -1.1477528810501099, "logps/chosen": -637.484619140625, "logps/rejected": -859.3595581054688, "loss": 0.3552, "rewards/accuracies": 0.875, "rewards/chosen": -4.328897953033447, "rewards/margins": 1.8881185054779053, "rewards/rejected": -6.217016220092773, "step": 435 }, { "epoch": 0.9318728292813251, "grad_norm": 23.166677968022782, "learning_rate": 1.0705519415819431e-08, "logits/chosen": -1.2317030429840088, "logits/rejected": -1.1913528442382812, "logps/chosen": -722.13330078125, "logps/rejected": -883.4630126953125, "loss": 0.385, "rewards/accuracies": 0.875, "rewards/chosen": -4.459592819213867, "rewards/margins": 1.4444059133529663, "rewards/rejected": -5.903999328613281, "step": 436 }, { "epoch": 0.934010152284264, "grad_norm": 29.360763259322653, "learning_rate": 1.0028835127270552e-08, "logits/chosen": -1.041462779045105, "logits/rejected": -1.1673197746276855, "logps/chosen": -612.159912109375, "logps/rejected": -679.0078125, "loss": 0.5247, "rewards/accuracies": 0.5625, "rewards/chosen": -3.783984422683716, "rewards/margins": 0.7272745370864868, "rewards/rejected": -4.511259078979492, "step": 437 }, { "epoch": 0.9361474752872028, "grad_norm": 19.605084291231115, "learning_rate": 9.373969677609483e-09, "logits/chosen": -1.3655058145523071, "logits/rejected": -1.294546127319336, "logps/chosen": -698.7198486328125, "logps/rejected": -940.9436645507812, "loss": 0.427, "rewards/accuracies": 0.875, "rewards/chosen": -4.112486839294434, "rewards/margins": 2.187129497528076, "rewards/rejected": -6.299617290496826, "step": 438 }, { "epoch": 0.9382847982901416, "grad_norm": 22.063411073782135, "learning_rate": 8.740959706477725e-09, "logits/chosen": -1.077052354812622, "logits/rejected": -1.151231050491333, "logps/chosen": -696.8163452148438, "logps/rejected": -933.444091796875, "loss": 0.4013, "rewards/accuracies": 0.6875, "rewards/chosen": -4.566803455352783, "rewards/margins": 2.404057025909424, "rewards/rejected": -6.970860481262207, "step": 439 }, { "epoch": 0.9404221212930804, "grad_norm": 25.49031368931362, "learning_rate": 8.129840630705809e-09, "logits/chosen": -1.328292727470398, "logits/rejected": -1.3197101354599, "logps/chosen": -745.0568237304688, "logps/rejected": -901.1380615234375, "loss": 0.4558, "rewards/accuracies": 0.75, "rewards/chosen": -4.428404808044434, "rewards/margins": 1.5418949127197266, "rewards/rejected": -5.970300674438477, "step": 440 }, { "epoch": 0.9425594442960192, "grad_norm": 19.90218785765348, "learning_rate": 7.540646642330984e-09, "logits/chosen": -1.204488754272461, "logits/rejected": -1.0937587022781372, "logps/chosen": -753.1057739257812, "logps/rejected": -1030.1650390625, "loss": 0.4377, "rewards/accuracies": 0.875, "rewards/chosen": -4.8104987144470215, "rewards/margins": 2.6143453121185303, "rewards/rejected": -7.424844741821289, "step": 441 }, { "epoch": 0.944696767298958, "grad_norm": 17.97300546078058, "learning_rate": 6.9734107066846906e-09, "logits/chosen": -1.2257647514343262, "logits/rejected": -1.260422945022583, "logps/chosen": -596.8286743164062, "logps/rejected": -672.286376953125, "loss": 0.3596, "rewards/accuracies": 0.6875, "rewards/chosen": -3.532057523727417, "rewards/margins": 0.8501084446907043, "rewards/rejected": -4.382165908813477, "step": 442 }, { "epoch": 0.9468340903018969, "grad_norm": 17.202047962190797, "learning_rate": 6.428164560548133e-09, "logits/chosen": -1.1206276416778564, "logits/rejected": -1.157448172569275, "logps/chosen": -758.1364135742188, "logps/rejected": -882.7911376953125, "loss": 0.3883, "rewards/accuracies": 0.875, "rewards/chosen": -4.609854698181152, "rewards/margins": 1.4294791221618652, "rewards/rejected": -6.039333820343018, "step": 443 }, { "epoch": 0.9489714133048357, "grad_norm": 23.28751615124104, "learning_rate": 5.9049387103763175e-09, "logits/chosen": -1.0286593437194824, "logits/rejected": -1.0789514780044556, "logps/chosen": -584.1970825195312, "logps/rejected": -675.3173828125, "loss": 0.4016, "rewards/accuracies": 0.75, "rewards/chosen": -3.8274412155151367, "rewards/margins": 1.028598427772522, "rewards/rejected": -4.856039047241211, "step": 444 }, { "epoch": 0.9511087363077745, "grad_norm": 21.061524434011588, "learning_rate": 5.403762430591463e-09, "logits/chosen": -1.3204259872436523, "logits/rejected": -1.2254197597503662, "logps/chosen": -608.9866943359375, "logps/rejected": -819.9737548828125, "loss": 0.3407, "rewards/accuracies": 0.875, "rewards/chosen": -3.7939043045043945, "rewards/margins": 1.8012745380401611, "rewards/rejected": -5.595179080963135, "step": 445 }, { "epoch": 0.9532460593107134, "grad_norm": 24.59026604382506, "learning_rate": 4.924663761944936e-09, "logits/chosen": -1.1305930614471436, "logits/rejected": -1.1460232734680176, "logps/chosen": -762.6463623046875, "logps/rejected": -925.34326171875, "loss": 0.3586, "rewards/accuracies": 0.9375, "rewards/chosen": -4.521890163421631, "rewards/margins": 1.8425782918930054, "rewards/rejected": -6.364468574523926, "step": 446 }, { "epoch": 0.9553833823136522, "grad_norm": 20.723204696998593, "learning_rate": 4.467669509948591e-09, "logits/chosen": -1.22001314163208, "logits/rejected": -1.2475132942199707, "logps/chosen": -567.6207275390625, "logps/rejected": -686.4295043945312, "loss": 0.363, "rewards/accuracies": 0.8125, "rewards/chosen": -3.614631175994873, "rewards/margins": 1.1865804195404053, "rewards/rejected": -4.801211833953857, "step": 447 }, { "epoch": 0.957520705316591, "grad_norm": 17.735725565956283, "learning_rate": 4.032805243374815e-09, "logits/chosen": -1.0866748094558716, "logits/rejected": -1.0737552642822266, "logps/chosen": -707.5775146484375, "logps/rejected": -885.0850830078125, "loss": 0.3426, "rewards/accuracies": 0.875, "rewards/chosen": -4.333130836486816, "rewards/margins": 1.6789565086364746, "rewards/rejected": -6.012087345123291, "step": 448 }, { "epoch": 0.9596580283195298, "grad_norm": 17.3793434143283, "learning_rate": 3.62009529282612e-09, "logits/chosen": -1.2599151134490967, "logits/rejected": -1.234681487083435, "logps/chosen": -582.177978515625, "logps/rejected": -735.9927978515625, "loss": 0.4105, "rewards/accuracies": 0.875, "rewards/chosen": -3.6418380737304688, "rewards/margins": 1.4252443313598633, "rewards/rejected": -5.06708288192749, "step": 449 }, { "epoch": 0.9617953513224686, "grad_norm": 24.52461330654904, "learning_rate": 3.2295627493735155e-09, "logits/chosen": -1.2312730550765991, "logits/rejected": -1.1974250078201294, "logps/chosen": -790.2763671875, "logps/rejected": -950.1840209960938, "loss": 0.4455, "rewards/accuracies": 0.875, "rewards/chosen": -4.5439605712890625, "rewards/margins": 1.6306798458099365, "rewards/rejected": -6.174641132354736, "step": 450 }, { "epoch": 0.9639326743254074, "grad_norm": 22.5202862560615, "learning_rate": 2.8612294632650578e-09, "logits/chosen": -1.2211391925811768, "logits/rejected": -1.226765751838684, "logps/chosen": -697.6963500976562, "logps/rejected": -789.454833984375, "loss": 0.4469, "rewards/accuracies": 0.5625, "rewards/chosen": -4.249999046325684, "rewards/margins": 0.9787368774414062, "rewards/rejected": -5.22873592376709, "step": 451 }, { "epoch": 0.9660699973283462, "grad_norm": 21.041642949979845, "learning_rate": 2.5151160427029583e-09, "logits/chosen": -1.341503381729126, "logits/rejected": -1.2913732528686523, "logps/chosen": -760.9803466796875, "logps/rejected": -895.4586181640625, "loss": 0.4508, "rewards/accuracies": 0.75, "rewards/chosen": -4.8633880615234375, "rewards/margins": 1.360411524772644, "rewards/rejected": -6.223799228668213, "step": 452 }, { "epoch": 0.9682073203312851, "grad_norm": 24.476456599792517, "learning_rate": 2.191241852690684e-09, "logits/chosen": -1.2209526300430298, "logits/rejected": -1.3040820360183716, "logps/chosen": -729.9022216796875, "logps/rejected": -835.0201416015625, "loss": 0.4371, "rewards/accuracies": 0.6875, "rewards/chosen": -4.68671989440918, "rewards/margins": 1.158172369003296, "rewards/rejected": -5.8448920249938965, "step": 453 }, { "epoch": 0.9703446433342239, "grad_norm": 17.467256417005025, "learning_rate": 1.8896250139494252e-09, "logits/chosen": -1.133383870124817, "logits/rejected": -1.1557976007461548, "logps/chosen": -641.5121459960938, "logps/rejected": -720.6392211914062, "loss": 0.3563, "rewards/accuracies": 0.6875, "rewards/chosen": -4.076579570770264, "rewards/margins": 0.7415927648544312, "rewards/rejected": -4.818172454833984, "step": 454 }, { "epoch": 0.9724819663371627, "grad_norm": 29.06168949637391, "learning_rate": 1.6102824019043725e-09, "logits/chosen": -1.3402838706970215, "logits/rejected": -1.2556098699569702, "logps/chosen": -714.4627685546875, "logps/rejected": -948.2529907226562, "loss": 0.4449, "rewards/accuracies": 0.875, "rewards/chosen": -4.323197364807129, "rewards/margins": 2.1209869384765625, "rewards/rejected": -6.444183826446533, "step": 455 }, { "epoch": 0.9746192893401016, "grad_norm": 24.172794750012795, "learning_rate": 1.3532296457404946e-09, "logits/chosen": -1.0889981985092163, "logits/rejected": -1.0795581340789795, "logps/chosen": -892.805419921875, "logps/rejected": -1116.1790771484375, "loss": 0.4206, "rewards/accuracies": 0.9375, "rewards/chosen": -5.471851825714111, "rewards/margins": 2.0194764137268066, "rewards/rejected": -7.491328239440918, "step": 456 }, { "epoch": 0.9767566123430403, "grad_norm": 23.800223087589952, "learning_rate": 1.1184811275279481e-09, "logits/chosen": -1.1701947450637817, "logits/rejected": -1.1236315965652466, "logps/chosen": -692.0287475585938, "logps/rejected": -1029.572509765625, "loss": 0.3649, "rewards/accuracies": 1.0, "rewards/chosen": -4.509798526763916, "rewards/margins": 3.4749834537506104, "rewards/rejected": -7.9847822189331055, "step": 457 }, { "epoch": 0.9788939353459791, "grad_norm": 16.492942853907678, "learning_rate": 9.06049981417567e-10, "logits/chosen": -1.2626256942749023, "logits/rejected": -1.289964199066162, "logps/chosen": -730.7406616210938, "logps/rejected": -926.16015625, "loss": 0.4024, "rewards/accuracies": 0.8125, "rewards/chosen": -4.51789665222168, "rewards/margins": 2.0349254608154297, "rewards/rejected": -6.552822113037109, "step": 458 }, { "epoch": 0.981031258348918, "grad_norm": 23.671232760661084, "learning_rate": 7.159480929059381e-10, "logits/chosen": -1.2181737422943115, "logits/rejected": -1.1888567209243774, "logps/chosen": -670.6812744140625, "logps/rejected": -897.4075317382812, "loss": 0.4605, "rewards/accuracies": 0.9375, "rewards/chosen": -3.946488380432129, "rewards/margins": 2.147352933883667, "rewards/rejected": -6.093842029571533, "step": 459 }, { "epoch": 0.9831685813518568, "grad_norm": 18.779798535200392, "learning_rate": 5.481860981704667e-10, "logits/chosen": -1.1538286209106445, "logits/rejected": -1.1552972793579102, "logps/chosen": -690.300537109375, "logps/rejected": -903.190673828125, "loss": 0.3621, "rewards/accuracies": 0.9375, "rewards/chosen": -4.287991523742676, "rewards/margins": 1.8346328735351562, "rewards/rejected": -6.122623920440674, "step": 460 }, { "epoch": 0.9853059043547956, "grad_norm": 17.340195720499324, "learning_rate": 4.027733834741642e-10, "logits/chosen": -1.245958924293518, "logits/rejected": -1.314367413520813, "logps/chosen": -711.3773193359375, "logps/rejected": -859.24462890625, "loss": 0.3816, "rewards/accuracies": 0.8125, "rewards/chosen": -4.259920120239258, "rewards/margins": 1.5348095893859863, "rewards/rejected": -5.794729709625244, "step": 461 }, { "epoch": 0.9874432273577345, "grad_norm": 20.215944038876593, "learning_rate": 2.7971808464055667e-10, "logits/chosen": -1.160940408706665, "logits/rejected": -1.1195350885391235, "logps/chosen": -534.329833984375, "logps/rejected": -804.3992919921875, "loss": 0.3931, "rewards/accuracies": 1.0, "rewards/chosen": -3.372945785522461, "rewards/margins": 2.39996337890625, "rewards/rejected": -5.772909641265869, "step": 462 }, { "epoch": 0.9895805503606733, "grad_norm": 44.280226028243526, "learning_rate": 1.7902708659867095e-10, "logits/chosen": -1.2246938943862915, "logits/rejected": -1.2264604568481445, "logps/chosen": -670.49169921875, "logps/rejected": -792.8765869140625, "loss": 0.3134, "rewards/accuracies": 0.9375, "rewards/chosen": -4.106105327606201, "rewards/margins": 1.2818169593811035, "rewards/rejected": -5.3879218101501465, "step": 463 }, { "epoch": 0.9917178733636121, "grad_norm": 21.939913072475736, "learning_rate": 1.0070602299738773e-10, "logits/chosen": -1.3647425174713135, "logits/rejected": -1.348206639289856, "logps/chosen": -645.3593139648438, "logps/rejected": -828.7014770507812, "loss": 0.3997, "rewards/accuracies": 0.875, "rewards/chosen": -4.110527038574219, "rewards/margins": 1.7152307033538818, "rewards/rejected": -5.8257575035095215, "step": 464 }, { "epoch": 0.9938551963665508, "grad_norm": 20.19763187664986, "learning_rate": 4.475927589062678e-11, "logits/chosen": -1.1594457626342773, "logits/rejected": -1.1328028440475464, "logps/chosen": -565.1810913085938, "logps/rejected": -730.758056640625, "loss": 0.3701, "rewards/accuracies": 0.875, "rewards/chosen": -3.548121213912964, "rewards/margins": 1.580583095550537, "rewards/rejected": -5.128703594207764, "step": 465 }, { "epoch": 0.9959925193694897, "grad_norm": 19.096874791193336, "learning_rate": 1.1189975492076343e-11, "logits/chosen": -1.142935037612915, "logits/rejected": -1.178402304649353, "logps/chosen": -515.046875, "logps/rejected": -609.339111328125, "loss": 0.381, "rewards/accuracies": 0.75, "rewards/chosen": -3.373872756958008, "rewards/margins": 0.8627163767814636, "rewards/rejected": -4.236588954925537, "step": 466 }, { "epoch": 0.9981298423724285, "grad_norm": 15.024893493481045, "learning_rate": 0.0, "logits/chosen": -1.2568109035491943, "logits/rejected": -1.233332872390747, "logps/chosen": -832.8156127929688, "logps/rejected": -1123.501708984375, "loss": 0.3511, "rewards/accuracies": 0.8125, "rewards/chosen": -5.070691108703613, "rewards/margins": 3.0254790782928467, "rewards/rejected": -8.096170425415039, "step": 467 }, { "epoch": 0.9981298423724285, "step": 467, "total_flos": 0.0, "train_loss": 0.47420934423389477, "train_runtime": 9989.1556, "train_samples_per_second": 5.994, "train_steps_per_second": 0.047 } ], "logging_steps": 1, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }