{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989528795811519, "eval_steps": 200, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020942408376963353, "grad_norm": 63.87941360473633, "learning_rate": 0.0, "logits/chosen": 2.203179359436035, "logits/rejected": 2.035616397857666, "logps/chosen": -1.1535288095474243, "logps/rejected": -1.4391145706176758, "loss": 10.7981, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3070576190948486, "rewards/margins": 0.5711714625358582, "rewards/rejected": -2.8782291412353516, "step": 1 }, { "epoch": 0.020942408376963352, "grad_norm": 32.67357635498047, "learning_rate": 1.125e-07, "logits/chosen": 2.019272804260254, "logits/rejected": 1.9496015310287476, "logps/chosen": -1.1085267066955566, "logps/rejected": -1.3239415884017944, "loss": 10.102, "rewards/accuracies": 0.6076388955116272, "rewards/chosen": -2.2170534133911133, "rewards/margins": 0.4308299422264099, "rewards/rejected": -2.647883176803589, "step": 10 }, { "epoch": 0.041884816753926704, "grad_norm": 65.35013580322266, "learning_rate": 2.3749999999999998e-07, "logits/chosen": 1.9201877117156982, "logits/rejected": 1.8487052917480469, "logps/chosen": -1.1284211874008179, "logps/rejected": -1.3107967376708984, "loss": 9.9907, "rewards/accuracies": 0.578125, "rewards/chosen": -2.2568423748016357, "rewards/margins": 0.3647509813308716, "rewards/rejected": -2.621593475341797, "step": 20 }, { "epoch": 0.06282722513089005, "grad_norm": 46.34416198730469, "learning_rate": 3.6249999999999997e-07, "logits/chosen": 1.9416582584381104, "logits/rejected": 1.8945503234863281, "logps/chosen": -1.090599775314331, "logps/rejected": -1.200209140777588, "loss": 9.8702, "rewards/accuracies": 0.578125, "rewards/chosen": -2.181199550628662, "rewards/margins": 0.21921858191490173, "rewards/rejected": -2.400418281555176, "step": 30 }, { "epoch": 0.08376963350785341, "grad_norm": 34.50259780883789, "learning_rate": 4.875e-07, "logits/chosen": 1.869368553161621, "logits/rejected": 1.8852355480194092, "logps/chosen": -1.0211187601089478, "logps/rejected": -1.2025065422058105, "loss": 9.811, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -2.0422375202178955, "rewards/margins": 0.3627752661705017, "rewards/rejected": -2.405013084411621, "step": 40 }, { "epoch": 0.10471204188481675, "grad_norm": 28.40943145751953, "learning_rate": 5.999919559552264e-07, "logits/chosen": 1.989923119544983, "logits/rejected": 2.015007495880127, "logps/chosen": -0.9122698903083801, "logps/rejected": -1.1353458166122437, "loss": 9.6163, "rewards/accuracies": 0.578125, "rewards/chosen": -1.8245397806167603, "rewards/margins": 0.44615206122398376, "rewards/rejected": -2.2706916332244873, "step": 50 }, { "epoch": 0.1256544502617801, "grad_norm": 34.944854736328125, "learning_rate": 5.99027192440263e-07, "logits/chosen": 1.8734846115112305, "logits/rejected": 1.8532871007919312, "logps/chosen": -0.9877859354019165, "logps/rejected": -1.1455579996109009, "loss": 9.9483, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -1.975571870803833, "rewards/margins": 0.31554415822029114, "rewards/rejected": -2.2911159992218018, "step": 60 }, { "epoch": 0.14659685863874344, "grad_norm": 33.13404846191406, "learning_rate": 5.964595461857045e-07, "logits/chosen": 2.0360629558563232, "logits/rejected": 2.0784692764282227, "logps/chosen": -0.9781535863876343, "logps/rejected": -1.1859705448150635, "loss": 9.5206, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -1.9563071727752686, "rewards/margins": 0.41563382744789124, "rewards/rejected": -2.371941089630127, "step": 70 }, { "epoch": 0.16753926701570682, "grad_norm": 31.283838272094727, "learning_rate": 5.923027806082798e-07, "logits/chosen": 1.9586601257324219, "logits/rejected": 1.9780356884002686, "logps/chosen": -0.9476488828659058, "logps/rejected": -1.172166109085083, "loss": 9.5782, "rewards/accuracies": 0.640625, "rewards/chosen": -1.8952977657318115, "rewards/margins": 0.44903475046157837, "rewards/rejected": -2.344332218170166, "step": 80 }, { "epoch": 0.18848167539267016, "grad_norm": 52.153682708740234, "learning_rate": 5.865791773197119e-07, "logits/chosen": 1.9691884517669678, "logits/rejected": 1.95147705078125, "logps/chosen": -0.9630438685417175, "logps/rejected": -1.2420399188995361, "loss": 9.5363, "rewards/accuracies": 0.640625, "rewards/chosen": -1.926087737083435, "rewards/margins": 0.5579922795295715, "rewards/rejected": -2.4840798377990723, "step": 90 }, { "epoch": 0.2094240837696335, "grad_norm": 35.23684310913086, "learning_rate": 5.793194166900525e-07, "logits/chosen": 1.859946608543396, "logits/rejected": 1.847818374633789, "logps/chosen": -0.9811749458312988, "logps/rejected": -1.1944254636764526, "loss": 9.6309, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -1.9623498916625977, "rewards/margins": 0.42650118470191956, "rewards/rejected": -2.3888509273529053, "step": 100 }, { "epoch": 0.23036649214659685, "grad_norm": 41.84571838378906, "learning_rate": 5.705624133909468e-07, "logits/chosen": 2.0235304832458496, "logits/rejected": 2.071727752685547, "logps/chosen": -0.9354808926582336, "logps/rejected": -1.2987582683563232, "loss": 9.1676, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8709617853164673, "rewards/margins": 0.7265545129776001, "rewards/rejected": -2.5975165367126465, "step": 110 }, { "epoch": 0.2513089005235602, "grad_norm": 35.44133758544922, "learning_rate": 5.603551078003725e-07, "logits/chosen": 1.958561658859253, "logits/rejected": 1.9417177438735962, "logps/chosen": -1.0246690511703491, "logps/rejected": -1.231858491897583, "loss": 9.6426, "rewards/accuracies": 0.578125, "rewards/chosen": -2.0493381023406982, "rewards/margins": 0.41437870264053345, "rewards/rejected": -2.463716983795166, "step": 120 }, { "epoch": 0.27225130890052357, "grad_norm": 35.418907165527344, "learning_rate": 5.487522143869884e-07, "logits/chosen": 1.781296968460083, "logits/rejected": 1.8837049007415771, "logps/chosen": -1.019592046737671, "logps/rejected": -1.2727988958358765, "loss": 9.2965, "rewards/accuracies": 0.65625, "rewards/chosen": -2.039184093475342, "rewards/margins": 0.5064135789871216, "rewards/rejected": -2.545597791671753, "step": 130 }, { "epoch": 0.2931937172774869, "grad_norm": 33.78614807128906, "learning_rate": 5.358159284228362e-07, "logits/chosen": 1.831496000289917, "logits/rejected": 1.8257663249969482, "logps/chosen": -1.017618179321289, "logps/rejected": -1.2920626401901245, "loss": 9.7072, "rewards/accuracies": 0.609375, "rewards/chosen": -2.035236358642578, "rewards/margins": 0.5488889813423157, "rewards/rejected": -2.584125280380249, "step": 140 }, { "epoch": 0.31413612565445026, "grad_norm": 41.28224182128906, "learning_rate": 5.216155925965094e-07, "logits/chosen": 1.9868301153182983, "logits/rejected": 1.9190441370010376, "logps/chosen": -0.9844843149185181, "logps/rejected": -1.2851510047912598, "loss": 9.3868, "rewards/accuracies": 0.625, "rewards/chosen": -1.9689686298370361, "rewards/margins": 0.6013331413269043, "rewards/rejected": -2.5703020095825195, "step": 150 }, { "epoch": 0.33507853403141363, "grad_norm": 28.384824752807617, "learning_rate": 5.062273253138518e-07, "logits/chosen": 1.9291102886199951, "logits/rejected": 1.90181565284729, "logps/chosen": -1.0036952495574951, "logps/rejected": -1.3095722198486328, "loss": 9.2335, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -2.0073904991149902, "rewards/margins": 0.6117540597915649, "rewards/rejected": -2.6191444396972656, "step": 160 }, { "epoch": 0.35602094240837695, "grad_norm": 34.20272445678711, "learning_rate": 4.897336126786132e-07, "logits/chosen": 1.9849627017974854, "logits/rejected": 1.9100704193115234, "logps/chosen": -0.9672800302505493, "logps/rejected": -1.2293925285339355, "loss": 9.3151, "rewards/accuracies": 0.625, "rewards/chosen": -1.9345600605010986, "rewards/margins": 0.5242254137992859, "rewards/rejected": -2.458785057067871, "step": 170 }, { "epoch": 0.3769633507853403, "grad_norm": 31.79982566833496, "learning_rate": 4.722228663401794e-07, "logits/chosen": 1.9956668615341187, "logits/rejected": 1.9031591415405273, "logps/chosen": -0.9950326681137085, "logps/rejected": -1.2332713603973389, "loss": 9.2109, "rewards/accuracies": 0.625, "rewards/chosen": -1.990065336227417, "rewards/margins": 0.47647732496261597, "rewards/rejected": -2.4665427207946777, "step": 180 }, { "epoch": 0.39790575916230364, "grad_norm": 48.42845153808594, "learning_rate": 4.537889495784557e-07, "logits/chosen": 1.8380305767059326, "logits/rejected": 1.8153343200683594, "logps/chosen": -0.9896278381347656, "logps/rejected": -1.3258594274520874, "loss": 9.0701, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -1.9792556762695312, "rewards/margins": 0.6724631190299988, "rewards/rejected": -2.651718854904175, "step": 190 }, { "epoch": 0.418848167539267, "grad_norm": 27.38842010498047, "learning_rate": 4.345306741662423e-07, "logits/chosen": 1.883819580078125, "logits/rejected": 1.9056427478790283, "logps/chosen": -1.014286994934082, "logps/rejected": -1.3301787376403809, "loss": 9.1298, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.028573989868164, "rewards/margins": 0.6317834258079529, "rewards/rejected": -2.6603574752807617, "step": 200 }, { "epoch": 0.418848167539267, "eval_logits/chosen": 2.1119980812072754, "eval_logits/rejected": 2.124922752380371, "eval_logps/chosen": -1.0400915145874023, "eval_logps/rejected": -1.3719773292541504, "eval_loss": 1.1164965629577637, "eval_rewards/accuracies": 0.6359999775886536, "eval_rewards/chosen": -2.0801830291748047, "eval_rewards/margins": 0.6637715697288513, "eval_rewards/rejected": -2.743954658508301, "eval_runtime": 45.5084, "eval_samples_per_second": 43.948, "eval_steps_per_second": 5.493, "step": 200 }, { "epoch": 0.4397905759162304, "grad_norm": 37.30736541748047, "learning_rate": 4.145512707060832e-07, "logits/chosen": 2.0058822631835938, "logits/rejected": 2.02673077583313, "logps/chosen": -1.0181465148925781, "logps/rejected": -1.3840270042419434, "loss": 9.2944, "rewards/accuracies": 0.640625, "rewards/chosen": -2.0362930297851562, "rewards/margins": 0.7317610383033752, "rewards/rejected": -2.7680540084838867, "step": 210 }, { "epoch": 0.4607329842931937, "grad_norm": 39.49748992919922, "learning_rate": 3.939578352807537e-07, "logits/chosen": 2.021946907043457, "logits/rejected": 2.058398723602295, "logps/chosen": -1.0262434482574463, "logps/rejected": -1.3750391006469727, "loss": 9.1142, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0524868965148926, "rewards/margins": 0.6975916028022766, "rewards/rejected": -2.7500782012939453, "step": 220 }, { "epoch": 0.4816753926701571, "grad_norm": 46.19815444946289, "learning_rate": 3.7286075538352106e-07, "logits/chosen": 1.8356702327728271, "logits/rejected": 1.8795230388641357, "logps/chosen": -1.0263798236846924, "logps/rejected": -1.3885927200317383, "loss": 9.0176, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -2.0527596473693848, "rewards/margins": 0.7244257926940918, "rewards/rejected": -2.7771854400634766, "step": 230 }, { "epoch": 0.5026178010471204, "grad_norm": 37.616127014160156, "learning_rate": 3.5137311820537683e-07, "logits/chosen": 1.7736084461212158, "logits/rejected": 1.7944352626800537, "logps/chosen": -0.9926139116287231, "logps/rejected": -1.3105839490890503, "loss": 9.1917, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -1.9852278232574463, "rewards/margins": 0.6359401941299438, "rewards/rejected": -2.6211678981781006, "step": 240 }, { "epoch": 0.5235602094240838, "grad_norm": 47.60890197753906, "learning_rate": 3.296101044510136e-07, "logits/chosen": 1.8431848287582397, "logits/rejected": 1.7555633783340454, "logps/chosen": -1.0195379257202148, "logps/rejected": -1.463659644126892, "loss": 8.915, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -2.0390758514404297, "rewards/margins": 0.8882430791854858, "rewards/rejected": -2.927319288253784, "step": 250 }, { "epoch": 0.5445026178010471, "grad_norm": 53.570133209228516, "learning_rate": 3.076883709328898e-07, "logits/chosen": 1.885573387145996, "logits/rejected": 1.8557020425796509, "logps/chosen": -1.0390733480453491, "logps/rejected": -1.3984655141830444, "loss": 9.3265, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.0781466960906982, "rewards/margins": 0.7187842726707458, "rewards/rejected": -2.796931028366089, "step": 260 }, { "epoch": 0.5654450261780105, "grad_norm": 52.018001556396484, "learning_rate": 2.857254252528773e-07, "logits/chosen": 1.815469741821289, "logits/rejected": 1.8461593389511108, "logps/chosen": -1.0356584787368774, "logps/rejected": -1.3995596170425415, "loss": 9.1513, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -2.071316957473755, "rewards/margins": 0.7278021574020386, "rewards/rejected": -2.799119234085083, "step": 270 }, { "epoch": 0.5863874345549738, "grad_norm": 47.29704284667969, "learning_rate": 2.638389959234031e-07, "logits/chosen": 1.8500423431396484, "logits/rejected": 1.827164888381958, "logps/chosen": -0.9695369601249695, "logps/rejected": -1.464297890663147, "loss": 8.6549, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -1.939073920249939, "rewards/margins": 0.9895218014717102, "rewards/rejected": -2.928595781326294, "step": 280 }, { "epoch": 0.6073298429319371, "grad_norm": 46.65730285644531, "learning_rate": 2.421464013044373e-07, "logits/chosen": 1.9609591960906982, "logits/rejected": 1.9269940853118896, "logps/chosen": -1.0431578159332275, "logps/rejected": -1.36538827419281, "loss": 9.1695, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -2.086315631866455, "rewards/margins": 0.6444610357284546, "rewards/rejected": -2.73077654838562, "step": 290 }, { "epoch": 0.6282722513089005, "grad_norm": 42.485137939453125, "learning_rate": 2.2076392073903244e-07, "logits/chosen": 1.8708031177520752, "logits/rejected": 1.8845112323760986, "logps/chosen": -1.1147522926330566, "logps/rejected": -1.3503917455673218, "loss": 9.1503, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.2295045852661133, "rewards/margins": 0.4712789058685303, "rewards/rejected": -2.7007834911346436, "step": 300 }, { "epoch": 0.6492146596858639, "grad_norm": 112.9022445678711, "learning_rate": 1.9980617125832958e-07, "logits/chosen": 1.9049968719482422, "logits/rejected": 1.908156156539917, "logps/chosen": -1.0263423919677734, "logps/rejected": -1.3653188943862915, "loss": 8.8053, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.052684783935547, "rewards/margins": 0.6779530644416809, "rewards/rejected": -2.730637788772583, "step": 310 }, { "epoch": 0.6701570680628273, "grad_norm": 42.80693054199219, "learning_rate": 1.7938549319709663e-07, "logits/chosen": 1.9933536052703857, "logits/rejected": 2.0374882221221924, "logps/chosen": -1.0420429706573486, "logps/rejected": -1.4136046171188354, "loss": 8.8744, "rewards/accuracies": 0.671875, "rewards/chosen": -2.0840859413146973, "rewards/margins": 0.7431236505508423, "rewards/rejected": -2.827209234237671, "step": 320 }, { "epoch": 0.6910994764397905, "grad_norm": 64.53388977050781, "learning_rate": 1.5961134801309614e-07, "logits/chosen": 1.9328991174697876, "logits/rejected": 1.9388740062713623, "logps/chosen": -1.0375453233718872, "logps/rejected": -1.4853423833847046, "loss": 9.0326, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -2.0750906467437744, "rewards/margins": 0.8955942392349243, "rewards/rejected": -2.970684766769409, "step": 330 }, { "epoch": 0.7120418848167539, "grad_norm": 64.71541595458984, "learning_rate": 1.4058973153816886e-07, "logits/chosen": 1.8567371368408203, "logits/rejected": 1.851202368736267, "logps/chosen": -1.0736474990844727, "logps/rejected": -1.4988930225372314, "loss": 9.0231, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1472949981689453, "rewards/margins": 0.850490927696228, "rewards/rejected": -2.997786045074463, "step": 340 }, { "epoch": 0.7329842931937173, "grad_norm": 48.053550720214844, "learning_rate": 1.2242260580619538e-07, "logits/chosen": 1.9933841228485107, "logits/rejected": 2.0060808658599854, "logps/chosen": -1.0655163526535034, "logps/rejected": -1.511839747428894, "loss": 8.737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.131032705307007, "rewards/margins": 0.8926466107368469, "rewards/rejected": -3.023679494857788, "step": 350 }, { "epoch": 0.7539267015706806, "grad_norm": 119.26463317871094, "learning_rate": 1.0520735250352405e-07, "logits/chosen": 1.9238300323486328, "logits/rejected": 1.964910864830017, "logps/chosen": -1.0357812643051147, "logps/rejected": -1.4163744449615479, "loss": 8.9391, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.0715625286102295, "rewards/margins": 0.761186420917511, "rewards/rejected": -2.8327488899230957, "step": 360 }, { "epoch": 0.774869109947644, "grad_norm": 74.86650848388672, "learning_rate": 8.903625097154667e-08, "logits/chosen": 2.0095980167388916, "logits/rejected": 1.9654392004013062, "logps/chosen": -1.075518012046814, "logps/rejected": -1.5022872686386108, "loss": 8.4271, "rewards/accuracies": 0.640625, "rewards/chosen": -2.151036024093628, "rewards/margins": 0.8535385131835938, "rewards/rejected": -3.0045745372772217, "step": 370 }, { "epoch": 0.7958115183246073, "grad_norm": 148.51052856445312, "learning_rate": 7.399598355949822e-08, "logits/chosen": 1.9656111001968384, "logits/rejected": 1.9382845163345337, "logps/chosen": -0.987983226776123, "logps/rejected": -1.367729663848877, "loss": 8.8956, "rewards/accuracies": 0.671875, "rewards/chosen": -1.975966453552246, "rewards/margins": 0.7594925165176392, "rewards/rejected": -2.735459327697754, "step": 380 }, { "epoch": 0.8167539267015707, "grad_norm": 61.45123291015625, "learning_rate": 6.01671709789497e-08, "logits/chosen": 1.955004334449768, "logits/rejected": 1.985464334487915, "logps/chosen": -1.0322657823562622, "logps/rejected": -1.4219481945037842, "loss": 8.9243, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -2.0645315647125244, "rewards/margins": 0.7793648838996887, "rewards/rejected": -2.8438963890075684, "step": 390 }, { "epoch": 0.837696335078534, "grad_norm": 44.18227767944336, "learning_rate": 4.76239401506456e-08, "logits/chosen": 1.9698352813720703, "logits/rejected": 1.9627288579940796, "logps/chosen": -1.0838392972946167, "logps/rejected": -1.3468244075775146, "loss": 8.8992, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -2.1676785945892334, "rewards/margins": 0.5259703993797302, "rewards/rejected": -2.6936488151550293, "step": 400 }, { "epoch": 0.837696335078534, "eval_logits/chosen": 2.138575553894043, "eval_logits/rejected": 2.156599521636963, "eval_logps/chosen": -1.054769515991211, "eval_logps/rejected": -1.4746460914611816, "eval_loss": 1.076961874961853, "eval_rewards/accuracies": 0.6660000085830688, "eval_rewards/chosen": -2.109539031982422, "eval_rewards/margins": 0.8397529721260071, "eval_rewards/rejected": -2.9492921829223633, "eval_runtime": 45.4523, "eval_samples_per_second": 44.002, "eval_steps_per_second": 5.5, "step": 400 }, { "epoch": 0.8586387434554974, "grad_norm": 86.99646759033203, "learning_rate": 3.643352686016596e-08, "logits/chosen": 1.959341287612915, "logits/rejected": 1.9492921829223633, "logps/chosen": -1.049546241760254, "logps/rejected": -1.4740784168243408, "loss": 8.7816, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -2.099092483520508, "rewards/margins": 0.8490642309188843, "rewards/rejected": -2.9481568336486816, "step": 410 }, { "epoch": 0.8795811518324608, "grad_norm": 77.62808990478516, "learning_rate": 2.665591535230738e-08, "logits/chosen": 1.9194042682647705, "logits/rejected": 1.9090583324432373, "logps/chosen": -1.046962022781372, "logps/rejected": -1.3986111879348755, "loss": 8.6978, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -2.093924045562744, "rewards/margins": 0.7032982110977173, "rewards/rejected": -2.797222375869751, "step": 420 }, { "epoch": 0.900523560209424, "grad_norm": 104.71875, "learning_rate": 1.834351679607603e-08, "logits/chosen": 1.983788251876831, "logits/rejected": 1.991276502609253, "logps/chosen": -0.9792089462280273, "logps/rejected": -1.5666288137435913, "loss": 8.5908, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -1.9584178924560547, "rewards/margins": 1.174839735031128, "rewards/rejected": -3.1332576274871826, "step": 430 }, { "epoch": 0.9214659685863874, "grad_norm": 61.72246170043945, "learning_rate": 1.1540888343822164e-08, "logits/chosen": 2.009519100189209, "logits/rejected": 2.041383743286133, "logps/chosen": -0.999321460723877, "logps/rejected": -1.394999623298645, "loss": 8.7008, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.998642921447754, "rewards/margins": 0.7913564443588257, "rewards/rejected": -2.78999924659729, "step": 440 }, { "epoch": 0.9424083769633508, "grad_norm": 47.30830383300781, "learning_rate": 6.284494290451603e-09, "logits/chosen": 2.0252490043640137, "logits/rejected": 2.0139636993408203, "logps/chosen": -1.0534889698028564, "logps/rejected": -1.4194049835205078, "loss": 8.8863, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -2.106977939605713, "rewards/margins": 0.7318320870399475, "rewards/rejected": -2.8388099670410156, "step": 450 }, { "epoch": 0.9633507853403142, "grad_norm": 55.95344161987305, "learning_rate": 2.6025106129779263e-09, "logits/chosen": 2.087092161178589, "logits/rejected": 2.062786102294922, "logps/chosen": -1.058509111404419, "logps/rejected": -1.4474523067474365, "loss": 8.995, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -2.117018222808838, "rewards/margins": 0.777886688709259, "rewards/rejected": -2.894904613494873, "step": 460 }, { "epoch": 0.9842931937172775, "grad_norm": 57.510528564453125, "learning_rate": 5.146739381471921e-10, "logits/chosen": 1.9302631616592407, "logits/rejected": 1.8986546993255615, "logps/chosen": -0.9788550138473511, "logps/rejected": -1.4690487384796143, "loss": 8.7456, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -1.9577100276947021, "rewards/margins": 0.9803873300552368, "rewards/rejected": -2.9380974769592285, "step": 470 }, { "epoch": 0.9989528795811519, "step": 477, "total_flos": 0.0, "train_loss": 9.188309121681709, "train_runtime": 5401.7608, "train_samples_per_second": 11.318, "train_steps_per_second": 0.088 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }