{ "best_metric": 0.800000011920929, "best_model_checkpoint": "./outputs/tinyllama-1.1b-dpo-pku-saferlhf/checkpoint-1400", "epoch": 0.9997600191984641, "eval_steps": 200, "global_step": 2083, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004799616030717543, "grad_norm": 56.5, "learning_rate": 2.3923444976076555e-07, "logits/chosen": -2.689218282699585, "logits/rejected": -2.554370880126953, "logps/chosen": -212.5878143310547, "logps/rejected": -186.63473510742188, "loss": 0.693, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 4.648463800549507e-05, "rewards/margins": 0.0005794697208330035, "rewards/rejected": -0.0005329854902811348, "step": 10 }, { "epoch": 0.009599232061435085, "grad_norm": 59.75, "learning_rate": 4.784688995215311e-07, "logits/chosen": -2.7294280529022217, "logits/rejected": -2.6172096729278564, "logps/chosen": -223.5776824951172, "logps/rejected": -203.3684539794922, "loss": 0.6931, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0015927983913570642, "rewards/margins": 0.0005491179181262851, "rewards/rejected": 0.0010436807060614228, "step": 20 }, { "epoch": 0.014398848092152628, "grad_norm": 50.25, "learning_rate": 7.177033492822967e-07, "logits/chosen": -2.7166686058044434, "logits/rejected": -2.6399471759796143, "logps/chosen": -238.0247344970703, "logps/rejected": -219.1692352294922, "loss": 0.6949, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.003216738346964121, "rewards/margins": -0.0030289709102362394, "rewards/rejected": -0.0001877670583780855, "step": 30 }, { "epoch": 0.01919846412287017, "grad_norm": 57.0, "learning_rate": 9.569377990430622e-07, "logits/chosen": -2.7442615032196045, "logits/rejected": -2.5918688774108887, "logps/chosen": -251.1165771484375, "logps/rejected": -196.37649536132812, "loss": 0.6883, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.00424658227711916, "rewards/margins": 0.010354852303862572, "rewards/rejected": -0.006108270026743412, "step": 40 }, { "epoch": 0.023998080153587713, "grad_norm": 47.25, "learning_rate": 1.196172248803828e-06, "logits/chosen": -2.6663458347320557, "logits/rejected": -2.604515552520752, "logps/chosen": -234.8127899169922, "logps/rejected": -199.76278686523438, "loss": 0.6904, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0006293629994615912, "rewards/margins": 0.006024296395480633, "rewards/rejected": -0.00539493327960372, "step": 50 }, { "epoch": 0.028797696184305256, "grad_norm": 65.0, "learning_rate": 1.4354066985645934e-06, "logits/chosen": -2.7063660621643066, "logits/rejected": -2.6000542640686035, "logps/chosen": -223.295166015625, "logps/rejected": -215.13656616210938, "loss": 0.6887, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0032431543804705143, "rewards/margins": 0.009618126787245274, "rewards/rejected": -0.012861279770731926, "step": 60 }, { "epoch": 0.033597312215022795, "grad_norm": 50.0, "learning_rate": 1.6746411483253591e-06, "logits/chosen": -2.727038860321045, "logits/rejected": -2.585761070251465, "logps/chosen": -245.62393188476562, "logps/rejected": -205.93148803710938, "loss": 0.6877, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.005830951035022736, "rewards/margins": 0.011942476034164429, "rewards/rejected": -0.006111525930464268, "step": 70 }, { "epoch": 0.03839692824574034, "grad_norm": 50.25, "learning_rate": 1.9138755980861244e-06, "logits/chosen": -2.7241501808166504, "logits/rejected": -2.6147332191467285, "logps/chosen": -235.19338989257812, "logps/rejected": -201.25424194335938, "loss": 0.6807, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.01283216755837202, "rewards/margins": 0.026413241401314735, "rewards/rejected": -0.01358107291162014, "step": 80 }, { "epoch": 0.04319654427645788, "grad_norm": 49.5, "learning_rate": 2.15311004784689e-06, "logits/chosen": -2.7387120723724365, "logits/rejected": -2.5572338104248047, "logps/chosen": -250.86380004882812, "logps/rejected": -189.31814575195312, "loss": 0.6712, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.03785385563969612, "rewards/margins": 0.04662873595952988, "rewards/rejected": -0.00877488125115633, "step": 90 }, { "epoch": 0.04799616030717543, "grad_norm": 49.0, "learning_rate": 2.392344497607656e-06, "logits/chosen": -2.7061264514923096, "logits/rejected": -2.5762696266174316, "logps/chosen": -227.9839630126953, "logps/rejected": -206.23196411132812, "loss": 0.6702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06363014876842499, "rewards/margins": 0.05047706514596939, "rewards/rejected": 0.013153081759810448, "step": 100 }, { "epoch": 0.052795776337892966, "grad_norm": 49.5, "learning_rate": 2.631578947368421e-06, "logits/chosen": -2.7290475368499756, "logits/rejected": -2.548861026763916, "logps/chosen": -236.59375, "logps/rejected": -178.90164184570312, "loss": 0.6619, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06783770024776459, "rewards/margins": 0.070295050740242, "rewards/rejected": -0.0024573481641709805, "step": 110 }, { "epoch": 0.05759539236861051, "grad_norm": 52.0, "learning_rate": 2.870813397129187e-06, "logits/chosen": -2.7224419116973877, "logits/rejected": -2.5972402095794678, "logps/chosen": -233.290283203125, "logps/rejected": -204.4892578125, "loss": 0.6691, "rewards/accuracies": 0.625, "rewards/chosen": 0.08400879800319672, "rewards/margins": 0.05939972400665283, "rewards/rejected": 0.024609070271253586, "step": 120 }, { "epoch": 0.06239500839932805, "grad_norm": 50.25, "learning_rate": 3.1100478468899525e-06, "logits/chosen": -2.7070722579956055, "logits/rejected": -2.5772013664245605, "logps/chosen": -250.78231811523438, "logps/rejected": -209.5062713623047, "loss": 0.6496, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.165785014629364, "rewards/margins": 0.10047496855258942, "rewards/rejected": 0.06531006097793579, "step": 130 }, { "epoch": 0.06719462443004559, "grad_norm": 49.5, "learning_rate": 3.3492822966507182e-06, "logits/chosen": -2.6727123260498047, "logits/rejected": -2.6064090728759766, "logps/chosen": -233.5751495361328, "logps/rejected": -217.31576538085938, "loss": 0.6574, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.20782318711280823, "rewards/margins": 0.0912146344780922, "rewards/rejected": 0.11660852283239365, "step": 140 }, { "epoch": 0.07199424046076314, "grad_norm": 45.75, "learning_rate": 3.5885167464114835e-06, "logits/chosen": -2.6911652088165283, "logits/rejected": -2.6086039543151855, "logps/chosen": -230.4497528076172, "logps/rejected": -244.7624053955078, "loss": 0.6583, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.19385293126106262, "rewards/margins": 0.0921817347407341, "rewards/rejected": 0.10167120397090912, "step": 150 }, { "epoch": 0.07679385649148068, "grad_norm": 51.5, "learning_rate": 3.827751196172249e-06, "logits/chosen": -2.65492844581604, "logits/rejected": -2.5312721729278564, "logps/chosen": -241.20199584960938, "logps/rejected": -200.1884765625, "loss": 0.6299, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.1793375313282013, "rewards/margins": 0.1596115678548813, "rewards/rejected": 0.019725963473320007, "step": 160 }, { "epoch": 0.08159347252219823, "grad_norm": 46.25, "learning_rate": 4.066985645933015e-06, "logits/chosen": -2.732844114303589, "logits/rejected": -2.5666396617889404, "logps/chosen": -255.0216827392578, "logps/rejected": -198.9121856689453, "loss": 0.6366, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.15441159904003143, "rewards/margins": 0.144814133644104, "rewards/rejected": 0.009597455151379108, "step": 170 }, { "epoch": 0.08639308855291576, "grad_norm": 43.25, "learning_rate": 4.30622009569378e-06, "logits/chosen": -2.7506613731384277, "logits/rejected": -2.6481566429138184, "logps/chosen": -240.16055297851562, "logps/rejected": -200.94822692871094, "loss": 0.6103, "rewards/accuracies": 0.71875, "rewards/chosen": 0.23929600417613983, "rewards/margins": 0.20658496022224426, "rewards/rejected": 0.032711055129766464, "step": 180 }, { "epoch": 0.09119270458363331, "grad_norm": 50.75, "learning_rate": 4.5454545454545455e-06, "logits/chosen": -2.675985097885132, "logits/rejected": -2.5708577632904053, "logps/chosen": -238.582763671875, "logps/rejected": -198.5998992919922, "loss": 0.6178, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.23324120044708252, "rewards/margins": 0.195271834731102, "rewards/rejected": 0.03796938806772232, "step": 190 }, { "epoch": 0.09599232061435085, "grad_norm": 37.25, "learning_rate": 4.784688995215312e-06, "logits/chosen": -2.7069146633148193, "logits/rejected": -2.5851969718933105, "logps/chosen": -225.49917602539062, "logps/rejected": -189.277587890625, "loss": 0.6075, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3116636574268341, "rewards/margins": 0.22872300446033478, "rewards/rejected": 0.08294066041707993, "step": 200 }, { "epoch": 0.09599232061435085, "eval_logits/chosen": -2.6887047290802, "eval_logits/rejected": -2.5653347969055176, "eval_logps/chosen": -228.92459106445312, "eval_logps/rejected": -203.6246337890625, "eval_loss": 0.6051958799362183, "eval_rewards/accuracies": 0.7170000076293945, "eval_rewards/chosen": 0.4208393394947052, "eval_rewards/margins": 0.23022359609603882, "eval_rewards/rejected": 0.1906157284975052, "eval_runtime": 26.0524, "eval_samples_per_second": 38.384, "eval_steps_per_second": 9.596, "step": 200 }, { "epoch": 0.1007919366450684, "grad_norm": 45.75, "learning_rate": 4.999996487062011e-06, "logits/chosen": -2.645963668823242, "logits/rejected": -2.5576295852661133, "logps/chosen": -237.7938995361328, "logps/rejected": -211.9795684814453, "loss": 0.6059, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4848661422729492, "rewards/margins": 0.2471170723438263, "rewards/rejected": 0.2377490997314453, "step": 210 }, { "epoch": 0.10559155267578593, "grad_norm": 41.5, "learning_rate": 4.999574946449064e-06, "logits/chosen": -2.6820342540740967, "logits/rejected": -2.550971269607544, "logps/chosen": -222.2080841064453, "logps/rejected": -184.314208984375, "loss": 0.6094, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.38701874017715454, "rewards/margins": 0.22917309403419495, "rewards/rejected": 0.1578456610441208, "step": 220 }, { "epoch": 0.11039116870650348, "grad_norm": 41.0, "learning_rate": 4.9984509539801644e-06, "logits/chosen": -2.6449620723724365, "logits/rejected": -2.532531976699829, "logps/chosen": -227.45291137695312, "logps/rejected": -221.8866729736328, "loss": 0.6227, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.3856458067893982, "rewards/margins": 0.22975265979766846, "rewards/rejected": 0.15589316189289093, "step": 230 }, { "epoch": 0.11519078473722102, "grad_norm": 48.0, "learning_rate": 4.996624825529257e-06, "logits/chosen": -2.7190098762512207, "logits/rejected": -2.6080780029296875, "logps/chosen": -212.9981231689453, "logps/rejected": -191.4314727783203, "loss": 0.5988, "rewards/accuracies": 0.75, "rewards/chosen": 0.36744850873947144, "rewards/margins": 0.2741442620754242, "rewards/rejected": 0.09330429136753082, "step": 240 }, { "epoch": 0.11999040076793857, "grad_norm": 47.5, "learning_rate": 4.994097074290524e-06, "logits/chosen": -2.6760241985321045, "logits/rejected": -2.555823564529419, "logps/chosen": -225.98098754882812, "logps/rejected": -199.85360717773438, "loss": 0.6114, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2855023443698883, "rewards/margins": 0.23584535717964172, "rewards/rejected": 0.04965699464082718, "step": 250 }, { "epoch": 0.1247900167986561, "grad_norm": 45.0, "learning_rate": 4.990868410634163e-06, "logits/chosen": -2.683492660522461, "logits/rejected": -2.59763240814209, "logps/chosen": -222.82955932617188, "logps/rejected": -192.34671020507812, "loss": 0.592, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.2924219071865082, "rewards/margins": 0.2935822010040283, "rewards/rejected": -0.0011602870654314756, "step": 260 }, { "epoch": 0.12958963282937366, "grad_norm": 42.0, "learning_rate": 4.9869397419067535e-06, "logits/chosen": -2.6904869079589844, "logits/rejected": -2.58687162399292, "logps/chosen": -210.76589965820312, "logps/rejected": -191.6869354248047, "loss": 0.5514, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.34391769766807556, "rewards/margins": 0.35880047082901, "rewards/rejected": -0.014882763847708702, "step": 270 }, { "epoch": 0.13438924886009118, "grad_norm": 46.75, "learning_rate": 4.982312172176264e-06, "logits/chosen": -2.7495617866516113, "logits/rejected": -2.549598217010498, "logps/chosen": -269.2088928222656, "logps/rejected": -204.53211975097656, "loss": 0.5697, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4326564371585846, "rewards/margins": 0.35469716787338257, "rewards/rejected": 0.07795925438404083, "step": 280 }, { "epoch": 0.13918886489080873, "grad_norm": 57.0, "learning_rate": 4.976987001921787e-06, "logits/chosen": -2.669323444366455, "logits/rejected": -2.547853708267212, "logps/chosen": -232.5638885498047, "logps/rejected": -205.17001342773438, "loss": 0.5782, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3145369589328766, "rewards/margins": 0.34478241205215454, "rewards/rejected": -0.030245428904891014, "step": 290 }, { "epoch": 0.14398848092152627, "grad_norm": 37.75, "learning_rate": 4.97096572766805e-06, "logits/chosen": -2.6843204498291016, "logits/rejected": -2.5420849323272705, "logps/chosen": -238.2855987548828, "logps/rejected": -192.5761260986328, "loss": 0.548, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2724495232105255, "rewards/margins": 0.4326988756656647, "rewards/rejected": -0.16024938225746155, "step": 300 }, { "epoch": 0.14878809695224382, "grad_norm": 43.75, "learning_rate": 4.964250041564868e-06, "logits/chosen": -2.664036989212036, "logits/rejected": -2.533687114715576, "logps/chosen": -230.8778076171875, "logps/rejected": -200.39389038085938, "loss": 0.5758, "rewards/accuracies": 0.71875, "rewards/chosen": 0.21766161918640137, "rewards/margins": 0.3927158713340759, "rewards/rejected": -0.17505425214767456, "step": 310 }, { "epoch": 0.15358771298296137, "grad_norm": 38.25, "learning_rate": 4.956841830911588e-06, "logits/chosen": -2.6427550315856934, "logits/rejected": -2.5176196098327637, "logps/chosen": -241.25448608398438, "logps/rejected": -200.30337524414062, "loss": 0.5428, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.3211400508880615, "rewards/margins": 0.465925931930542, "rewards/rejected": -0.14478585124015808, "step": 320 }, { "epoch": 0.1583873290136789, "grad_norm": 53.5, "learning_rate": 4.9487431776267095e-06, "logits/chosen": -2.6581196784973145, "logits/rejected": -2.5447604656219482, "logps/chosen": -214.78573608398438, "logps/rejected": -197.250732421875, "loss": 0.5574, "rewards/accuracies": 0.71875, "rewards/chosen": 0.35328906774520874, "rewards/margins": 0.42138591408729553, "rewards/rejected": -0.0680968165397644, "step": 330 }, { "epoch": 0.16318694504439646, "grad_norm": 42.0, "learning_rate": 4.939956357662806e-06, "logits/chosen": -2.609428882598877, "logits/rejected": -2.4489550590515137, "logps/chosen": -230.47982788085938, "logps/rejected": -180.9293975830078, "loss": 0.5271, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.2847859263420105, "rewards/margins": 0.49130839109420776, "rewards/rejected": -0.20652246475219727, "step": 340 }, { "epoch": 0.16798656107511398, "grad_norm": 42.75, "learning_rate": 4.9304838403669155e-06, "logits/chosen": -2.586233139038086, "logits/rejected": -2.4430768489837646, "logps/chosen": -250.468017578125, "logps/rejected": -196.97486877441406, "loss": 0.535, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3728100061416626, "rewards/margins": 0.4706306457519531, "rewards/rejected": -0.0978206992149353, "step": 350 }, { "epoch": 0.17278617710583152, "grad_norm": 46.5, "learning_rate": 4.920328287786587e-06, "logits/chosen": -2.603339910507202, "logits/rejected": -2.4968552589416504, "logps/chosen": -227.32632446289062, "logps/rejected": -196.89515686035156, "loss": 0.5242, "rewards/accuracies": 0.8125, "rewards/chosen": 0.26675719022750854, "rewards/margins": 0.5004665851593018, "rewards/rejected": -0.23370936512947083, "step": 360 }, { "epoch": 0.17758579313654907, "grad_norm": 44.5, "learning_rate": 4.909492553921761e-06, "logits/chosen": -2.6493752002716064, "logits/rejected": -2.486344337463379, "logps/chosen": -241.6817626953125, "logps/rejected": -204.93899536132812, "loss": 0.5103, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.33704155683517456, "rewards/margins": 0.5699586868286133, "rewards/rejected": -0.23291714489459991, "step": 370 }, { "epoch": 0.18238540916726662, "grad_norm": 39.75, "learning_rate": 4.897979683922728e-06, "logits/chosen": -2.670883893966675, "logits/rejected": -2.564422130584717, "logps/chosen": -215.97842407226562, "logps/rejected": -182.91036987304688, "loss": 0.5319, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2965424954891205, "rewards/margins": 0.5261348485946655, "rewards/rejected": -0.22959236800670624, "step": 380 }, { "epoch": 0.18718502519798416, "grad_norm": 38.25, "learning_rate": 4.885792913234339e-06, "logits/chosen": -2.603323221206665, "logits/rejected": -2.5463268756866455, "logps/chosen": -219.7723388671875, "logps/rejected": -210.0283203125, "loss": 0.5334, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3986433148384094, "rewards/margins": 0.5653668642044067, "rewards/rejected": -0.16672348976135254, "step": 390 }, { "epoch": 0.1919846412287017, "grad_norm": 41.0, "learning_rate": 4.872935666686767e-06, "logits/chosen": -2.6278882026672363, "logits/rejected": -2.5184621810913086, "logps/chosen": -229.7825164794922, "logps/rejected": -213.7990264892578, "loss": 0.5198, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.38563206791877747, "rewards/margins": 0.5534602999687195, "rewards/rejected": -0.16782823204994202, "step": 400 }, { "epoch": 0.1919846412287017, "eval_logits/chosen": -2.6478958129882812, "eval_logits/rejected": -2.5310354232788086, "eval_logps/chosen": -230.05001831054688, "eval_logps/rejected": -208.3632354736328, "eval_loss": 0.5145431160926819, "eval_rewards/accuracies": 0.7850000262260437, "eval_rewards/chosen": 0.3082956075668335, "eval_rewards/margins": 0.591540515422821, "eval_rewards/rejected": -0.28324493765830994, "eval_runtime": 26.9361, "eval_samples_per_second": 37.125, "eval_steps_per_second": 9.281, "step": 400 }, { "epoch": 0.19678425725941925, "grad_norm": 49.25, "learning_rate": 4.859411557533019e-06, "logits/chosen": -2.640899181365967, "logits/rejected": -2.532985210418701, "logps/chosen": -227.2956085205078, "logps/rejected": -199.454345703125, "loss": 0.5372, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2837000787258148, "rewards/margins": 0.5689016580581665, "rewards/rejected": -0.2852015495300293, "step": 410 }, { "epoch": 0.2015838732901368, "grad_norm": 36.25, "learning_rate": 4.8452243864335216e-06, "logits/chosen": -2.6203932762145996, "logits/rejected": -2.546025037765503, "logps/chosen": -206.45437622070312, "logps/rejected": -212.8076171875, "loss": 0.5688, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.15797384083271027, "rewards/margins": 0.42366623878479004, "rewards/rejected": -0.2656923830509186, "step": 420 }, { "epoch": 0.20638348932085432, "grad_norm": 34.75, "learning_rate": 4.830378140388016e-06, "logits/chosen": -2.726120710372925, "logits/rejected": -2.5788116455078125, "logps/chosen": -236.8133544921875, "logps/rejected": -197.07266235351562, "loss": 0.5284, "rewards/accuracies": 0.71875, "rewards/chosen": 0.21513256430625916, "rewards/margins": 0.62417072057724, "rewards/rejected": -0.40903815627098083, "step": 430 }, { "epoch": 0.21118310535157186, "grad_norm": 37.5, "learning_rate": 4.814876991615104e-06, "logits/chosen": -2.612753391265869, "logits/rejected": -2.5211071968078613, "logps/chosen": -228.63931274414062, "logps/rejected": -203.56838989257812, "loss": 0.5862, "rewards/accuracies": 0.6875, "rewards/chosen": -0.15136560797691345, "rewards/margins": 0.43973368406295776, "rewards/rejected": -0.5910992622375488, "step": 440 }, { "epoch": 0.2159827213822894, "grad_norm": 53.5, "learning_rate": 4.798725296379736e-06, "logits/chosen": -2.6407535076141357, "logits/rejected": -2.5609383583068848, "logps/chosen": -220.7452392578125, "logps/rejected": -198.64051818847656, "loss": 0.5094, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.08102991431951523, "rewards/margins": 0.5998750329017639, "rewards/rejected": -0.5188450813293457, "step": 450 }, { "epoch": 0.22078233741300696, "grad_norm": 44.25, "learning_rate": 4.781927593768969e-06, "logits/chosen": -2.6856577396392822, "logits/rejected": -2.5593128204345703, "logps/chosen": -232.1822509765625, "logps/rejected": -205.1845703125, "loss": 0.5215, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.05589114502072334, "rewards/margins": 0.6284939050674438, "rewards/rejected": -0.5726026892662048, "step": 460 }, { "epoch": 0.2255819534437245, "grad_norm": 46.75, "learning_rate": 4.764488604416365e-06, "logits/chosen": -2.6717689037323, "logits/rejected": -2.506671667098999, "logps/chosen": -255.5167236328125, "logps/rejected": -221.23788452148438, "loss": 0.4516, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.2418864220380783, "rewards/margins": 0.806577205657959, "rewards/rejected": -0.5646907687187195, "step": 470 }, { "epoch": 0.23038156947444205, "grad_norm": 40.25, "learning_rate": 4.7464132291753464e-06, "logits/chosen": -2.613154888153076, "logits/rejected": -2.5245513916015625, "logps/chosen": -212.85330200195312, "logps/rejected": -193.86257934570312, "loss": 0.5092, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.12177534401416779, "rewards/margins": 0.6296752691268921, "rewards/rejected": -0.5078999996185303, "step": 480 }, { "epoch": 0.2351811855051596, "grad_norm": 42.0, "learning_rate": 4.727706547741924e-06, "logits/chosen": -2.6002917289733887, "logits/rejected": -2.469181537628174, "logps/chosen": -228.36746215820312, "logps/rejected": -184.94297790527344, "loss": 0.5446, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.18128438293933868, "rewards/margins": 0.4943477213382721, "rewards/rejected": -0.31306329369544983, "step": 490 }, { "epoch": 0.23998080153587714, "grad_norm": 41.75, "learning_rate": 4.708373817227158e-06, "logits/chosen": -2.5972156524658203, "logits/rejected": -2.468087673187256, "logps/chosen": -240.36544799804688, "logps/rejected": -206.00051879882812, "loss": 0.5061, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.16172266006469727, "rewards/margins": 0.651920735836029, "rewards/rejected": -0.49019813537597656, "step": 500 }, { "epoch": 0.24478041756659466, "grad_norm": 35.75, "learning_rate": 4.688420470679754e-06, "logits/chosen": -2.5911366939544678, "logits/rejected": -2.4612526893615723, "logps/chosen": -244.9521942138672, "logps/rejected": -197.1681365966797, "loss": 0.4955, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.15817375481128693, "rewards/margins": 0.7028775215148926, "rewards/rejected": -0.5447037816047668, "step": 510 }, { "epoch": 0.2495800335973122, "grad_norm": 37.75, "learning_rate": 4.667852115559227e-06, "logits/chosen": -2.6258175373077393, "logits/rejected": -2.492220401763916, "logps/chosen": -254.71044921875, "logps/rejected": -222.6087188720703, "loss": 0.4794, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.30519407987594604, "rewards/margins": 0.7252100706100464, "rewards/rejected": -0.4200161099433899, "step": 520 }, { "epoch": 0.2543796496280298, "grad_norm": 38.5, "learning_rate": 4.646674532160041e-06, "logits/chosen": -2.657778739929199, "logits/rejected": -2.5602223873138428, "logps/chosen": -233.77200317382812, "logps/rejected": -212.9705810546875, "loss": 0.5216, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1333063244819641, "rewards/margins": 0.635170042514801, "rewards/rejected": -0.5018636584281921, "step": 530 }, { "epoch": 0.2591792656587473, "grad_norm": 48.75, "learning_rate": 4.6248936719871855e-06, "logits/chosen": -2.6264724731445312, "logits/rejected": -2.5388011932373047, "logps/chosen": -220.25064086914062, "logps/rejected": -194.9637451171875, "loss": 0.4623, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02713530696928501, "rewards/margins": 0.8179550170898438, "rewards/rejected": -0.7908197045326233, "step": 540 }, { "epoch": 0.2639788816894648, "grad_norm": 43.5, "learning_rate": 4.60251565608363e-06, "logits/chosen": -2.687124729156494, "logits/rejected": -2.6088709831237793, "logps/chosen": -234.5717010498047, "logps/rejected": -225.5623779296875, "loss": 0.4995, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.011210719123482704, "rewards/margins": 0.6683915257453918, "rewards/rejected": -0.6796022653579712, "step": 550 }, { "epoch": 0.26877849772018236, "grad_norm": 39.75, "learning_rate": 4.579546773310136e-06, "logits/chosen": -2.619655132293701, "logits/rejected": -2.446974754333496, "logps/chosen": -238.61904907226562, "logps/rejected": -216.04629516601562, "loss": 0.4966, "rewards/accuracies": 0.75, "rewards/chosen": 0.06748590618371964, "rewards/margins": 0.7394402027130127, "rewards/rejected": -0.6719542741775513, "step": 560 }, { "epoch": 0.2735781137508999, "grad_norm": 44.0, "learning_rate": 4.5559934785779115e-06, "logits/chosen": -2.667036771774292, "logits/rejected": -2.469698190689087, "logps/chosen": -243.6424560546875, "logps/rejected": -192.75326538085938, "loss": 0.4458, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.23885676264762878, "rewards/margins": 0.8359629511833191, "rewards/rejected": -0.5971060991287231, "step": 570 }, { "epoch": 0.27837772978161746, "grad_norm": 39.75, "learning_rate": 4.531862391034591e-06, "logits/chosen": -2.58920955657959, "logits/rejected": -2.4982354640960693, "logps/chosen": -232.32406616210938, "logps/rejected": -205.0082244873047, "loss": 0.4441, "rewards/accuracies": 0.8125, "rewards/chosen": 0.24206867814064026, "rewards/margins": 0.9316803216934204, "rewards/rejected": -0.689611554145813, "step": 580 }, { "epoch": 0.283177345812335, "grad_norm": 52.25, "learning_rate": 4.507160292204074e-06, "logits/chosen": -2.675287961959839, "logits/rejected": -2.5451769828796387, "logps/chosen": -237.7034149169922, "logps/rejected": -209.1581573486328, "loss": 0.4854, "rewards/accuracies": 0.78125, "rewards/chosen": -0.012547047808766365, "rewards/margins": 0.7794401049613953, "rewards/rejected": -0.7919871807098389, "step": 590 }, { "epoch": 0.28797696184305255, "grad_norm": 45.25, "learning_rate": 4.481894124080714e-06, "logits/chosen": -2.6691808700561523, "logits/rejected": -2.557055950164795, "logps/chosen": -236.5981903076172, "logps/rejected": -220.3197021484375, "loss": 0.4703, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.08622226119041443, "rewards/margins": 0.7651070952415466, "rewards/rejected": -0.8513293266296387, "step": 600 }, { "epoch": 0.28797696184305255, "eval_logits/chosen": -2.633901596069336, "eval_logits/rejected": -2.5215346813201904, "eval_logps/chosen": -233.1143798828125, "eval_logps/rejected": -213.52012634277344, "eval_loss": 0.4838341772556305, "eval_rewards/accuracies": 0.7940000295639038, "eval_rewards/chosen": 0.0018612403655424714, "eval_rewards/margins": 0.8007965683937073, "eval_rewards/rejected": -0.7989352941513062, "eval_runtime": 31.8483, "eval_samples_per_second": 31.399, "eval_steps_per_second": 7.85, "step": 600 }, { "epoch": 0.2927765778737701, "grad_norm": 37.0, "learning_rate": 4.456070987178427e-06, "logits/chosen": -2.636303424835205, "logits/rejected": -2.475978136062622, "logps/chosen": -220.087646484375, "logps/rejected": -184.07369995117188, "loss": 0.4705, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.108073391020298, "rewards/margins": 0.7934460639953613, "rewards/rejected": -0.9015194177627563, "step": 610 }, { "epoch": 0.29757619390448764, "grad_norm": 38.25, "learning_rate": 4.429698138535242e-06, "logits/chosen": -2.6021456718444824, "logits/rejected": -2.510624885559082, "logps/chosen": -237.7132110595703, "logps/rejected": -222.8521728515625, "loss": 0.5149, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.16316601634025574, "rewards/margins": 0.6969932317733765, "rewards/rejected": -0.5338272452354431, "step": 620 }, { "epoch": 0.3023758099352052, "grad_norm": 41.0, "learning_rate": 4.402782989673867e-06, "logits/chosen": -2.648524761199951, "logits/rejected": -2.49703311920166, "logps/chosen": -240.3001251220703, "logps/rejected": -206.89743041992188, "loss": 0.4332, "rewards/accuracies": 0.84375, "rewards/chosen": 0.16457466781139374, "rewards/margins": 0.8773033022880554, "rewards/rejected": -0.7127286195755005, "step": 630 }, { "epoch": 0.30717542596592273, "grad_norm": 53.25, "learning_rate": 4.375333104518842e-06, "logits/chosen": -2.567253589630127, "logits/rejected": -2.5297513008117676, "logps/chosen": -224.3708038330078, "logps/rejected": -223.1370086669922, "loss": 0.5021, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.020184341818094254, "rewards/margins": 0.7216410636901855, "rewards/rejected": -0.7418254613876343, "step": 640 }, { "epoch": 0.3119750419966403, "grad_norm": 52.0, "learning_rate": 4.347356197270852e-06, "logits/chosen": -2.629487991333008, "logits/rejected": -2.4799911975860596, "logps/chosen": -232.6698455810547, "logps/rejected": -211.57406616210938, "loss": 0.4987, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.05573948472738266, "rewards/margins": 0.8003142476081848, "rewards/rejected": -0.744574785232544, "step": 650 }, { "epoch": 0.3167746580273578, "grad_norm": 51.25, "learning_rate": 4.318860130238828e-06, "logits/chosen": -2.564279794692993, "logits/rejected": -2.4956247806549072, "logps/chosen": -216.66201782226562, "logps/rejected": -223.7574005126953, "loss": 0.4958, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.15754355490207672, "rewards/margins": 0.775588870048523, "rewards/rejected": -0.618045449256897, "step": 660 }, { "epoch": 0.32157427405807537, "grad_norm": 35.25, "learning_rate": 4.289852911630407e-06, "logits/chosen": -2.656132221221924, "logits/rejected": -2.506979465484619, "logps/chosen": -259.60101318359375, "logps/rejected": -215.4107208251953, "loss": 0.4511, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.25082486867904663, "rewards/margins": 0.930018424987793, "rewards/rejected": -0.6791934967041016, "step": 670 }, { "epoch": 0.3263738900887929, "grad_norm": 65.5, "learning_rate": 4.260342693301396e-06, "logits/chosen": -2.630061626434326, "logits/rejected": -2.4976465702056885, "logps/chosen": -234.99032592773438, "logps/rejected": -197.80252075195312, "loss": 0.4984, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.04209694266319275, "rewards/margins": 0.7394791841506958, "rewards/rejected": -0.6973822116851807, "step": 680 }, { "epoch": 0.33117350611951046, "grad_norm": 36.25, "learning_rate": 4.2303377684648735e-06, "logits/chosen": -2.6006178855895996, "logits/rejected": -2.531825304031372, "logps/chosen": -228.2000732421875, "logps/rejected": -233.7799530029297, "loss": 0.4636, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.18744561076164246, "rewards/margins": 0.8298454284667969, "rewards/rejected": -0.6423999071121216, "step": 690 }, { "epoch": 0.33597312215022795, "grad_norm": 44.0, "learning_rate": 4.199846569360558e-06, "logits/chosen": -2.6074843406677246, "logits/rejected": -2.5071351528167725, "logps/chosen": -234.10958862304688, "logps/rejected": -215.6813507080078, "loss": 0.5146, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.24338316917419434, "rewards/margins": 0.7707773447036743, "rewards/rejected": -0.52739417552948, "step": 700 }, { "epoch": 0.3407727381809455, "grad_norm": 47.0, "learning_rate": 4.168877664885104e-06, "logits/chosen": -2.610435962677002, "logits/rejected": -2.462646961212158, "logps/chosen": -227.91342163085938, "logps/rejected": -188.88912963867188, "loss": 0.4715, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.12846776843070984, "rewards/margins": 0.8934639096260071, "rewards/rejected": -0.7649961709976196, "step": 710 }, { "epoch": 0.34557235421166305, "grad_norm": 33.5, "learning_rate": 4.1374397581840035e-06, "logits/chosen": -2.6382699012756348, "logits/rejected": -2.5071628093719482, "logps/chosen": -226.78402709960938, "logps/rejected": -190.30447387695312, "loss": 0.4784, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.12799586355686188, "rewards/margins": 0.7761613130569458, "rewards/rejected": -0.6481654047966003, "step": 720 }, { "epoch": 0.3503719702423806, "grad_norm": 55.75, "learning_rate": 4.105541684205752e-06, "logits/chosen": -2.590768337249756, "logits/rejected": -2.486171007156372, "logps/chosen": -217.5499725341797, "logps/rejected": -202.66116333007812, "loss": 0.497, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.11741193383932114, "rewards/margins": 0.8474688529968262, "rewards/rejected": -0.7300569415092468, "step": 730 }, { "epoch": 0.35517158627309814, "grad_norm": 39.75, "learning_rate": 4.073192407218972e-06, "logits/chosen": -2.633659839630127, "logits/rejected": -2.501005172729492, "logps/chosen": -239.923095703125, "logps/rejected": -199.56039428710938, "loss": 0.4329, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.1744803935289383, "rewards/margins": 1.0109570026397705, "rewards/rejected": -0.8364765048027039, "step": 740 }, { "epoch": 0.3599712023038157, "grad_norm": 47.25, "learning_rate": 4.040401018293204e-06, "logits/chosen": -2.5651516914367676, "logits/rejected": -2.5003182888031006, "logps/chosen": -222.2145538330078, "logps/rejected": -236.4008331298828, "loss": 0.4993, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.06561844795942307, "rewards/margins": 0.7554530501365662, "rewards/rejected": -0.821071445941925, "step": 750 }, { "epoch": 0.36477081833453323, "grad_norm": 52.5, "learning_rate": 4.007176732744054e-06, "logits/chosen": -2.5720465183258057, "logits/rejected": -2.5482380390167236, "logps/chosen": -236.49081420898438, "logps/rejected": -238.8624725341797, "loss": 0.4684, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.019028931856155396, "rewards/margins": 0.9050389528274536, "rewards/rejected": -0.9240678548812866, "step": 760 }, { "epoch": 0.3695704343652508, "grad_norm": 41.5, "learning_rate": 3.9735288875434254e-06, "logits/chosen": -2.646091938018799, "logits/rejected": -2.4673545360565186, "logps/chosen": -244.1385040283203, "logps/rejected": -195.28207397460938, "loss": 0.4731, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.11497173458337784, "rewards/margins": 0.8432220220565796, "rewards/rejected": -0.9581937789916992, "step": 770 }, { "epoch": 0.3743700503959683, "grad_norm": 55.75, "learning_rate": 3.939466938695565e-06, "logits/chosen": -2.5559213161468506, "logits/rejected": -2.466019630432129, "logps/chosen": -253.15365600585938, "logps/rejected": -228.51220703125, "loss": 0.5308, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.10333450883626938, "rewards/margins": 0.7484658360481262, "rewards/rejected": -0.6451312303543091, "step": 780 }, { "epoch": 0.37916966642668587, "grad_norm": 36.25, "learning_rate": 3.905000458579657e-06, "logits/chosen": -2.570517063140869, "logits/rejected": -2.502270460128784, "logps/chosen": -209.8747100830078, "logps/rejected": -230.72579956054688, "loss": 0.5247, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.09132517874240875, "rewards/margins": 0.6891080141067505, "rewards/rejected": -0.5977829098701477, "step": 790 }, { "epoch": 0.3839692824574034, "grad_norm": 41.25, "learning_rate": 3.87013913325971e-06, "logits/chosen": -2.5875115394592285, "logits/rejected": -2.4476943016052246, "logps/chosen": -259.3215637207031, "logps/rejected": -214.0452423095703, "loss": 0.5223, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.05909506604075432, "rewards/margins": 0.7436100840568542, "rewards/rejected": -0.6845150589942932, "step": 800 }, { "epoch": 0.3839692824574034, "eval_logits/chosen": -2.6117820739746094, "eval_logits/rejected": -2.496819257736206, "eval_logps/chosen": -231.5717010498047, "eval_logps/rejected": -212.9447784423828, "eval_loss": 0.46309149265289307, "eval_rewards/accuracies": 0.7879999876022339, "eval_rewards/chosen": 0.15612684190273285, "eval_rewards/margins": 0.8975253701210022, "eval_rewards/rejected": -0.7413985729217529, "eval_runtime": 31.9562, "eval_samples_per_second": 31.293, "eval_steps_per_second": 7.823, "step": 800 }, { "epoch": 0.38876889848812096, "grad_norm": 41.75, "learning_rate": 3.8348927597624965e-06, "logits/chosen": -2.635223388671875, "logits/rejected": -2.5298709869384766, "logps/chosen": -231.2666473388672, "logps/rejected": -217.6613311767578, "loss": 0.4714, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.08412560075521469, "rewards/margins": 0.8177760243415833, "rewards/rejected": -0.7336505055427551, "step": 810 }, { "epoch": 0.3935685145188385, "grad_norm": 40.75, "learning_rate": 3.7992712433243117e-06, "logits/chosen": -2.6278603076934814, "logits/rejected": -2.470078945159912, "logps/chosen": -235.6140899658203, "logps/rejected": -188.5288848876953, "loss": 0.4796, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1967240571975708, "rewards/margins": 0.8465791940689087, "rewards/rejected": -1.0433032512664795, "step": 820 }, { "epoch": 0.39836813054955605, "grad_norm": 48.0, "learning_rate": 3.7632845946073136e-06, "logits/chosen": -2.6680498123168945, "logits/rejected": -2.513561964035034, "logps/chosen": -250.830810546875, "logps/rejected": -192.11917114257812, "loss": 0.4341, "rewards/accuracies": 0.8125, "rewards/chosen": -0.35141512751579285, "rewards/margins": 0.932411789894104, "rewards/rejected": -1.2838269472122192, "step": 830 }, { "epoch": 0.4031677465802736, "grad_norm": 40.0, "learning_rate": 3.7269429268862513e-06, "logits/chosen": -2.630174398422241, "logits/rejected": -2.5625929832458496, "logps/chosen": -213.17184448242188, "logps/rejected": -208.76736450195312, "loss": 0.4875, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3925122618675232, "rewards/margins": 0.8376442193984985, "rewards/rejected": -1.230156421661377, "step": 840 }, { "epoch": 0.40796736261099115, "grad_norm": 35.25, "learning_rate": 3.690256453206334e-06, "logits/chosen": -2.6113486289978027, "logits/rejected": -2.562234401702881, "logps/chosen": -217.27377319335938, "logps/rejected": -210.09207153320312, "loss": 0.4761, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.20575007796287537, "rewards/margins": 0.932608425617218, "rewards/rejected": -1.1383583545684814, "step": 850 }, { "epoch": 0.41276697864170864, "grad_norm": 54.5, "learning_rate": 3.6532354835130844e-06, "logits/chosen": -2.6446421146392822, "logits/rejected": -2.535243511199951, "logps/chosen": -249.38037109375, "logps/rejected": -225.61679077148438, "loss": 0.4788, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0025300472043454647, "rewards/margins": 0.9329078793525696, "rewards/rejected": -0.9354379773139954, "step": 860 }, { "epoch": 0.4175665946724262, "grad_norm": 43.5, "learning_rate": 3.6158904217549446e-06, "logits/chosen": -2.640524387359619, "logits/rejected": -2.5773513317108154, "logps/chosen": -225.6608123779297, "logps/rejected": -203.6904296875, "loss": 0.5129, "rewards/accuracies": 0.75, "rewards/chosen": -0.13905248045921326, "rewards/margins": 0.7723478078842163, "rewards/rejected": -0.911400318145752, "step": 870 }, { "epoch": 0.42236621070314373, "grad_norm": 56.5, "learning_rate": 3.5782317629594708e-06, "logits/chosen": -2.6161234378814697, "logits/rejected": -2.5211544036865234, "logps/chosen": -240.87353515625, "logps/rejected": -220.1843719482422, "loss": 0.5035, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.07784248143434525, "rewards/margins": 0.8290297389030457, "rewards/rejected": -0.7511872053146362, "step": 880 }, { "epoch": 0.4271658267338613, "grad_norm": 41.5, "learning_rate": 3.5402700902839317e-06, "logits/chosen": -2.5109052658081055, "logits/rejected": -2.469113826751709, "logps/chosen": -206.3893280029297, "logps/rejected": -216.2916717529297, "loss": 0.481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0023569464683532715, "rewards/margins": 0.8143336176872253, "rewards/rejected": -0.8166904449462891, "step": 890 }, { "epoch": 0.4319654427645788, "grad_norm": 38.0, "learning_rate": 3.5020160720411408e-06, "logits/chosen": -2.620961904525757, "logits/rejected": -2.4989562034606934, "logps/chosen": -232.7259979248047, "logps/rejected": -224.18222045898438, "loss": 0.457, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.1927359253168106, "rewards/margins": 0.9317037463188171, "rewards/rejected": -0.738967776298523, "step": 900 }, { "epoch": 0.43676505879529637, "grad_norm": 60.5, "learning_rate": 3.4634804587013505e-06, "logits/chosen": -2.5633127689361572, "logits/rejected": -2.51041841506958, "logps/chosen": -216.7091827392578, "logps/rejected": -217.47030639648438, "loss": 0.5084, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10815312713384628, "rewards/margins": 0.7729175090789795, "rewards/rejected": -0.8810704946517944, "step": 910 }, { "epoch": 0.4415646748260139, "grad_norm": 44.0, "learning_rate": 3.424674079871073e-06, "logits/chosen": -2.5757408142089844, "logits/rejected": -2.4900078773498535, "logps/chosen": -217.65444946289062, "logps/rejected": -204.16091918945312, "loss": 0.5214, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12332279980182648, "rewards/margins": 0.7060663104057312, "rewards/rejected": -0.8293890953063965, "step": 920 }, { "epoch": 0.44636429085673146, "grad_norm": 36.25, "learning_rate": 3.3856078412496424e-06, "logits/chosen": -2.6325020790100098, "logits/rejected": -2.5013327598571777, "logps/chosen": -240.5934295654297, "logps/rejected": -201.51132202148438, "loss": 0.4388, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.12551036477088928, "rewards/margins": 0.971343994140625, "rewards/rejected": -1.0968544483184814, "step": 930 }, { "epoch": 0.451163906887449, "grad_norm": 50.5, "learning_rate": 3.346292721564407e-06, "logits/chosen": -2.654001474380493, "logits/rejected": -2.5543465614318848, "logps/chosen": -265.34912109375, "logps/rejected": -228.43905639648438, "loss": 0.4906, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.23702020943164825, "rewards/margins": 0.903741717338562, "rewards/rejected": -1.1407619714736938, "step": 940 }, { "epoch": 0.45596352291816655, "grad_norm": 43.75, "learning_rate": 3.306739769485394e-06, "logits/chosen": -2.593740940093994, "logits/rejected": -2.4713730812072754, "logps/chosen": -235.24942016601562, "logps/rejected": -202.47262573242188, "loss": 0.446, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12331932783126831, "rewards/margins": 0.9522517919540405, "rewards/rejected": -1.075571060180664, "step": 950 }, { "epoch": 0.4607631389488841, "grad_norm": 49.75, "learning_rate": 3.266960100520316e-06, "logits/chosen": -2.6186511516571045, "logits/rejected": -2.5293595790863037, "logps/chosen": -207.3675537109375, "logps/rejected": -198.95132446289062, "loss": 0.445, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.22513869404792786, "rewards/margins": 1.0883581638336182, "rewards/rejected": -1.3134969472885132, "step": 960 }, { "epoch": 0.46556275497960165, "grad_norm": 35.75, "learning_rate": 3.2269648938907977e-06, "logits/chosen": -2.5796661376953125, "logits/rejected": -2.469088315963745, "logps/chosen": -219.3197479248047, "logps/rejected": -195.90982055664062, "loss": 0.4861, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.38874492049217224, "rewards/margins": 0.9575654864311218, "rewards/rejected": -1.3463106155395508, "step": 970 }, { "epoch": 0.4703623710103192, "grad_norm": 35.75, "learning_rate": 3.186765389390696e-06, "logits/chosen": -2.6649694442749023, "logits/rejected": -2.528609037399292, "logps/chosen": -255.8763885498047, "logps/rejected": -208.0857391357422, "loss": 0.4569, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.3541041314601898, "rewards/margins": 0.9702304005622864, "rewards/rejected": -1.3243346214294434, "step": 980 }, { "epoch": 0.47516198704103674, "grad_norm": 53.0, "learning_rate": 3.146372884227393e-06, "logits/chosen": -2.630551815032959, "logits/rejected": -2.5320029258728027, "logps/chosen": -253.5113983154297, "logps/rejected": -226.9222412109375, "loss": 0.5222, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.3444979786872864, "rewards/margins": 0.7771540880203247, "rewards/rejected": -1.1216518878936768, "step": 990 }, { "epoch": 0.4799616030717543, "grad_norm": 35.0, "learning_rate": 3.1057987298469693e-06, "logits/chosen": -2.55789852142334, "logits/rejected": -2.4452617168426514, "logps/chosen": -217.3821258544922, "logps/rejected": -195.25726318359375, "loss": 0.4335, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.20461265742778778, "rewards/margins": 1.0646028518676758, "rewards/rejected": -1.26921546459198, "step": 1000 }, { "epoch": 0.4799616030717543, "eval_logits/chosen": -2.6097192764282227, "eval_logits/rejected": -2.497605085372925, "eval_logps/chosen": -234.7398681640625, "eval_logps/rejected": -217.08157348632812, "eval_loss": 0.45548132061958313, "eval_rewards/accuracies": 0.7950000166893005, "eval_rewards/chosen": -0.16069155931472778, "eval_rewards/margins": 0.9943889379501343, "eval_rewards/rejected": -1.1550804376602173, "eval_runtime": 32.012, "eval_samples_per_second": 31.238, "eval_steps_per_second": 7.81, "step": 1000 }, { "epoch": 0.48476121910247183, "grad_norm": 35.75, "learning_rate": 3.06505432874411e-06, "logits/chosen": -2.5709657669067383, "logits/rejected": -2.432443141937256, "logps/chosen": -251.8423614501953, "logps/rejected": -219.88711547851562, "loss": 0.469, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.1605864316225052, "rewards/margins": 0.9071685671806335, "rewards/rejected": -1.0677549839019775, "step": 1010 }, { "epoch": 0.4895608351331893, "grad_norm": 44.0, "learning_rate": 3.024151131257688e-06, "logits/chosen": -2.601938009262085, "logits/rejected": -2.4874444007873535, "logps/chosen": -249.327880859375, "logps/rejected": -204.016357421875, "loss": 0.4554, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.34456175565719604, "rewards/margins": 0.9197514653205872, "rewards/rejected": -1.2643131017684937, "step": 1020 }, { "epoch": 0.49436045116390687, "grad_norm": 52.25, "learning_rate": 2.983100632352889e-06, "logits/chosen": -2.67118501663208, "logits/rejected": -2.495823860168457, "logps/chosen": -256.9388732910156, "logps/rejected": -210.40402221679688, "loss": 0.4574, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.21712104976177216, "rewards/margins": 1.0202686786651611, "rewards/rejected": -1.2373896837234497, "step": 1030 }, { "epoch": 0.4991600671946244, "grad_norm": 48.0, "learning_rate": 2.9419143683907987e-06, "logits/chosen": -2.5967347621917725, "logits/rejected": -2.5237019062042236, "logps/chosen": -223.098876953125, "logps/rejected": -219.5789031982422, "loss": 0.4829, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4448261260986328, "rewards/margins": 0.9350289106369019, "rewards/rejected": -1.3798550367355347, "step": 1040 }, { "epoch": 0.503959683225342, "grad_norm": 44.25, "learning_rate": 2.9006039138863572e-06, "logits/chosen": -2.5805346965789795, "logits/rejected": -2.4660849571228027, "logps/chosen": -247.05136108398438, "logps/rejected": -224.231201171875, "loss": 0.415, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.20926351845264435, "rewards/margins": 1.0519158840179443, "rewards/rejected": -1.2611795663833618, "step": 1050 }, { "epoch": 0.5087592992560596, "grad_norm": 36.0, "learning_rate": 2.8591808782555883e-06, "logits/chosen": -2.58505916595459, "logits/rejected": -2.5212624073028564, "logps/chosen": -233.4881134033203, "logps/rejected": -227.78945922851562, "loss": 0.4266, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09728523343801498, "rewards/margins": 1.1170459985733032, "rewards/rejected": -1.2143312692642212, "step": 1060 }, { "epoch": 0.5135589152867771, "grad_norm": 52.75, "learning_rate": 2.817656902553024e-06, "logits/chosen": -2.6064798831939697, "logits/rejected": -2.511019468307495, "logps/chosen": -234.0939483642578, "logps/rejected": -214.7052764892578, "loss": 0.4975, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.10685852915048599, "rewards/margins": 0.8924004435539246, "rewards/rejected": -0.9992589950561523, "step": 1070 }, { "epoch": 0.5183585313174947, "grad_norm": 52.75, "learning_rate": 2.7760436562002354e-06, "logits/chosen": -2.5962636470794678, "logits/rejected": -2.428013324737549, "logps/chosen": -267.91326904296875, "logps/rejected": -191.6916046142578, "loss": 0.5203, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.1610618531703949, "rewards/margins": 0.8274253606796265, "rewards/rejected": -0.9884872436523438, "step": 1080 }, { "epoch": 0.5231581473482121, "grad_norm": 39.0, "learning_rate": 2.7343528337063924e-06, "logits/chosen": -2.6854658126831055, "logits/rejected": -2.561249256134033, "logps/chosen": -250.7506866455078, "logps/rejected": -225.0154266357422, "loss": 0.4171, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.04954652860760689, "rewards/margins": 1.022822618484497, "rewards/rejected": -1.07236909866333, "step": 1090 }, { "epoch": 0.5279577633789296, "grad_norm": 28.75, "learning_rate": 2.692596151381774e-06, "logits/chosen": -2.5916976928710938, "logits/rejected": -2.5371453762054443, "logps/chosen": -202.2080078125, "logps/rejected": -212.92587280273438, "loss": 0.4372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.23717375099658966, "rewards/margins": 0.9503253698348999, "rewards/rejected": -1.1874991655349731, "step": 1100 }, { "epoch": 0.5327573794096472, "grad_norm": 43.5, "learning_rate": 2.650785344045149e-06, "logits/chosen": -2.6039624214172363, "logits/rejected": -2.515532970428467, "logps/chosen": -228.81729125976562, "logps/rejected": -217.0961151123047, "loss": 0.4695, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.10938185453414917, "rewards/margins": 1.068205714225769, "rewards/rejected": -1.177587628364563, "step": 1110 }, { "epoch": 0.5375569954403647, "grad_norm": 33.75, "learning_rate": 2.6089321617259583e-06, "logits/chosen": -2.6028213500976562, "logits/rejected": -2.479320764541626, "logps/chosen": -232.82186889648438, "logps/rejected": -216.1719207763672, "loss": 0.4028, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.030288416892290115, "rewards/margins": 1.2107077836990356, "rewards/rejected": -1.2409961223602295, "step": 1120 }, { "epoch": 0.5423566114710823, "grad_norm": 58.75, "learning_rate": 2.567048366362225e-06, "logits/chosen": -2.5917420387268066, "logits/rejected": -2.491093873977661, "logps/chosen": -242.26766967773438, "logps/rejected": -212.3538055419922, "loss": 0.4948, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2735952138900757, "rewards/margins": 0.8888144493103027, "rewards/rejected": -1.1624095439910889, "step": 1130 }, { "epoch": 0.5471562275017998, "grad_norm": 33.5, "learning_rate": 2.525145728495106e-06, "logits/chosen": -2.608853816986084, "logits/rejected": -2.5186550617218018, "logps/chosen": -230.33004760742188, "logps/rejected": -204.28524780273438, "loss": 0.4649, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2685418426990509, "rewards/margins": 1.0452083349227905, "rewards/rejected": -1.313750147819519, "step": 1140 }, { "epoch": 0.5519558435325174, "grad_norm": 38.0, "learning_rate": 2.4832360239610416e-06, "logits/chosen": -2.6019065380096436, "logits/rejected": -2.4967987537384033, "logps/chosen": -231.5498809814453, "logps/rejected": -213.8773651123047, "loss": 0.4629, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.29873836040496826, "rewards/margins": 0.9136406183242798, "rewards/rejected": -1.212378978729248, "step": 1150 }, { "epoch": 0.5567554595632349, "grad_norm": 49.25, "learning_rate": 2.441331030582407e-06, "logits/chosen": -2.615382671356201, "logits/rejected": -2.5341572761535645, "logps/chosen": -226.35079956054688, "logps/rejected": -216.3585662841797, "loss": 0.4676, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.21219320595264435, "rewards/margins": 0.8832317590713501, "rewards/rejected": -1.0954248905181885, "step": 1160 }, { "epoch": 0.5615550755939525, "grad_norm": 52.0, "learning_rate": 2.3994425248576102e-06, "logits/chosen": -2.6414036750793457, "logits/rejected": -2.504991054534912, "logps/chosen": -248.39572143554688, "logps/rejected": -202.54635620117188, "loss": 0.5142, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.26081639528274536, "rewards/margins": 0.7797340154647827, "rewards/rejected": -1.0405504703521729, "step": 1170 }, { "epoch": 0.56635469162467, "grad_norm": 56.0, "learning_rate": 2.357582278651553e-06, "logits/chosen": -2.580275535583496, "logits/rejected": -2.4745254516601562, "logps/chosen": -234.48818969726562, "logps/rejected": -222.6685028076172, "loss": 0.4538, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.2316078245639801, "rewards/margins": 1.032219409942627, "rewards/rejected": -1.2638272047042847, "step": 1180 }, { "epoch": 0.5711543076553875, "grad_norm": 44.5, "learning_rate": 2.315762055887411e-06, "logits/chosen": -2.6241419315338135, "logits/rejected": -2.4953231811523438, "logps/chosen": -246.7222442626953, "logps/rejected": -206.4970703125, "loss": 0.5141, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.31947702169418335, "rewards/margins": 0.9516233205795288, "rewards/rejected": -1.271100401878357, "step": 1190 }, { "epoch": 0.5759539236861051, "grad_norm": 37.5, "learning_rate": 2.273993609240629e-06, "logits/chosen": -2.5731539726257324, "logits/rejected": -2.483686685562134, "logps/chosen": -229.90762329101562, "logps/rejected": -225.7178955078125, "loss": 0.5214, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3317897915840149, "rewards/margins": 0.849661648273468, "rewards/rejected": -1.1814515590667725, "step": 1200 }, { "epoch": 0.5759539236861051, "eval_logits/chosen": -2.6119532585144043, "eval_logits/rejected": -2.5000758171081543, "eval_logps/chosen": -234.451904296875, "eval_logps/rejected": -217.00180053710938, "eval_loss": 0.45114314556121826, "eval_rewards/accuracies": 0.7990000247955322, "eval_rewards/chosen": -0.13189174234867096, "eval_rewards/margins": 1.0152093172073364, "eval_rewards/rejected": -1.1471011638641357, "eval_runtime": 31.9635, "eval_samples_per_second": 31.286, "eval_steps_per_second": 7.821, "step": 1200 }, { "epoch": 0.5807535397168226, "grad_norm": 59.0, "learning_rate": 2.2322886768360874e-06, "logits/chosen": -2.513803005218506, "logits/rejected": -2.4552152156829834, "logps/chosen": -239.845703125, "logps/rejected": -215.5902099609375, "loss": 0.4561, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.0830584168434143, "rewards/margins": 1.117357850074768, "rewards/rejected": -1.2004162073135376, "step": 1210 }, { "epoch": 0.5855531557475402, "grad_norm": 49.75, "learning_rate": 2.190658978949352e-06, "logits/chosen": -2.607837438583374, "logits/rejected": -2.4749207496643066, "logps/chosen": -220.0395050048828, "logps/rejected": -194.2960968017578, "loss": 0.4986, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.1924716681241989, "rewards/margins": 0.8406950235366821, "rewards/rejected": -1.0331666469573975, "step": 1220 }, { "epoch": 0.5903527717782577, "grad_norm": 57.25, "learning_rate": 2.149116214712943e-06, "logits/chosen": -2.6154885292053223, "logits/rejected": -2.509730577468872, "logps/chosen": -234.84402465820312, "logps/rejected": -219.07339477539062, "loss": 0.4841, "rewards/accuracies": 0.75, "rewards/chosen": -0.1697658747434616, "rewards/margins": 0.8840200304985046, "rewards/rejected": -1.053786039352417, "step": 1230 }, { "epoch": 0.5951523878089753, "grad_norm": 55.0, "learning_rate": 2.107672058828544e-06, "logits/chosen": -2.6242995262145996, "logits/rejected": -2.51531720161438, "logps/chosen": -228.4006805419922, "logps/rejected": -202.6013641357422, "loss": 0.474, "rewards/accuracies": 0.78125, "rewards/chosen": -0.13985280692577362, "rewards/margins": 0.875139594078064, "rewards/rejected": -1.0149924755096436, "step": 1240 }, { "epoch": 0.5999520038396928, "grad_norm": 60.5, "learning_rate": 2.066338158286083e-06, "logits/chosen": -2.6175296306610107, "logits/rejected": -2.549783229827881, "logps/chosen": -227.7768096923828, "logps/rejected": -222.3975830078125, "loss": 0.3924, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.06711763888597488, "rewards/margins": 1.2078065872192383, "rewards/rejected": -1.2749242782592773, "step": 1250 }, { "epoch": 0.6047516198704104, "grad_norm": 43.5, "learning_rate": 2.025126129090588e-06, "logits/chosen": -2.673826217651367, "logits/rejected": -2.53361177444458, "logps/chosen": -222.8029327392578, "logps/rejected": -191.4785919189453, "loss": 0.4424, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.0933801457285881, "rewards/margins": 1.0600812435150146, "rewards/rejected": -1.1534613370895386, "step": 1260 }, { "epoch": 0.6095512359011279, "grad_norm": 49.75, "learning_rate": 1.9840475529977655e-06, "logits/chosen": -2.6106059551239014, "logits/rejected": -2.5068392753601074, "logps/chosen": -230.1819610595703, "logps/rejected": -208.9829559326172, "loss": 0.4371, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.1759682148694992, "rewards/margins": 1.1224491596221924, "rewards/rejected": -1.298417329788208, "step": 1270 }, { "epoch": 0.6143508519318455, "grad_norm": 40.0, "learning_rate": 1.9431139742591897e-06, "logits/chosen": -2.594632625579834, "logits/rejected": -2.489180088043213, "logps/chosen": -209.05859375, "logps/rejected": -198.956298828125, "loss": 0.4101, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.08843652904033661, "rewards/margins": 1.0289084911346436, "rewards/rejected": -1.117344856262207, "step": 1280 }, { "epoch": 0.619150467962563, "grad_norm": 49.0, "learning_rate": 1.9023368963780458e-06, "logits/chosen": -2.6134049892425537, "logits/rejected": -2.5106282234191895, "logps/chosen": -234.5142059326172, "logps/rejected": -207.1331329345703, "loss": 0.4701, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.14769946038722992, "rewards/margins": 0.9224032163619995, "rewards/rejected": -1.0701026916503906, "step": 1290 }, { "epoch": 0.6239500839932806, "grad_norm": 42.0, "learning_rate": 1.861727778876314e-06, "logits/chosen": -2.5988595485687256, "logits/rejected": -2.4945011138916016, "logps/chosen": -209.90835571289062, "logps/rejected": -185.820068359375, "loss": 0.4437, "rewards/accuracies": 0.78125, "rewards/chosen": -0.21710054576396942, "rewards/margins": 1.0192363262176514, "rewards/rejected": -1.2363369464874268, "step": 1300 }, { "epoch": 0.6287497000239981, "grad_norm": 37.0, "learning_rate": 1.8212980340743152e-06, "logits/chosen": -2.6286704540252686, "logits/rejected": -2.5620837211608887, "logps/chosen": -225.5823974609375, "logps/rejected": -215.69375610351562, "loss": 0.4882, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.22902747988700867, "rewards/margins": 0.9663082957267761, "rewards/rejected": -1.195335865020752, "step": 1310 }, { "epoch": 0.6335493160547156, "grad_norm": 39.75, "learning_rate": 1.7810590238835279e-06, "logits/chosen": -2.5562384128570557, "logits/rejected": -2.534426212310791, "logps/chosen": -226.40902709960938, "logps/rejected": -251.94418334960938, "loss": 0.4495, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.19038750231266022, "rewards/margins": 1.0307300090789795, "rewards/rejected": -1.2211174964904785, "step": 1320 }, { "epoch": 0.6383489320854332, "grad_norm": 43.0, "learning_rate": 1.7410220566135605e-06, "logits/chosen": -2.6242737770080566, "logits/rejected": -2.5161290168762207, "logps/chosen": -228.24398803710938, "logps/rejected": -207.40316772460938, "loss": 0.4474, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.06271727383136749, "rewards/margins": 1.0257116556167603, "rewards/rejected": -1.0884288549423218, "step": 1330 }, { "epoch": 0.6431485481161507, "grad_norm": 58.5, "learning_rate": 1.7011983837942023e-06, "logits/chosen": -2.6005454063415527, "logits/rejected": -2.4911046028137207, "logps/chosen": -235.7122344970703, "logps/rejected": -217.5489044189453, "loss": 0.4614, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.12799616158008575, "rewards/margins": 0.9990032315254211, "rewards/rejected": -1.1269992589950562, "step": 1340 }, { "epoch": 0.6479481641468683, "grad_norm": 45.25, "learning_rate": 1.661599197013416e-06, "logits/chosen": -2.633547306060791, "logits/rejected": -2.5312139987945557, "logps/chosen": -223.9100341796875, "logps/rejected": -203.5366973876953, "loss": 0.4765, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2294052541255951, "rewards/margins": 0.95598304271698, "rewards/rejected": -1.185388207435608, "step": 1350 }, { "epoch": 0.6527477801775858, "grad_norm": 46.75, "learning_rate": 1.6222356247721831e-06, "logits/chosen": -2.5966320037841797, "logits/rejected": -2.4941086769104004, "logps/chosen": -234.2575225830078, "logps/rejected": -221.3917999267578, "loss": 0.4476, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.12550556659698486, "rewards/margins": 1.0439780950546265, "rewards/rejected": -1.1694835424423218, "step": 1360 }, { "epoch": 0.6575473962083034, "grad_norm": 46.0, "learning_rate": 1.5831187293570826e-06, "logits/chosen": -2.616199016571045, "logits/rejected": -2.5082523822784424, "logps/chosen": -275.43511962890625, "logps/rejected": -229.6664276123047, "loss": 0.4631, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2143980711698532, "rewards/margins": 1.0123686790466309, "rewards/rejected": -1.22676682472229, "step": 1370 }, { "epoch": 0.6623470122390209, "grad_norm": 48.5, "learning_rate": 1.544259503731465e-06, "logits/chosen": -2.612053632736206, "logits/rejected": -2.4867258071899414, "logps/chosen": -240.70059204101562, "logps/rejected": -198.64918518066406, "loss": 0.3862, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.0024546384811401367, "rewards/margins": 1.1859334707260132, "rewards/rejected": -1.1883881092071533, "step": 1380 }, { "epoch": 0.6671466282697385, "grad_norm": 36.0, "learning_rate": 1.5056688684461235e-06, "logits/chosen": -2.6124138832092285, "logits/rejected": -2.491760730743408, "logps/chosen": -245.6887969970703, "logps/rejected": -216.34268188476562, "loss": 0.4459, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.3357298672199249, "rewards/margins": 1.0037829875946045, "rewards/rejected": -1.3395130634307861, "step": 1390 }, { "epoch": 0.6719462443004559, "grad_norm": 54.0, "learning_rate": 1.4673576685703027e-06, "logits/chosen": -2.6035733222961426, "logits/rejected": -2.521059513092041, "logps/chosen": -243.61123657226562, "logps/rejected": -220.312744140625, "loss": 0.4784, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.27499574422836304, "rewards/margins": 0.896142840385437, "rewards/rejected": -1.1711386442184448, "step": 1400 }, { "epoch": 0.6719462443004559, "eval_logits/chosen": -2.6097218990325928, "eval_logits/rejected": -2.4986746311187744, "eval_logps/chosen": -234.91114807128906, "eval_logps/rejected": -217.71800231933594, "eval_loss": 0.4486246407032013, "eval_rewards/accuracies": 0.800000011920929, "eval_rewards/chosen": -0.17781423032283783, "eval_rewards/margins": 1.0409064292907715, "eval_rewards/rejected": -1.218720555305481, "eval_runtime": 31.822, "eval_samples_per_second": 31.425, "eval_steps_per_second": 7.856, "step": 1400 }, { "epoch": 0.6767458603311735, "grad_norm": 28.0, "learning_rate": 1.4293366706439293e-06, "logits/chosen": -2.5871994495391846, "logits/rejected": -2.4666335582733154, "logps/chosen": -221.1563262939453, "logps/rejected": -207.7967529296875, "loss": 0.4466, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.30613771080970764, "rewards/margins": 1.0556987524032593, "rewards/rejected": -1.361836314201355, "step": 1410 }, { "epoch": 0.681545476361891, "grad_norm": 41.25, "learning_rate": 1.3916165596519015e-06, "logits/chosen": -2.6185154914855957, "logits/rejected": -2.452584981918335, "logps/chosen": -233.75131225585938, "logps/rejected": -198.76773071289062, "loss": 0.4315, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.31702926754951477, "rewards/margins": 0.9871221780776978, "rewards/rejected": -1.3041512966156006, "step": 1420 }, { "epoch": 0.6863450923926085, "grad_norm": 47.0, "learning_rate": 1.3542079360213089e-06, "logits/chosen": -2.6181461811065674, "logits/rejected": -2.469588041305542, "logps/chosen": -224.7504425048828, "logps/rejected": -193.1361541748047, "loss": 0.4592, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.17981493473052979, "rewards/margins": 0.9263655543327332, "rewards/rejected": -1.1061805486679077, "step": 1430 }, { "epoch": 0.6911447084233261, "grad_norm": 52.75, "learning_rate": 1.317121312642406e-06, "logits/chosen": -2.6066277027130127, "logits/rejected": -2.465261936187744, "logps/chosen": -233.60720825195312, "logps/rejected": -211.9008026123047, "loss": 0.4429, "rewards/accuracies": 0.78125, "rewards/chosen": -0.18064916133880615, "rewards/margins": 1.064951777458191, "rewards/rejected": -1.245600938796997, "step": 1440 }, { "epoch": 0.6959443244540436, "grad_norm": 45.0, "learning_rate": 1.2803671119141953e-06, "logits/chosen": -2.5287094116210938, "logits/rejected": -2.4450387954711914, "logps/chosen": -248.5905303955078, "logps/rejected": -240.5746307373047, "loss": 0.478, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2681611478328705, "rewards/margins": 1.0177576541900635, "rewards/rejected": -1.285918951034546, "step": 1450 }, { "epoch": 0.7007439404847612, "grad_norm": 34.5, "learning_rate": 1.2439556628154293e-06, "logits/chosen": -2.612083911895752, "logits/rejected": -2.450336456298828, "logps/chosen": -250.229248046875, "logps/rejected": -219.356689453125, "loss": 0.4872, "rewards/accuracies": 0.75, "rewards/chosen": -0.24559959769248962, "rewards/margins": 0.9850479960441589, "rewards/rejected": -1.2306474447250366, "step": 1460 }, { "epoch": 0.7055435565154787, "grad_norm": 41.5, "learning_rate": 1.207897198001878e-06, "logits/chosen": -2.644726037979126, "logits/rejected": -2.5499677658081055, "logps/chosen": -232.20126342773438, "logps/rejected": -208.1292724609375, "loss": 0.4442, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.15714068710803986, "rewards/margins": 1.035380482673645, "rewards/rejected": -1.192521095275879, "step": 1470 }, { "epoch": 0.7103431725461963, "grad_norm": 42.5, "learning_rate": 1.1722018509306587e-06, "logits/chosen": -2.6035397052764893, "logits/rejected": -2.4596409797668457, "logps/chosen": -250.045166015625, "logps/rejected": -202.33425903320312, "loss": 0.4371, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.22928068041801453, "rewards/margins": 1.1220018863677979, "rewards/rejected": -1.3512827157974243, "step": 1480 }, { "epoch": 0.7151427885769138, "grad_norm": 38.5, "learning_rate": 1.1368796530124442e-06, "logits/chosen": -2.5670337677001953, "logits/rejected": -2.446946620941162, "logps/chosen": -252.01260375976562, "logps/rejected": -207.9329071044922, "loss": 0.4135, "rewards/accuracies": 0.84375, "rewards/chosen": -0.18759331107139587, "rewards/margins": 1.0254666805267334, "rewards/rejected": -1.213059902191162, "step": 1490 }, { "epoch": 0.7199424046076314, "grad_norm": 51.25, "learning_rate": 1.101940530792356e-06, "logits/chosen": -2.6158971786499023, "logits/rejected": -2.5017120838165283, "logps/chosen": -245.5004425048828, "logps/rejected": -214.7168731689453, "loss": 0.4562, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.01713145337998867, "rewards/margins": 0.9989277124404907, "rewards/rejected": -1.016059160232544, "step": 1500 }, { "epoch": 0.7247420206383489, "grad_norm": 30.5, "learning_rate": 1.0673943031603134e-06, "logits/chosen": -2.605909824371338, "logits/rejected": -2.5186052322387695, "logps/chosen": -225.20675659179688, "logps/rejected": -219.88949584960938, "loss": 0.4336, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.2961525321006775, "rewards/margins": 1.0499502420425415, "rewards/rejected": -1.3461029529571533, "step": 1510 }, { "epoch": 0.7295416366690665, "grad_norm": 42.75, "learning_rate": 1.0332506785916524e-06, "logits/chosen": -2.590073823928833, "logits/rejected": -2.493535041809082, "logps/chosen": -241.81918334960938, "logps/rejected": -221.04598999023438, "loss": 0.4794, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2541709542274475, "rewards/margins": 0.9314506649971008, "rewards/rejected": -1.1856216192245483, "step": 1520 }, { "epoch": 0.734341252699784, "grad_norm": 44.5, "learning_rate": 9.995192524187639e-07, "logits/chosen": -2.5296549797058105, "logits/rejected": -2.480565071105957, "logps/chosen": -224.52774047851562, "logps/rejected": -220.7323760986328, "loss": 0.506, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.37830549478530884, "rewards/margins": 0.9571624994277954, "rewards/rejected": -1.335468053817749, "step": 1530 }, { "epoch": 0.7391408687305016, "grad_norm": 48.0, "learning_rate": 9.662095041345318e-07, "logits/chosen": -2.569598436355591, "logits/rejected": -2.4485225677490234, "logps/chosen": -247.4419708251953, "logps/rejected": -228.1896209716797, "loss": 0.4693, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.18139569461345673, "rewards/margins": 1.0061061382293701, "rewards/rejected": -1.187502145767212, "step": 1540 }, { "epoch": 0.7439404847612191, "grad_norm": 46.75, "learning_rate": 9.333307947283258e-07, "logits/chosen": -2.628610610961914, "logits/rejected": -2.5239391326904297, "logps/chosen": -243.1181640625, "logps/rejected": -224.0470733642578, "loss": 0.448, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.21100196242332458, "rewards/margins": 0.9517234563827515, "rewards/rejected": -1.1627254486083984, "step": 1550 }, { "epoch": 0.7487401007919366, "grad_norm": 34.75, "learning_rate": 9.00892364055298e-07, "logits/chosen": -2.587705135345459, "logits/rejected": -2.5059056282043457, "logps/chosen": -214.00631713867188, "logps/rejected": -191.03036499023438, "loss": 0.4244, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.27500849962234497, "rewards/margins": 1.0308496952056885, "rewards/rejected": -1.3058582544326782, "step": 1560 }, { "epoch": 0.7535397168226542, "grad_norm": 47.5, "learning_rate": 8.689033282397166e-07, "logits/chosen": -2.6136879920959473, "logits/rejected": -2.5038022994995117, "logps/chosen": -231.7812957763672, "logps/rejected": -214.75051879882812, "loss": 0.4818, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.29626065492630005, "rewards/margins": 0.9530956149101257, "rewards/rejected": -1.2493562698364258, "step": 1570 }, { "epoch": 0.7583393328533717, "grad_norm": 30.125, "learning_rate": 8.373726771130769e-07, "logits/chosen": -2.607466220855713, "logits/rejected": -2.4886395931243896, "logps/chosen": -242.94442749023438, "logps/rejected": -211.0756072998047, "loss": 0.4417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.22200937569141388, "rewards/margins": 1.0732426643371582, "rewards/rejected": -1.2952520847320557, "step": 1580 }, { "epoch": 0.7631389488840893, "grad_norm": 54.5, "learning_rate": 8.063092716877016e-07, "logits/chosen": -2.549015998840332, "logits/rejected": -2.454470157623291, "logps/chosen": -255.9425811767578, "logps/rejected": -221.3877410888672, "loss": 0.4876, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1736731380224228, "rewards/margins": 0.8451417088508606, "rewards/rejected": -1.0188149213790894, "step": 1590 }, { "epoch": 0.7679385649148068, "grad_norm": 37.75, "learning_rate": 7.757218416665446e-07, "logits/chosen": -2.6379103660583496, "logits/rejected": -2.488036632537842, "logps/chosen": -232.8388214111328, "logps/rejected": -202.93429565429688, "loss": 0.4223, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2727198600769043, "rewards/margins": 1.1647435426712036, "rewards/rejected": -1.4374632835388184, "step": 1600 }, { "epoch": 0.7679385649148068, "eval_logits/chosen": -2.609999656677246, "eval_logits/rejected": -2.498737096786499, "eval_logps/chosen": -234.40658569335938, "eval_logps/rejected": -217.20668029785156, "eval_loss": 0.44873249530792236, "eval_rewards/accuracies": 0.7990000247955322, "eval_rewards/chosen": -0.12736062705516815, "eval_rewards/margins": 1.040230393409729, "eval_rewards/rejected": -1.1675910949707031, "eval_runtime": 31.9547, "eval_samples_per_second": 31.294, "eval_steps_per_second": 7.824, "step": 1600 }, { "epoch": 0.7727381809455244, "grad_norm": 40.5, "learning_rate": 7.456189829898955e-07, "logits/chosen": -2.617159366607666, "logits/rejected": -2.4671359062194824, "logps/chosen": -239.3896484375, "logps/rejected": -200.057861328125, "loss": 0.4496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.19172294437885284, "rewards/margins": 1.0450642108917236, "rewards/rejected": -1.23678719997406, "step": 1610 }, { "epoch": 0.7775377969762419, "grad_norm": 37.0, "learning_rate": 7.160091554196732e-07, "logits/chosen": -2.6596245765686035, "logits/rejected": -2.534268617630005, "logps/chosen": -236.6833953857422, "logps/rejected": -209.33248901367188, "loss": 0.4434, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.2100035846233368, "rewards/margins": 1.0765715837478638, "rewards/rejected": -1.2865750789642334, "step": 1620 }, { "epoch": 0.7823374130069595, "grad_norm": 48.5, "learning_rate": 6.869006801619941e-07, "logits/chosen": -2.57913875579834, "logits/rejected": -2.5170235633850098, "logps/chosen": -243.2036895751953, "logps/rejected": -240.40792846679688, "loss": 0.4967, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.19071198999881744, "rewards/margins": 1.0176457166671753, "rewards/rejected": -1.208357810974121, "step": 1630 }, { "epoch": 0.787137029037677, "grad_norm": 54.5, "learning_rate": 6.583017375286726e-07, "logits/chosen": -2.5934314727783203, "logits/rejected": -2.4739251136779785, "logps/chosen": -231.6185760498047, "logps/rejected": -208.304443359375, "loss": 0.4574, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.2090485543012619, "rewards/margins": 1.0793238878250122, "rewards/rejected": -1.2883723974227905, "step": 1640 }, { "epoch": 0.7919366450683946, "grad_norm": 41.75, "learning_rate": 6.30220364638324e-07, "logits/chosen": -2.6110548973083496, "logits/rejected": -2.519888401031494, "logps/chosen": -244.562255859375, "logps/rejected": -207.3623504638672, "loss": 0.4747, "rewards/accuracies": 0.78125, "rewards/chosen": -0.21555647253990173, "rewards/margins": 0.8874263763427734, "rewards/rejected": -1.1029828786849976, "step": 1650 }, { "epoch": 0.7967362610991121, "grad_norm": 45.25, "learning_rate": 6.02664453157703e-07, "logits/chosen": -2.6588504314422607, "logits/rejected": -2.5547537803649902, "logps/chosen": -235.1988983154297, "logps/rejected": -226.4233856201172, "loss": 0.4643, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2098228931427002, "rewards/margins": 0.9475749731063843, "rewards/rejected": -1.157397985458374, "step": 1660 }, { "epoch": 0.8015358771298297, "grad_norm": 38.75, "learning_rate": 5.756417470839195e-07, "logits/chosen": -2.6417441368103027, "logits/rejected": -2.549515724182129, "logps/chosen": -229.06411743164062, "logps/rejected": -208.6210174560547, "loss": 0.445, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.18316276371479034, "rewards/margins": 0.9890239834785461, "rewards/rejected": -1.1721866130828857, "step": 1670 }, { "epoch": 0.8063354931605472, "grad_norm": 46.0, "learning_rate": 5.491598405681559e-07, "logits/chosen": -2.673280954360962, "logits/rejected": -2.498465061187744, "logps/chosen": -247.2686767578125, "logps/rejected": -202.4930419921875, "loss": 0.4942, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.17769768834114075, "rewards/margins": 0.8722941279411316, "rewards/rejected": -1.0499918460845947, "step": 1680 }, { "epoch": 0.8111351091912647, "grad_norm": 35.25, "learning_rate": 5.232261757814924e-07, "logits/chosen": -2.571895122528076, "logits/rejected": -2.438910722732544, "logps/chosen": -241.5240936279297, "logps/rejected": -213.72073364257812, "loss": 0.4317, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.1457543671131134, "rewards/margins": 1.0501019954681396, "rewards/rejected": -1.1958563327789307, "step": 1690 }, { "epoch": 0.8159347252219823, "grad_norm": 50.0, "learning_rate": 4.978480408234465e-07, "logits/chosen": -2.5237784385681152, "logits/rejected": -2.4956631660461426, "logps/chosen": -216.0598602294922, "logps/rejected": -214.1880340576172, "loss": 0.5087, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.20147447288036346, "rewards/margins": 0.8618594408035278, "rewards/rejected": -1.0633338689804077, "step": 1700 }, { "epoch": 0.8207343412526998, "grad_norm": 51.75, "learning_rate": 4.73032567673809e-07, "logits/chosen": -2.6058714389801025, "logits/rejected": -2.504986047744751, "logps/chosen": -230.72103881835938, "logps/rejected": -206.2929229736328, "loss": 0.4769, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.30534833669662476, "rewards/margins": 0.8801034092903137, "rewards/rejected": -1.1854515075683594, "step": 1710 }, { "epoch": 0.8255339572834173, "grad_norm": 46.75, "learning_rate": 4.487867301883528e-07, "logits/chosen": -2.5329365730285645, "logits/rejected": -2.43863844871521, "logps/chosen": -225.4794921875, "logps/rejected": -214.2443389892578, "loss": 0.5013, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2076941281557083, "rewards/margins": 0.8862035870552063, "rewards/rejected": -1.0938977003097534, "step": 1720 }, { "epoch": 0.8303335733141348, "grad_norm": 42.25, "learning_rate": 4.2511734213898093e-07, "logits/chosen": -2.64858341217041, "logits/rejected": -2.5094025135040283, "logps/chosen": -254.30532836914062, "logps/rejected": -218.06900024414062, "loss": 0.5122, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.28009095788002014, "rewards/margins": 0.8344430923461914, "rewards/rejected": -1.1145341396331787, "step": 1730 }, { "epoch": 0.8351331893448524, "grad_norm": 69.5, "learning_rate": 4.020310552988632e-07, "logits/chosen": -2.620943546295166, "logits/rejected": -2.456711530685425, "logps/chosen": -250.24374389648438, "logps/rejected": -217.15219116210938, "loss": 0.4241, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.21671004593372345, "rewards/margins": 1.0417428016662598, "rewards/rejected": -1.2584527730941772, "step": 1740 }, { "epoch": 0.8399328053755699, "grad_norm": 51.5, "learning_rate": 3.7953435757309756e-07, "logits/chosen": -2.628960132598877, "logits/rejected": -2.504699468612671, "logps/chosen": -253.9635467529297, "logps/rejected": -227.1519317626953, "loss": 0.5159, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.17710088193416595, "rewards/margins": 0.8642765879631042, "rewards/rejected": -1.0413774251937866, "step": 1750 }, { "epoch": 0.8447324214062875, "grad_norm": 52.25, "learning_rate": 3.5763357117542364e-07, "logits/chosen": -2.6271114349365234, "logits/rejected": -2.5810532569885254, "logps/chosen": -237.49758911132812, "logps/rejected": -229.40185546875, "loss": 0.4646, "rewards/accuracies": 0.75, "rewards/chosen": -0.2700980305671692, "rewards/margins": 1.0334694385528564, "rewards/rejected": -1.3035674095153809, "step": 1760 }, { "epoch": 0.849532037437005, "grad_norm": 63.75, "learning_rate": 3.363348508515016e-07, "logits/chosen": -2.6458356380462646, "logits/rejected": -2.519512176513672, "logps/chosen": -238.7860565185547, "logps/rejected": -220.2080078125, "loss": 0.5092, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.2170904129743576, "rewards/margins": 0.8388406038284302, "rewards/rejected": -1.0559309720993042, "step": 1770 }, { "epoch": 0.8543316534677226, "grad_norm": 45.25, "learning_rate": 3.156441821492506e-07, "logits/chosen": -2.602996349334717, "logits/rejected": -2.490618944168091, "logps/chosen": -237.382568359375, "logps/rejected": -219.4293975830078, "loss": 0.4473, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2120467871427536, "rewards/margins": 1.101704478263855, "rewards/rejected": -1.313751220703125, "step": 1780 }, { "epoch": 0.8591312694984401, "grad_norm": 38.75, "learning_rate": 2.9556737973674117e-07, "logits/chosen": -2.609147310256958, "logits/rejected": -2.473086357116699, "logps/chosen": -242.01962280273438, "logps/rejected": -202.3125762939453, "loss": 0.4658, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.25016945600509644, "rewards/margins": 1.0174734592437744, "rewards/rejected": -1.267642855644226, "step": 1790 }, { "epoch": 0.8639308855291576, "grad_norm": 51.25, "learning_rate": 2.761100857681068e-07, "logits/chosen": -2.625156879425049, "logits/rejected": -2.513139486312866, "logps/chosen": -219.42385864257812, "logps/rejected": -200.65432739257812, "loss": 0.5114, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.38931506872177124, "rewards/margins": 0.8760172724723816, "rewards/rejected": -1.2653322219848633, "step": 1800 }, { "epoch": 0.8639308855291576, "eval_logits/chosen": -2.6100218296051025, "eval_logits/rejected": -2.4987967014312744, "eval_logps/chosen": -234.25213623046875, "eval_logps/rejected": -217.06610107421875, "eval_loss": 0.44831663370132446, "eval_rewards/accuracies": 0.7990000247955322, "eval_rewards/chosen": -0.11191659420728683, "eval_rewards/margins": 1.0416151285171509, "eval_rewards/rejected": -1.1535316705703735, "eval_runtime": 31.9766, "eval_samples_per_second": 31.273, "eval_steps_per_second": 7.818, "step": 1800 }, { "epoch": 0.8687305015598752, "grad_norm": 40.75, "learning_rate": 2.5727776829793774e-07, "logits/chosen": -2.6402649879455566, "logits/rejected": -2.48222017288208, "logps/chosen": -238.41708374023438, "logps/rejected": -186.76150512695312, "loss": 0.4385, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.12383349239826202, "rewards/margins": 1.0703352689743042, "rewards/rejected": -1.1941686868667603, "step": 1810 }, { "epoch": 0.8735301175905927, "grad_norm": 64.5, "learning_rate": 2.3907571974460255e-07, "logits/chosen": -2.614182949066162, "logits/rejected": -2.4683122634887695, "logps/chosen": -242.4246063232422, "logps/rejected": -199.64266967773438, "loss": 0.4524, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.18747808039188385, "rewards/margins": 0.9742029905319214, "rewards/rejected": -1.1616809368133545, "step": 1820 }, { "epoch": 0.8783297336213103, "grad_norm": 52.5, "learning_rate": 2.2150905540292589e-07, "logits/chosen": -2.6297433376312256, "logits/rejected": -2.4974730014801025, "logps/chosen": -229.2626953125, "logps/rejected": -213.91567993164062, "loss": 0.4491, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.12955379486083984, "rewards/margins": 0.960136890411377, "rewards/rejected": -1.0896905660629272, "step": 1830 }, { "epoch": 0.8831293496520278, "grad_norm": 39.5, "learning_rate": 2.0458271200664626e-07, "logits/chosen": -2.5526020526885986, "logits/rejected": -2.517185688018799, "logps/chosen": -214.2257843017578, "logps/rejected": -210.78115844726562, "loss": 0.4151, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.33652427792549133, "rewards/margins": 1.0866222381591797, "rewards/rejected": -1.4231464862823486, "step": 1840 }, { "epoch": 0.8879289656827454, "grad_norm": 32.5, "learning_rate": 1.8830144634105206e-07, "logits/chosen": -2.5996253490448, "logits/rejected": -2.453141927719116, "logps/chosen": -247.02804565429688, "logps/rejected": -199.14678955078125, "loss": 0.4273, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.08400492370128632, "rewards/margins": 1.1507220268249512, "rewards/rejected": -1.234726905822754, "step": 1850 }, { "epoch": 0.8927285817134629, "grad_norm": 42.25, "learning_rate": 1.7266983390618997e-07, "logits/chosen": -2.5644283294677734, "logits/rejected": -2.453029155731201, "logps/chosen": -228.55294799804688, "logps/rejected": -198.43435668945312, "loss": 0.4349, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.06119618937373161, "rewards/margins": 1.1090104579925537, "rewards/rejected": -1.1702066659927368, "step": 1860 }, { "epoch": 0.8975281977441805, "grad_norm": 49.0, "learning_rate": 1.5769226763101887e-07, "logits/chosen": -2.5068180561065674, "logits/rejected": -2.47198748588562, "logps/chosen": -231.07266235351562, "logps/rejected": -219.62387084960938, "loss": 0.4895, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2498408854007721, "rewards/margins": 0.9979842901229858, "rewards/rejected": -1.2478251457214355, "step": 1870 }, { "epoch": 0.902327813774898, "grad_norm": 54.0, "learning_rate": 1.4337295663887086e-07, "logits/chosen": -2.659472942352295, "logits/rejected": -2.517972946166992, "logps/chosen": -239.847900390625, "logps/rejected": -198.91372680664062, "loss": 0.4644, "rewards/accuracies": 0.78125, "rewards/chosen": -0.11448581516742706, "rewards/margins": 1.0199806690216064, "rewards/rejected": -1.134466528892517, "step": 1880 }, { "epoch": 0.9071274298056156, "grad_norm": 63.0, "learning_rate": 1.2971592506456799e-07, "logits/chosen": -2.5675461292266846, "logits/rejected": -2.4976906776428223, "logps/chosen": -206.1525421142578, "logps/rejected": -200.3158721923828, "loss": 0.4568, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.2682192623615265, "rewards/margins": 1.0316725969314575, "rewards/rejected": -1.2998919486999512, "step": 1890 }, { "epoch": 0.9119270458363331, "grad_norm": 41.25, "learning_rate": 1.1672501092352545e-07, "logits/chosen": -2.613196611404419, "logits/rejected": -2.4825501441955566, "logps/chosen": -241.4010772705078, "logps/rejected": -214.26339721679688, "loss": 0.4681, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.11850067228078842, "rewards/margins": 0.9811543226242065, "rewards/rejected": -1.0996549129486084, "step": 1900 }, { "epoch": 0.9167266618670507, "grad_norm": 56.0, "learning_rate": 1.0440386503315969e-07, "logits/chosen": -2.5355827808380127, "logits/rejected": -2.4754626750946045, "logps/chosen": -224.82363891601562, "logps/rejected": -254.0577850341797, "loss": 0.4594, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.16619113087654114, "rewards/margins": 1.0043249130249023, "rewards/rejected": -1.1705158948898315, "step": 1910 }, { "epoch": 0.9215262778977682, "grad_norm": 47.0, "learning_rate": 9.275594998690574e-08, "logits/chosen": -2.5817465782165527, "logits/rejected": -2.4193766117095947, "logps/chosen": -251.2194061279297, "logps/rejected": -203.05404663085938, "loss": 0.4609, "rewards/accuracies": 0.78125, "rewards/chosen": -0.17528951168060303, "rewards/margins": 0.9491412043571472, "rewards/rejected": -1.1244306564331055, "step": 1920 }, { "epoch": 0.9263258939284857, "grad_norm": 44.75, "learning_rate": 8.178453918112783e-08, "logits/chosen": -2.5946240425109863, "logits/rejected": -2.4726099967956543, "logps/chosen": -223.893798828125, "logps/rejected": -191.31539916992188, "loss": 0.4227, "rewards/accuracies": 0.8125, "rewards/chosen": -0.033659275621175766, "rewards/margins": 1.185791015625, "rewards/rejected": -1.2194502353668213, "step": 1930 }, { "epoch": 0.9311255099592033, "grad_norm": 72.0, "learning_rate": 7.149271589520167e-08, "logits/chosen": -2.551396131515503, "logits/rejected": -2.4351983070373535, "logps/chosen": -212.9639129638672, "logps/rejected": -204.0959930419922, "loss": 0.4853, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.29301124811172485, "rewards/margins": 0.9584601521492004, "rewards/rejected": -1.2514712810516357, "step": 1940 }, { "epoch": 0.9359251259899208, "grad_norm": 42.5, "learning_rate": 6.188337242502784e-08, "logits/chosen": -2.592571496963501, "logits/rejected": -2.4578793048858643, "logps/chosen": -238.7187042236328, "logps/rejected": -204.2020263671875, "loss": 0.4352, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.20334915816783905, "rewards/margins": 1.0652612447738647, "rewards/rejected": -1.2686102390289307, "step": 1950 }, { "epoch": 0.9407247420206384, "grad_norm": 34.25, "learning_rate": 5.295920927021109e-08, "logits/chosen": -2.6256229877471924, "logits/rejected": -2.5241026878356934, "logps/chosen": -236.45401000976562, "logps/rejected": -210.2203369140625, "loss": 0.4396, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.19231542944908142, "rewards/margins": 1.1057020425796509, "rewards/rejected": -1.2980175018310547, "step": 1960 }, { "epoch": 0.9455243580513559, "grad_norm": 44.5, "learning_rate": 4.472273437514357e-08, "logits/chosen": -2.644946336746216, "logits/rejected": -2.526045083999634, "logps/chosen": -258.8535461425781, "logps/rejected": -222.54019165039062, "loss": 0.4178, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.06698047369718552, "rewards/margins": 1.1277925968170166, "rewards/rejected": -1.1947730779647827, "step": 1970 }, { "epoch": 0.9503239740820735, "grad_norm": 36.0, "learning_rate": 3.717626242420252e-08, "logits/chosen": -2.60685658454895, "logits/rejected": -2.5159687995910645, "logps/chosen": -223.88827514648438, "logps/rejected": -212.7635498046875, "loss": 0.5017, "rewards/accuracies": 0.75, "rewards/chosen": -0.22438673675060272, "rewards/margins": 0.8267514109611511, "rewards/rejected": -1.0511382818222046, "step": 1980 }, { "epoch": 0.955123590112791, "grad_norm": 43.75, "learning_rate": 3.03219141912553e-08, "logits/chosen": -2.586005687713623, "logits/rejected": -2.484550714492798, "logps/chosen": -244.2949676513672, "logps/rejected": -225.6749725341797, "loss": 0.4699, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.22158965468406677, "rewards/margins": 0.8752668499946594, "rewards/rejected": -1.0968565940856934, "step": 1990 }, { "epoch": 0.9599232061435086, "grad_norm": 49.75, "learning_rate": 2.4161615943664174e-08, "logits/chosen": -2.6366090774536133, "logits/rejected": -2.5460915565490723, "logps/chosen": -219.8328094482422, "logps/rejected": -214.83737182617188, "loss": 0.4763, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.11922893673181534, "rewards/margins": 0.9408013224601746, "rewards/rejected": -1.060030221939087, "step": 2000 }, { "epoch": 0.9599232061435086, "eval_logits/chosen": -2.610217571258545, "eval_logits/rejected": -2.4989378452301025, "eval_logps/chosen": -234.27659606933594, "eval_logps/rejected": -217.0829620361328, "eval_loss": 0.4485087990760803, "eval_rewards/accuracies": 0.7950000166893005, "eval_rewards/chosen": -0.11436203867197037, "eval_rewards/margins": 1.0408560037612915, "eval_rewards/rejected": -1.1552180051803589, "eval_runtime": 21.4702, "eval_samples_per_second": 46.576, "eval_steps_per_second": 11.644, "step": 2000 }, { "epoch": 0.9647228221742261, "grad_norm": 34.5, "learning_rate": 1.8697098900948285e-08, "logits/chosen": -2.5669498443603516, "logits/rejected": -2.4752142429351807, "logps/chosen": -227.6999053955078, "logps/rejected": -230.0476531982422, "loss": 0.435, "rewards/accuracies": 0.8125, "rewards/chosen": -0.15688031911849976, "rewards/margins": 1.0234278440475464, "rewards/rejected": -1.180308222770691, "step": 2010 }, { "epoch": 0.9695224382049437, "grad_norm": 48.25, "learning_rate": 1.392989874826195e-08, "logits/chosen": -2.6464297771453857, "logits/rejected": -2.513470411300659, "logps/chosen": -231.412353515625, "logps/rejected": -216.2265167236328, "loss": 0.5319, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2918005585670471, "rewards/margins": 0.7504169940948486, "rewards/rejected": -1.042217493057251, "step": 2020 }, { "epoch": 0.9743220542356611, "grad_norm": 52.5, "learning_rate": 9.861355204825173e-09, "logits/chosen": -2.6138572692871094, "logits/rejected": -2.4997787475585938, "logps/chosen": -259.1074523925781, "logps/rejected": -211.08761596679688, "loss": 0.4733, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.19954562187194824, "rewards/margins": 1.0070618391036987, "rewards/rejected": -1.206607460975647, "step": 2030 }, { "epoch": 0.9791216702663786, "grad_norm": 36.5, "learning_rate": 6.492611647420932e-09, "logits/chosen": -2.594069004058838, "logits/rejected": -2.459033250808716, "logps/chosen": -230.0123291015625, "logps/rejected": -206.733642578125, "loss": 0.4503, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22017189860343933, "rewards/margins": 1.0728034973144531, "rewards/rejected": -1.2929753065109253, "step": 2040 }, { "epoch": 0.9839212862970962, "grad_norm": 51.25, "learning_rate": 3.8246147890763645e-09, "logits/chosen": -2.643317699432373, "logits/rejected": -2.504284143447876, "logps/chosen": -246.441162109375, "logps/rejected": -210.845458984375, "loss": 0.494, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2079886943101883, "rewards/margins": 0.7775675058364868, "rewards/rejected": -0.9855562448501587, "step": 2050 }, { "epoch": 0.9887209023278137, "grad_norm": 38.0, "learning_rate": 1.8581144130089269e-09, "logits/chosen": -2.567206859588623, "logits/rejected": -2.506730794906616, "logps/chosen": -222.69644165039062, "logps/rejected": -218.9879913330078, "loss": 0.4439, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1257813274860382, "rewards/margins": 1.1031692028045654, "rewards/rejected": -1.2289507389068604, "step": 2060 }, { "epoch": 0.9935205183585313, "grad_norm": 53.25, "learning_rate": 5.936631619152256e-10, "logits/chosen": -2.615421772003174, "logits/rejected": -2.54068660736084, "logps/chosen": -233.13818359375, "logps/rejected": -214.350341796875, "loss": 0.4651, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.03799530863761902, "rewards/margins": 1.0076466798782349, "rewards/rejected": -1.0456420183181763, "step": 2070 }, { "epoch": 0.9983201343892488, "grad_norm": 55.75, "learning_rate": 3.161638266302447e-11, "logits/chosen": -2.651322841644287, "logits/rejected": -2.516923666000366, "logps/chosen": -235.73876953125, "logps/rejected": -220.57421875, "loss": 0.47, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.25053197145462036, "rewards/margins": 0.9338071942329407, "rewards/rejected": -1.184339165687561, "step": 2080 }, { "epoch": 0.9997600191984641, "step": 2083, "total_flos": 0.0, "train_loss": 0.49830314429746014, "train_runtime": 2918.5416, "train_samples_per_second": 11.421, "train_steps_per_second": 0.714 } ], "logging_steps": 10, "max_steps": 2083, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }