{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989528795811519, "eval_steps": 200, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020942408376963353, "grad_norm": 334.2563171386719, "learning_rate": 0.0, "logits/chosen": -0.615048885345459, "logits/rejected": -0.6184952855110168, "logps/chosen": -1.287253737449646, "logps/rejected": -1.4317338466644287, "loss": 100.0182, "rewards/accuracies": 0.625, "rewards/chosen": 0.00019188039004802704, "rewards/margins": 0.00016489533300045878, "rewards/rejected": 2.698507159948349e-05, "step": 1 }, { "epoch": 0.020942408376963352, "grad_norm": 285.50787353515625, "learning_rate": 9.375e-08, "logits/chosen": -0.6403717398643494, "logits/rejected": -0.6338582634925842, "logps/chosen": -1.1636602878570557, "logps/rejected": -1.3377324342727661, "loss": 99.9909, "rewards/accuracies": 0.5208333134651184, "rewards/chosen": 6.887399649713188e-05, "rewards/margins": 5.176978083909489e-05, "rewards/rejected": 1.7104217477026395e-05, "step": 10 }, { "epoch": 0.041884816753926704, "grad_norm": 435.0426330566406, "learning_rate": 1.9791666666666664e-07, "logits/chosen": -0.6049723625183105, "logits/rejected": -0.6281808614730835, "logps/chosen": -1.197880506515503, "logps/rejected": -1.3490830659866333, "loss": 99.9683, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00015270955918822438, "rewards/margins": 0.00011640439333859831, "rewards/rejected": 3.630516948760487e-05, "step": 20 }, { "epoch": 0.06282722513089005, "grad_norm": 340.81402587890625, "learning_rate": 3.020833333333333e-07, "logits/chosen": -0.5800901055335999, "logits/rejected": -0.5592249035835266, "logps/chosen": -1.1031697988510132, "logps/rejected": -1.1418795585632324, "loss": 99.8945, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0005954386433586478, "rewards/margins": 0.00042125614709220827, "rewards/rejected": 0.0001741824671626091, "step": 30 }, { "epoch": 0.08376963350785341, "grad_norm": 417.978271484375, "learning_rate": 4.0625e-07, "logits/chosen": -0.6128225922584534, "logits/rejected": -0.6447689533233643, "logps/chosen": -1.1222411394119263, "logps/rejected": -1.2407972812652588, "loss": 99.5546, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0018988971132785082, "rewards/margins": 0.0013067866675555706, "rewards/rejected": 0.0005921103293076158, "step": 40 }, { "epoch": 0.10471204188481675, "grad_norm": 316.5003356933594, "learning_rate": 4.999932966293553e-07, "logits/chosen": -0.6670976877212524, "logits/rejected": -0.6841738820075989, "logps/chosen": -1.0955612659454346, "logps/rejected": -1.2405668497085571, "loss": 99.0369, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.00708451634272933, "rewards/margins": 0.00436831172555685, "rewards/rejected": 0.0027162046171724796, "step": 50 }, { "epoch": 0.1256544502617801, "grad_norm": 419.10931396484375, "learning_rate": 4.991893270335525e-07, "logits/chosen": -0.6393335461616516, "logits/rejected": -0.6426213383674622, "logps/chosen": -1.1490199565887451, "logps/rejected": -1.2731598615646362, "loss": 98.2609, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.006263996474444866, "rewards/margins": 0.006520474795252085, "rewards/rejected": -0.00025647730217315257, "step": 60 }, { "epoch": 0.14659685863874344, "grad_norm": 455.3122863769531, "learning_rate": 4.970496218214204e-07, "logits/chosen": -0.7453044056892395, "logits/rejected": -0.7440285682678223, "logps/chosen": -1.1631481647491455, "logps/rejected": -1.4671075344085693, "loss": 97.3229, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0009300081292167306, "rewards/margins": 0.0067420280538499355, "rewards/rejected": -0.005812020041048527, "step": 70 }, { "epoch": 0.16753926701570682, "grad_norm": 581.3535766601562, "learning_rate": 4.935856505068998e-07, "logits/chosen": -0.7049443125724792, "logits/rejected": -0.7213777303695679, "logps/chosen": -1.2213430404663086, "logps/rejected": -1.5481337308883667, "loss": 95.8171, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.009956983849406242, "rewards/margins": 0.012672394514083862, "rewards/rejected": -0.022629376500844955, "step": 80 }, { "epoch": 0.18848167539267016, "grad_norm": 433.6556396484375, "learning_rate": 4.8881598109976e-07, "logits/chosen": -0.6699498891830444, "logits/rejected": -0.7027503252029419, "logps/chosen": -1.3308374881744385, "logps/rejected": -1.6979453563690186, "loss": 94.2675, "rewards/accuracies": 0.65625, "rewards/chosen": -0.01737907901406288, "rewards/margins": 0.019169464707374573, "rewards/rejected": -0.036548543721437454, "step": 90 }, { "epoch": 0.2094240837696335, "grad_norm": 591.3351440429688, "learning_rate": 4.827661805750437e-07, "logits/chosen": -0.6697665452957153, "logits/rejected": -0.6784194111824036, "logps/chosen": -1.2227742671966553, "logps/rejected": -1.5098785161972046, "loss": 94.1616, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.01010363269597292, "rewards/margins": 0.01724056527018547, "rewards/rejected": -0.027344200760126114, "step": 100 }, { "epoch": 0.23036649214659685, "grad_norm": 1046.2178955078125, "learning_rate": 4.75468677825789e-07, "logits/chosen": -0.6205618977546692, "logits/rejected": -0.6708791851997375, "logps/chosen": -1.1823982000350952, "logps/rejected": -1.7904584407806396, "loss": 92.3693, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.014755316078662872, "rewards/margins": 0.02819245494902134, "rewards/rejected": -0.04294777661561966, "step": 110 }, { "epoch": 0.2513089005235602, "grad_norm": 456.5362243652344, "learning_rate": 4.669625898336438e-07, "logits/chosen": -0.6897668838500977, "logits/rejected": -0.6946333646774292, "logps/chosen": -1.475966215133667, "logps/rejected": -1.7667391300201416, "loss": 92.8502, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.026928585022687912, "rewards/margins": 0.020536581054329872, "rewards/rejected": -0.04746516793966293, "step": 120 }, { "epoch": 0.27225130890052357, "grad_norm": 493.82305908203125, "learning_rate": 4.5729351198915705e-07, "logits/chosen": -0.675911009311676, "logits/rejected": -0.6610211730003357, "logps/chosen": -1.4558229446411133, "logps/rejected": -1.6363474130630493, "loss": 91.8914, "rewards/accuracies": 0.59375, "rewards/chosen": -0.024449264630675316, "rewards/margins": 0.015649044886231422, "rewards/rejected": -0.04009830951690674, "step": 130 }, { "epoch": 0.2931937172774869, "grad_norm": 656.6390380859375, "learning_rate": 4.4651327368569684e-07, "logits/chosen": -0.7115468382835388, "logits/rejected": -0.6819766759872437, "logps/chosen": -1.3681796789169312, "logps/rejected": -1.7001540660858154, "loss": 92.2874, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.013815701007843018, "rewards/margins": 0.026623845100402832, "rewards/rejected": -0.04043954610824585, "step": 140 }, { "epoch": 0.31413612565445026, "grad_norm": 1577.8076171875, "learning_rate": 4.346796604970912e-07, "logits/chosen": -0.7165893912315369, "logits/rejected": -0.7037891745567322, "logps/chosen": -1.3247240781784058, "logps/rejected": -1.9355542659759521, "loss": 90.6388, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01547580398619175, "rewards/margins": 0.04593699425458908, "rewards/rejected": -0.06141280010342598, "step": 150 }, { "epoch": 0.33507853403141363, "grad_norm": 1120.6865234375, "learning_rate": 4.218561044282098e-07, "logits/chosen": -0.6731705069541931, "logits/rejected": -0.6826528906822205, "logps/chosen": -1.5552542209625244, "logps/rejected": -2.336169719696045, "loss": 89.2392, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03482538461685181, "rewards/margins": 0.05508958175778389, "rewards/rejected": -0.0899149626493454, "step": 160 }, { "epoch": 0.35602094240837695, "grad_norm": 1590.9273681640625, "learning_rate": 4.081113438988443e-07, "logits/chosen": -0.6908739805221558, "logits/rejected": -0.6841577291488647, "logps/chosen": -1.4513384103775024, "logps/rejected": -1.9829685688018799, "loss": 88.2583, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.0341075137257576, "rewards/margins": 0.03856590390205383, "rewards/rejected": -0.07267341762781143, "step": 170 }, { "epoch": 0.3769633507853403, "grad_norm": 1529.81201171875, "learning_rate": 3.935190552834828e-07, "logits/chosen": -0.6555457711219788, "logits/rejected": -0.6698058843612671, "logps/chosen": -1.414902925491333, "logps/rejected": -1.8965734243392944, "loss": 88.1778, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.021763872355222702, "rewards/margins": 0.03738107159733772, "rewards/rejected": -0.059144943952560425, "step": 180 }, { "epoch": 0.39790575916230364, "grad_norm": 1098.9884033203125, "learning_rate": 3.781574579820464e-07, "logits/chosen": -0.7295094132423401, "logits/rejected": -0.7264297008514404, "logps/chosen": -1.3178633451461792, "logps/rejected": -2.14937162399292, "loss": 87.544, "rewards/accuracies": 0.75, "rewards/chosen": -0.02538556233048439, "rewards/margins": 0.06075022369623184, "rewards/rejected": -0.08613577485084534, "step": 190 }, { "epoch": 0.418848167539267, "grad_norm": 920.0095825195312, "learning_rate": 3.621088951385353e-07, "logits/chosen": -0.7311328649520874, "logits/rejected": -0.7470929026603699, "logps/chosen": -1.468587040901184, "logps/rejected": -2.221226453781128, "loss": 87.5502, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03510420024394989, "rewards/margins": 0.05219089239835739, "rewards/rejected": -0.08729508519172668, "step": 200 }, { "epoch": 0.418848167539267, "eval_logits/chosen": -0.7410492897033691, "eval_logits/rejected": -0.7169657945632935, "eval_logps/chosen": -1.6581848859786987, "eval_logps/rejected": -2.345245361328125, "eval_loss": 21.880334854125977, "eval_rewards/accuracies": 0.7056451439857483, "eval_rewards/chosen": -0.05191723257303238, "eval_rewards/margins": 0.04926152899861336, "eval_rewards/rejected": -0.10117875784635544, "eval_runtime": 44.8829, "eval_samples_per_second": 44.56, "eval_steps_per_second": 1.404, "step": 200 }, { "epoch": 0.4397905759162304, "grad_norm": 895.4285278320312, "learning_rate": 3.454593922550693e-07, "logits/chosen": -0.6665436029434204, "logits/rejected": -0.6757727861404419, "logps/chosen": -1.485873818397522, "logps/rejected": -2.344456672668457, "loss": 86.8513, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.03312455862760544, "rewards/margins": 0.06565652042627335, "rewards/rejected": -0.09878107905387878, "step": 210 }, { "epoch": 0.4607329842931937, "grad_norm": 1199.5086669921875, "learning_rate": 3.2829819606729477e-07, "logits/chosen": -0.7451668977737427, "logits/rejected": -0.7250313758850098, "logps/chosen": -1.6201412677764893, "logps/rejected": -2.3489794731140137, "loss": 85.5906, "rewards/accuracies": 0.71875, "rewards/chosen": -0.046348538249731064, "rewards/margins": 0.056174565106630325, "rewards/rejected": -0.10252310335636139, "step": 220 }, { "epoch": 0.4816753926701571, "grad_norm": 1276.23583984375, "learning_rate": 3.1071729615293424e-07, "logits/chosen": -0.768721878528595, "logits/rejected": -0.7249744534492493, "logps/chosen": -1.8909496068954468, "logps/rejected": -2.51973557472229, "loss": 85.1963, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.06488613784313202, "rewards/margins": 0.05823909491300583, "rewards/rejected": -0.12312524020671844, "step": 230 }, { "epoch": 0.5026178010471204, "grad_norm": 1169.336181640625, "learning_rate": 2.9281093183781403e-07, "logits/chosen": -0.6923402547836304, "logits/rejected": -0.7125403881072998, "logps/chosen": -1.8178189992904663, "logps/rejected": -2.7802040576934814, "loss": 83.9422, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.06791480630636215, "rewards/margins": 0.0749489963054657, "rewards/rejected": -0.14286379516124725, "step": 240 }, { "epoch": 0.5235602094240838, "grad_norm": 1323.8421630859375, "learning_rate": 2.7467508704251135e-07, "logits/chosen": -0.7305997610092163, "logits/rejected": -0.7117995023727417, "logps/chosen": -1.8664335012435913, "logps/rejected": -2.734891653060913, "loss": 83.9814, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.0722884088754654, "rewards/margins": 0.06869889050722122, "rewards/rejected": -0.14098729193210602, "step": 250 }, { "epoch": 0.5445026178010471, "grad_norm": 1842.348388671875, "learning_rate": 2.5640697577740815e-07, "logits/chosen": -0.7306697368621826, "logits/rejected": -0.7254212498664856, "logps/chosen": -2.0410964488983154, "logps/rejected": -2.9278271198272705, "loss": 84.5359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08307617902755737, "rewards/margins": 0.07588212937116623, "rewards/rejected": -0.1589583158493042, "step": 260 }, { "epoch": 0.5654450261780105, "grad_norm": 1502.2591552734375, "learning_rate": 2.381045210440644e-07, "logits/chosen": -0.7093442678451538, "logits/rejected": -0.6900134682655334, "logps/chosen": -2.196712017059326, "logps/rejected": -2.765611171722412, "loss": 83.7083, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10604780912399292, "rewards/margins": 0.04271745681762695, "rewards/rejected": -0.14876528084278107, "step": 270 }, { "epoch": 0.5863874345549738, "grad_norm": 1244.79345703125, "learning_rate": 2.1986582993616925e-07, "logits/chosen": -0.7046034336090088, "logits/rejected": -0.7077471017837524, "logps/chosen": -1.9278895854949951, "logps/rejected": -3.006953001022339, "loss": 81.093, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0760059580206871, "rewards/margins": 0.0849492996931076, "rewards/rejected": -0.1609552651643753, "step": 280 }, { "epoch": 0.6073298429319371, "grad_norm": 2431.488037109375, "learning_rate": 2.0178866775369774e-07, "logits/chosen": -0.7360321879386902, "logits/rejected": -0.7089033126831055, "logps/chosen": -2.280106544494629, "logps/rejected": -3.052873373031616, "loss": 84.3058, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10656996071338654, "rewards/margins": 0.06549294292926788, "rewards/rejected": -0.17206290364265442, "step": 290 }, { "epoch": 0.6282722513089005, "grad_norm": 2390.089599609375, "learning_rate": 1.839699339491937e-07, "logits/chosen": -0.7405005097389221, "logits/rejected": -0.7184410691261292, "logps/chosen": -2.2960636615753174, "logps/rejected": -3.015368700027466, "loss": 83.439, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.10949590057134628, "rewards/margins": 0.06635172665119171, "rewards/rejected": -0.17584764957427979, "step": 300 }, { "epoch": 0.6492146596858639, "grad_norm": 1713.3050537109375, "learning_rate": 1.6650514271527465e-07, "logits/chosen": -0.6984419822692871, "logits/rejected": -0.6775582432746887, "logps/chosen": -2.1996333599090576, "logps/rejected": -2.9705100059509277, "loss": 80.5471, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.09805257618427277, "rewards/margins": 0.07093057781457901, "rewards/rejected": -0.16898314654827118, "step": 310 }, { "epoch": 0.6701570680628273, "grad_norm": 1855.712646484375, "learning_rate": 1.4948791099758052e-07, "logits/chosen": -0.7052116990089417, "logits/rejected": -0.7140225172042847, "logps/chosen": -2.282247304916382, "logps/rejected": -3.390331745147705, "loss": 80.2757, "rewards/accuracies": 0.75, "rewards/chosen": -0.107896588742733, "rewards/margins": 0.09624334424734116, "rewards/rejected": -0.20413991808891296, "step": 320 }, { "epoch": 0.6910994764397905, "grad_norm": 1676.298095703125, "learning_rate": 1.3300945667758012e-07, "logits/chosen": -0.6669484972953796, "logits/rejected": -0.6471594572067261, "logps/chosen": -2.1016829013824463, "logps/rejected": -3.35722279548645, "loss": 81.4342, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08771068602800369, "rewards/margins": 0.11208438873291016, "rewards/rejected": -0.19979506731033325, "step": 330 }, { "epoch": 0.7120418848167539, "grad_norm": 1909.808349609375, "learning_rate": 1.1715810961514072e-07, "logits/chosen": -0.6822046041488647, "logits/rejected": -0.651155948638916, "logps/chosen": -2.583623170852661, "logps/rejected": -3.5456066131591797, "loss": 81.1622, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.13260190188884735, "rewards/margins": 0.08422346413135529, "rewards/rejected": -0.21682536602020264, "step": 340 }, { "epoch": 0.7329842931937173, "grad_norm": 1723.5054931640625, "learning_rate": 1.0201883817182949e-07, "logits/chosen": -0.6957032084465027, "logits/rejected": -0.6772996783256531, "logps/chosen": -2.392089366912842, "logps/rejected": -3.3031153678894043, "loss": 81.021, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.12260153144598007, "rewards/margins": 0.07098677009344101, "rewards/rejected": -0.19358830153942108, "step": 350 }, { "epoch": 0.7539267015706806, "grad_norm": 1900.4688720703125, "learning_rate": 8.76727937529367e-08, "logits/chosen": -0.6489254236221313, "logits/rejected": -0.6335070133209229, "logps/chosen": -2.3418354988098145, "logps/rejected": -3.38752818107605, "loss": 82.1297, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11146201193332672, "rewards/margins": 0.0862349346280098, "rewards/rejected": -0.19769695401191711, "step": 360 }, { "epoch": 0.774869109947644, "grad_norm": 1725.67431640625, "learning_rate": 7.419687580962222e-08, "logits/chosen": -0.6811009645462036, "logits/rejected": -0.6841970682144165, "logps/chosen": -2.3078436851501465, "logps/rejected": -3.5094189643859863, "loss": 78.1712, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.11148089170455933, "rewards/margins": 0.10181178897619247, "rewards/rejected": -0.2132926881313324, "step": 370 }, { "epoch": 0.7958115183246073, "grad_norm": 1767.7569580078125, "learning_rate": 6.166331963291519e-08, "logits/chosen": -0.6415311694145203, "logits/rejected": -0.6283861398696899, "logps/chosen": -2.2348546981811523, "logps/rejected": -3.141282320022583, "loss": 81.5469, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10441361367702484, "rewards/margins": 0.08075843751430511, "rewards/rejected": -0.18517205119132996, "step": 380 }, { "epoch": 0.8167539267015707, "grad_norm": 1984.725341796875, "learning_rate": 5.013930914912476e-08, "logits/chosen": -0.691740870475769, "logits/rejected": -0.6813372373580933, "logps/chosen": -2.3861072063446045, "logps/rejected": -3.2833335399627686, "loss": 81.5221, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.12453154474496841, "rewards/margins": 0.07637959718704224, "rewards/rejected": -0.20091113448143005, "step": 390 }, { "epoch": 0.837696335078534, "grad_norm": 1985.95068359375, "learning_rate": 3.968661679220467e-08, "logits/chosen": -0.6570695042610168, "logits/rejected": -0.6685432195663452, "logps/chosen": -2.366621732711792, "logps/rejected": -3.2288424968719482, "loss": 80.9952, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.11433804035186768, "rewards/margins": 0.08440788835287094, "rewards/rejected": -0.1987459361553192, "step": 400 }, { "epoch": 0.837696335078534, "eval_logits/chosen": -0.677091121673584, "eval_logits/rejected": -0.6593382358551025, "eval_logps/chosen": -2.3106672763824463, "eval_logps/rejected": -3.3355889320373535, "eval_loss": 20.19257926940918, "eval_rewards/accuracies": 0.7620967626571655, "eval_rewards/chosen": -0.11716549098491669, "eval_rewards/margins": 0.08304762095212936, "eval_rewards/rejected": -0.20021310448646545, "eval_runtime": 44.6349, "eval_samples_per_second": 44.808, "eval_steps_per_second": 1.411, "step": 400 }, { "epoch": 0.8586387434554974, "grad_norm": 1995.79248046875, "learning_rate": 3.036127238347164e-08, "logits/chosen": -0.6521174907684326, "logits/rejected": -0.6390538811683655, "logps/chosen": -2.400747299194336, "logps/rejected": -3.1259586811065674, "loss": 80.8247, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.12548241019248962, "rewards/margins": 0.05977809429168701, "rewards/rejected": -0.18526050448417664, "step": 410 }, { "epoch": 0.8795811518324608, "grad_norm": 1795.328857421875, "learning_rate": 2.2213262793589482e-08, "logits/chosen": -0.6523452997207642, "logits/rejected": -0.6344730257987976, "logps/chosen": -2.3337454795837402, "logps/rejected": -3.2163987159729004, "loss": 79.558, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.12196169048547745, "rewards/margins": 0.07503517717123032, "rewards/rejected": -0.19699685275554657, "step": 420 }, { "epoch": 0.900523560209424, "grad_norm": 2082.081298828125, "learning_rate": 1.5286263996730026e-08, "logits/chosen": -0.6856145858764648, "logits/rejected": -0.6647241115570068, "logps/chosen": -2.348968982696533, "logps/rejected": -3.445862293243408, "loss": 78.859, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.11975270509719849, "rewards/margins": 0.08451148122549057, "rewards/rejected": -0.20426419377326965, "step": 430 }, { "epoch": 0.9214659685863874, "grad_norm": 2494.26806640625, "learning_rate": 9.617406953185136e-09, "logits/chosen": -0.6526113748550415, "logits/rejected": -0.6604099869728088, "logps/chosen": -2.3740854263305664, "logps/rejected": -3.1448159217834473, "loss": 81.5522, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.12368841469287872, "rewards/margins": 0.060592371970415115, "rewards/rejected": -0.18428078293800354, "step": 440 }, { "epoch": 0.9424083769633508, "grad_norm": 1965.380615234375, "learning_rate": 5.2370785753763356e-09, "logits/chosen": -0.6635817289352417, "logits/rejected": -0.653299868106842, "logps/chosen": -2.3846700191497803, "logps/rejected": -3.3813278675079346, "loss": 79.7652, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.1247202605009079, "rewards/margins": 0.08005838841199875, "rewards/rejected": -0.20477867126464844, "step": 450 }, { "epoch": 0.9633507853403142, "grad_norm": 2279.437744140625, "learning_rate": 2.168758844148272e-09, "logits/chosen": -0.6574541330337524, "logits/rejected": -0.6650590300559998, "logps/chosen": -2.2831614017486572, "logps/rejected": -3.247992753982544, "loss": 82.4212, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.11224757134914398, "rewards/margins": 0.07813762873411179, "rewards/rejected": -0.19038519263267517, "step": 460 }, { "epoch": 0.9842931937172775, "grad_norm": 2007.5858154296875, "learning_rate": 4.288949484559934e-10, "logits/chosen": -0.6454898118972778, "logits/rejected": -0.6453719139099121, "logps/chosen": -2.3356616497039795, "logps/rejected": -3.382112503051758, "loss": 79.3744, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.11207450926303864, "rewards/margins": 0.09411285072565079, "rewards/rejected": -0.20618736743927002, "step": 470 }, { "epoch": 0.9989528795811519, "step": 477, "total_flos": 0.0, "train_loss": 87.11727690946631, "train_runtime": 3396.1479, "train_samples_per_second": 18.001, "train_steps_per_second": 0.14 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }