{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989528795811519, "eval_steps": 200, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020942408376963353, "grad_norm": 26.356996536254883, "learning_rate": 0.0, "logits/chosen": -0.6103914976119995, "logits/rejected": -0.6099507808685303, "logps/chosen": -1.2526695728302002, "logps/rejected": -1.3568028211593628, "loss": 2.7368, "rewards/accuracies": 0.5625, "rewards/chosen": -2.5053391456604004, "rewards/margins": 0.20826660096645355, "rewards/rejected": -2.7136056423187256, "step": 1 }, { "epoch": 0.020942408376963352, "grad_norm": 17.384063720703125, "learning_rate": 1.125e-07, "logits/chosen": -0.6848690509796143, "logits/rejected": -0.6671006679534912, "logps/chosen": -1.1007671356201172, "logps/rejected": -1.2118645906448364, "loss": 2.6132, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": -2.2015342712402344, "rewards/margins": 0.2221948802471161, "rewards/rejected": -2.423729181289673, "step": 10 }, { "epoch": 0.041884816753926704, "grad_norm": 25.446407318115234, "learning_rate": 2.3749999999999998e-07, "logits/chosen": -0.6405870318412781, "logits/rejected": -0.6533440351486206, "logps/chosen": -1.134265661239624, "logps/rejected": -1.3199212551116943, "loss": 2.5636, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.268531322479248, "rewards/margins": 0.3713109493255615, "rewards/rejected": -2.6398425102233887, "step": 20 }, { "epoch": 0.06282722513089005, "grad_norm": 22.045013427734375, "learning_rate": 3.6249999999999997e-07, "logits/chosen": -0.6656152009963989, "logits/rejected": -0.67214435338974, "logps/chosen": -1.082200288772583, "logps/rejected": -1.2129806280136108, "loss": 2.5356, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.164400577545166, "rewards/margins": 0.26156044006347656, "rewards/rejected": -2.4259612560272217, "step": 30 }, { "epoch": 0.08376963350785341, "grad_norm": 18.265399932861328, "learning_rate": 4.875e-07, "logits/chosen": -0.6319286227226257, "logits/rejected": -0.6386948227882385, "logps/chosen": -1.0040572881698608, "logps/rejected": -1.1640336513519287, "loss": 2.4952, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0081145763397217, "rewards/margins": 0.31995266675949097, "rewards/rejected": -2.3280673027038574, "step": 40 }, { "epoch": 0.10471204188481675, "grad_norm": 13.381461143493652, "learning_rate": 5.999919559552264e-07, "logits/chosen": -0.6912713050842285, "logits/rejected": -0.7003507018089294, "logps/chosen": -1.003204584121704, "logps/rejected": -1.0995913743972778, "loss": 2.4474, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.006409168243408, "rewards/margins": 0.1927735060453415, "rewards/rejected": -2.1991827487945557, "step": 50 }, { "epoch": 0.1256544502617801, "grad_norm": 15.925363540649414, "learning_rate": 5.99027192440263e-07, "logits/chosen": -0.6539224982261658, "logits/rejected": -0.6533476114273071, "logps/chosen": -1.056113362312317, "logps/rejected": -1.220874309539795, "loss": 2.5074, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.112226724624634, "rewards/margins": 0.32952195405960083, "rewards/rejected": -2.44174861907959, "step": 60 }, { "epoch": 0.14659685863874344, "grad_norm": 18.052942276000977, "learning_rate": 5.964595461857045e-07, "logits/chosen": -0.670150876045227, "logits/rejected": -0.6788632273674011, "logps/chosen": -1.0223743915557861, "logps/rejected": -1.2135789394378662, "loss": 2.4297, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0447487831115723, "rewards/margins": 0.3824090361595154, "rewards/rejected": -2.4271578788757324, "step": 70 }, { "epoch": 0.16753926701570682, "grad_norm": 15.041752815246582, "learning_rate": 5.923027806082798e-07, "logits/chosen": -0.6617791056632996, "logits/rejected": -0.666069746017456, "logps/chosen": -1.0529370307922363, "logps/rejected": -1.243398666381836, "loss": 2.4036, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1058740615844727, "rewards/margins": 0.38092344999313354, "rewards/rejected": -2.486797332763672, "step": 80 }, { "epoch": 0.18848167539267016, "grad_norm": 17.72820472717285, "learning_rate": 5.865791773197119e-07, "logits/chosen": -0.644040584564209, "logits/rejected": -0.662616491317749, "logps/chosen": -1.0962177515029907, "logps/rejected": -1.333801507949829, "loss": 2.3523, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.1924355030059814, "rewards/margins": 0.47516757249832153, "rewards/rejected": -2.667603015899658, "step": 90 }, { "epoch": 0.2094240837696335, "grad_norm": 18.668445587158203, "learning_rate": 5.793194166900525e-07, "logits/chosen": -0.6190682053565979, "logits/rejected": -0.6458654403686523, "logps/chosen": -1.0701862573623657, "logps/rejected": -1.314082145690918, "loss": 2.4183, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1403725147247314, "rewards/margins": 0.48779162764549255, "rewards/rejected": -2.628164291381836, "step": 100 }, { "epoch": 0.23036649214659685, "grad_norm": 23.80174446105957, "learning_rate": 5.705624133909468e-07, "logits/chosen": -0.6541261672973633, "logits/rejected": -0.6746215224266052, "logps/chosen": -1.1125367879867554, "logps/rejected": -1.4309237003326416, "loss": 2.2568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.2250735759735107, "rewards/margins": 0.6367737054824829, "rewards/rejected": -2.861847400665283, "step": 110 }, { "epoch": 0.2513089005235602, "grad_norm": 20.943939208984375, "learning_rate": 5.603551078003725e-07, "logits/chosen": -0.682715892791748, "logits/rejected": -0.6975741386413574, "logps/chosen": -1.1100105047225952, "logps/rejected": -1.4905935525894165, "loss": 2.3622, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.2200210094451904, "rewards/margins": 0.7611659169197083, "rewards/rejected": -2.981187105178833, "step": 120 }, { "epoch": 0.27225130890052357, "grad_norm": 15.774776458740234, "learning_rate": 5.487522143869884e-07, "logits/chosen": -0.6576386094093323, "logits/rejected": -0.6527346968650818, "logps/chosen": -1.1969980001449585, "logps/rejected": -1.3874703645706177, "loss": 2.3036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.393996000289917, "rewards/margins": 0.38094475865364075, "rewards/rejected": -2.7749407291412354, "step": 130 }, { "epoch": 0.2931937172774869, "grad_norm": 22.345949172973633, "learning_rate": 5.358159284228362e-07, "logits/chosen": -0.6398555040359497, "logits/rejected": -0.6468138694763184, "logps/chosen": -1.1364694833755493, "logps/rejected": -1.3303481340408325, "loss": 2.4059, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.2729389667510986, "rewards/margins": 0.38775745034217834, "rewards/rejected": -2.660696268081665, "step": 140 }, { "epoch": 0.31413612565445026, "grad_norm": 16.85527801513672, "learning_rate": 5.216155925965094e-07, "logits/chosen": -0.673059344291687, "logits/rejected": -0.6615468263626099, "logps/chosen": -1.0914809703826904, "logps/rejected": -1.4301069974899292, "loss": 2.2954, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.182961940765381, "rewards/margins": 0.6772524118423462, "rewards/rejected": -2.8602139949798584, "step": 150 }, { "epoch": 0.33507853403141363, "grad_norm": 21.642011642456055, "learning_rate": 5.062273253138518e-07, "logits/chosen": -0.6647015810012817, "logits/rejected": -0.6685549020767212, "logps/chosen": -1.1738396883010864, "logps/rejected": -1.4873461723327637, "loss": 2.2683, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.347679376602173, "rewards/margins": 0.6270130276679993, "rewards/rejected": -2.9746923446655273, "step": 160 }, { "epoch": 0.35602094240837695, "grad_norm": 28.569908142089844, "learning_rate": 4.897336126786132e-07, "logits/chosen": -0.6699012517929077, "logits/rejected": -0.679189920425415, "logps/chosen": -1.1100311279296875, "logps/rejected": -1.4672223329544067, "loss": 2.2618, "rewards/accuracies": 0.625, "rewards/chosen": -2.220062255859375, "rewards/margins": 0.7143827080726624, "rewards/rejected": -2.9344446659088135, "step": 170 }, { "epoch": 0.3769633507853403, "grad_norm": 34.985107421875, "learning_rate": 4.722228663401794e-07, "logits/chosen": -0.6271170377731323, "logits/rejected": -0.6412331461906433, "logps/chosen": -1.1353423595428467, "logps/rejected": -1.403346300125122, "loss": 2.2445, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.2706847190856934, "rewards/margins": 0.5360078811645508, "rewards/rejected": -2.806692600250244, "step": 180 }, { "epoch": 0.39790575916230364, "grad_norm": 20.208932876586914, "learning_rate": 4.537889495784557e-07, "logits/chosen": -0.6816627383232117, "logits/rejected": -0.6941916942596436, "logps/chosen": -1.2068521976470947, "logps/rejected": -1.6221401691436768, "loss": 2.1744, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.4137043952941895, "rewards/margins": 0.8305760622024536, "rewards/rejected": -3.2442803382873535, "step": 190 }, { "epoch": 0.418848167539267, "grad_norm": 20.75760841369629, "learning_rate": 4.345306741662423e-07, "logits/chosen": -0.6786922216415405, "logits/rejected": -0.6947230100631714, "logps/chosen": -1.2382934093475342, "logps/rejected": -1.6950000524520874, "loss": 2.1958, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.4765868186950684, "rewards/margins": 0.9134136438369751, "rewards/rejected": -3.390000104904175, "step": 200 }, { "epoch": 0.418848167539267, "eval_logits/chosen": -0.6647953987121582, "eval_logits/rejected": -0.6653565764427185, "eval_logps/chosen": -1.2070682048797607, "eval_logps/rejected": -1.6066378355026245, "eval_loss": 1.0817621946334839, "eval_rewards/accuracies": 0.6935483813285828, "eval_rewards/chosen": -2.4141364097595215, "eval_rewards/margins": 0.7991395592689514, "eval_rewards/rejected": -3.213275671005249, "eval_runtime": 25.1854, "eval_samples_per_second": 79.411, "eval_steps_per_second": 1.271, "step": 200 }, { "epoch": 0.4397905759162304, "grad_norm": 20.93620491027832, "learning_rate": 4.145512707060832e-07, "logits/chosen": -0.6518492698669434, "logits/rejected": -0.656699001789093, "logps/chosen": -1.2251121997833252, "logps/rejected": -1.6555721759796143, "loss": 2.2519, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.4502243995666504, "rewards/margins": 0.8609197735786438, "rewards/rejected": -3.3111443519592285, "step": 210 }, { "epoch": 0.4607329842931937, "grad_norm": 25.06297492980957, "learning_rate": 3.939578352807537e-07, "logits/chosen": -0.6913427710533142, "logits/rejected": -0.6714206337928772, "logps/chosen": -1.3224003314971924, "logps/rejected": -1.7003147602081299, "loss": 2.2159, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.6448006629943848, "rewards/margins": 0.7558287382125854, "rewards/rejected": -3.4006295204162598, "step": 220 }, { "epoch": 0.4816753926701571, "grad_norm": 32.439979553222656, "learning_rate": 3.7286075538352106e-07, "logits/chosen": -0.686276376247406, "logits/rejected": -0.6746798753738403, "logps/chosen": -1.322486162185669, "logps/rejected": -1.6524722576141357, "loss": 2.164, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.644972324371338, "rewards/margins": 0.6599723696708679, "rewards/rejected": -3.3049445152282715, "step": 230 }, { "epoch": 0.5026178010471204, "grad_norm": 21.72540855407715, "learning_rate": 3.5137311820537683e-07, "logits/chosen": -0.6239339113235474, "logits/rejected": -0.6532629132270813, "logps/chosen": -1.204216718673706, "logps/rejected": -1.6880419254302979, "loss": 2.1821, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.408433437347412, "rewards/margins": 0.9676502346992493, "rewards/rejected": -3.3760838508605957, "step": 240 }, { "epoch": 0.5235602094240838, "grad_norm": 27.567710876464844, "learning_rate": 3.296101044510136e-07, "logits/chosen": -0.6909536123275757, "logits/rejected": -0.6883527040481567, "logps/chosen": -1.2906936407089233, "logps/rejected": -1.750509262084961, "loss": 2.1408, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.5813872814178467, "rewards/margins": 0.9196311235427856, "rewards/rejected": -3.501018524169922, "step": 250 }, { "epoch": 0.5445026178010471, "grad_norm": 32.868133544921875, "learning_rate": 3.076883709328898e-07, "logits/chosen": -0.6995635628700256, "logits/rejected": -0.6956952214241028, "logps/chosen": -1.3284438848495483, "logps/rejected": -1.7075936794281006, "loss": 2.2316, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.6568877696990967, "rewards/margins": 0.758299708366394, "rewards/rejected": -3.415187358856201, "step": 260 }, { "epoch": 0.5654450261780105, "grad_norm": 62.45464324951172, "learning_rate": 2.857254252528773e-07, "logits/chosen": -0.691063404083252, "logits/rejected": -0.6710867285728455, "logps/chosen": -1.402073621749878, "logps/rejected": -1.8776384592056274, "loss": 2.1945, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.804147243499756, "rewards/margins": 0.9511295557022095, "rewards/rejected": -3.755276918411255, "step": 270 }, { "epoch": 0.5863874345549738, "grad_norm": 38.48459243774414, "learning_rate": 2.638389959234031e-07, "logits/chosen": -0.6948752403259277, "logits/rejected": -0.6988323330879211, "logps/chosen": -1.332465648651123, "logps/rejected": -2.0018393993377686, "loss": 2.0703, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.664931297302246, "rewards/margins": 1.3387477397918701, "rewards/rejected": -4.003678798675537, "step": 280 }, { "epoch": 0.6073298429319371, "grad_norm": 56.83732604980469, "learning_rate": 2.421464013044373e-07, "logits/chosen": -0.7247036099433899, "logits/rejected": -0.7209808230400085, "logps/chosen": -1.400423526763916, "logps/rejected": -1.9026374816894531, "loss": 2.1931, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.800847053527832, "rewards/margins": 1.0044279098510742, "rewards/rejected": -3.8052749633789062, "step": 290 }, { "epoch": 0.6282722513089005, "grad_norm": 31.81602668762207, "learning_rate": 2.2076392073903244e-07, "logits/chosen": -0.7202259302139282, "logits/rejected": -0.7023282647132874, "logps/chosen": -1.4328830242156982, "logps/rejected": -1.7894738912582397, "loss": 2.1941, "rewards/accuracies": 0.65625, "rewards/chosen": -2.8657660484313965, "rewards/margins": 0.7131820321083069, "rewards/rejected": -3.5789477825164795, "step": 300 }, { "epoch": 0.6492146596858639, "grad_norm": 34.869110107421875, "learning_rate": 1.9980617125832958e-07, "logits/chosen": -0.7520186305046082, "logits/rejected": -0.7353061437606812, "logps/chosen": -1.3960636854171753, "logps/rejected": -1.8475030660629272, "loss": 2.0703, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.7921273708343506, "rewards/margins": 0.9028788805007935, "rewards/rejected": -3.6950061321258545, "step": 310 }, { "epoch": 0.6701570680628273, "grad_norm": 33.654422760009766, "learning_rate": 1.7938549319709663e-07, "logits/chosen": -0.7805129289627075, "logits/rejected": -0.75370854139328, "logps/chosen": -1.513319492340088, "logps/rejected": -2.104207992553711, "loss": 2.0994, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.026638984680176, "rewards/margins": 1.181776762008667, "rewards/rejected": -4.208415985107422, "step": 320 }, { "epoch": 0.6910994764397905, "grad_norm": 34.889400482177734, "learning_rate": 1.5961134801309614e-07, "logits/chosen": -0.7453498840332031, "logits/rejected": -0.7288404107093811, "logps/chosen": -1.5413320064544678, "logps/rejected": -2.1900875568389893, "loss": 2.1361, "rewards/accuracies": 0.6875, "rewards/chosen": -3.0826640129089355, "rewards/margins": 1.2975105047225952, "rewards/rejected": -4.3801751136779785, "step": 330 }, { "epoch": 0.7120418848167539, "grad_norm": 58.18461227416992, "learning_rate": 1.4058973153816886e-07, "logits/chosen": -0.7768866419792175, "logits/rejected": -0.7656865119934082, "logps/chosen": -1.6624263525009155, "logps/rejected": -2.255127429962158, "loss": 2.1658, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.324852705001831, "rewards/margins": 1.1854020357131958, "rewards/rejected": -4.510254859924316, "step": 340 }, { "epoch": 0.7329842931937173, "grad_norm": 55.78643798828125, "learning_rate": 1.2242260580619538e-07, "logits/chosen": -0.7563885450363159, "logits/rejected": -0.7533862590789795, "logps/chosen": -1.6073474884033203, "logps/rejected": -2.1334166526794434, "loss": 2.0949, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.2146949768066406, "rewards/margins": 1.052138090133667, "rewards/rejected": -4.266833305358887, "step": 350 }, { "epoch": 0.7539267015706806, "grad_norm": 39.962528228759766, "learning_rate": 1.0520735250352405e-07, "logits/chosen": -0.703062891960144, "logits/rejected": -0.702917218208313, "logps/chosen": -1.4934108257293701, "logps/rejected": -2.0725302696228027, "loss": 2.1009, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.9868216514587402, "rewards/margins": 1.1582386493682861, "rewards/rejected": -4.1450605392456055, "step": 360 }, { "epoch": 0.774869109947644, "grad_norm": 36.544063568115234, "learning_rate": 8.903625097154667e-08, "logits/chosen": -0.7489289045333862, "logits/rejected": -0.7575147747993469, "logps/chosen": -1.3683464527130127, "logps/rejected": -2.0208771228790283, "loss": 1.9718, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7366929054260254, "rewards/margins": 1.3050611019134521, "rewards/rejected": -4.041754245758057, "step": 370 }, { "epoch": 0.7958115183246073, "grad_norm": 53.06337356567383, "learning_rate": 7.399598355949822e-08, "logits/chosen": -0.7504714727401733, "logits/rejected": -0.7333730459213257, "logps/chosen": -1.5020231008529663, "logps/rejected": -1.9611724615097046, "loss": 2.1079, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.0040462017059326, "rewards/margins": 0.9182990789413452, "rewards/rejected": -3.922344923019409, "step": 380 }, { "epoch": 0.8167539267015707, "grad_norm": 32.34917449951172, "learning_rate": 6.01671709789497e-08, "logits/chosen": -0.761965811252594, "logits/rejected": -0.7571056485176086, "logps/chosen": -1.473826289176941, "logps/rejected": -1.9839645624160767, "loss": 2.0849, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.947652578353882, "rewards/margins": 1.020276427268982, "rewards/rejected": -3.9679291248321533, "step": 390 }, { "epoch": 0.837696335078534, "grad_norm": 57.54490280151367, "learning_rate": 4.76239401506456e-08, "logits/chosen": -0.7651081681251526, "logits/rejected": -0.7489873170852661, "logps/chosen": -1.4767208099365234, "logps/rejected": -1.997377634048462, "loss": 2.1186, "rewards/accuracies": 0.6875, "rewards/chosen": -2.953441619873047, "rewards/margins": 1.0413135290145874, "rewards/rejected": -3.994755268096924, "step": 400 }, { "epoch": 0.837696335078534, "eval_logits/chosen": -0.75456303358078, "eval_logits/rejected": -0.7413101196289062, "eval_logps/chosen": -1.4940153360366821, "eval_logps/rejected": -2.0286271572113037, "eval_loss": 1.0269173383712769, "eval_rewards/accuracies": 0.7379032373428345, "eval_rewards/chosen": -2.9880306720733643, "eval_rewards/margins": 1.0692235231399536, "eval_rewards/rejected": -4.057254314422607, "eval_runtime": 24.6202, "eval_samples_per_second": 81.234, "eval_steps_per_second": 1.3, "step": 400 }, { "epoch": 0.8586387434554974, "grad_norm": 47.848445892333984, "learning_rate": 3.643352686016596e-08, "logits/chosen": -0.790420651435852, "logits/rejected": -0.7755736112594604, "logps/chosen": -1.5548263788223267, "logps/rejected": -2.1035819053649902, "loss": 2.047, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.1096527576446533, "rewards/margins": 1.0975114107131958, "rewards/rejected": -4.2071638107299805, "step": 410 }, { "epoch": 0.8795811518324608, "grad_norm": 35.658634185791016, "learning_rate": 2.665591535230738e-08, "logits/chosen": -0.7204955220222473, "logits/rejected": -0.6889880895614624, "logps/chosen": -1.4325242042541504, "logps/rejected": -2.0371756553649902, "loss": 2.0869, "rewards/accuracies": 0.71875, "rewards/chosen": -2.865048408508301, "rewards/margins": 1.2093026638031006, "rewards/rejected": -4.0743513107299805, "step": 420 }, { "epoch": 0.900523560209424, "grad_norm": 46.57322311401367, "learning_rate": 1.834351679607603e-08, "logits/chosen": -0.7658016681671143, "logits/rejected": -0.7545614838600159, "logps/chosen": -1.573411226272583, "logps/rejected": -2.168039321899414, "loss": 2.0035, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.146822452545166, "rewards/margins": 1.1892563104629517, "rewards/rejected": -4.336078643798828, "step": 430 }, { "epoch": 0.9214659685863874, "grad_norm": 49.09744644165039, "learning_rate": 1.1540888343822164e-08, "logits/chosen": -0.7819451093673706, "logits/rejected": -0.7810377478599548, "logps/chosen": -1.533359408378601, "logps/rejected": -1.9900741577148438, "loss": 2.0797, "rewards/accuracies": 0.65625, "rewards/chosen": -3.066718816757202, "rewards/margins": 0.9134290814399719, "rewards/rejected": -3.9801483154296875, "step": 440 }, { "epoch": 0.9424083769633508, "grad_norm": 30.21855926513672, "learning_rate": 6.284494290451603e-09, "logits/chosen": -0.7444645762443542, "logits/rejected": -0.7367347478866577, "logps/chosen": -1.6120609045028687, "logps/rejected": -2.1023104190826416, "loss": 2.0586, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.2241218090057373, "rewards/margins": 0.9804989099502563, "rewards/rejected": -4.204620838165283, "step": 450 }, { "epoch": 0.9633507853403142, "grad_norm": 48.88844680786133, "learning_rate": 2.6025106129779263e-09, "logits/chosen": -0.7713245153427124, "logits/rejected": -0.763340950012207, "logps/chosen": -1.6260992288589478, "logps/rejected": -2.057934284210205, "loss": 2.1323, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.2521984577178955, "rewards/margins": 0.863669753074646, "rewards/rejected": -4.11586856842041, "step": 460 }, { "epoch": 0.9842931937172775, "grad_norm": 30.39113426208496, "learning_rate": 5.146739381471921e-10, "logits/chosen": -0.742178201675415, "logits/rejected": -0.7218775749206543, "logps/chosen": -1.443922996520996, "logps/rejected": -2.284044027328491, "loss": 2.0688, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.887845993041992, "rewards/margins": 1.6802421808242798, "rewards/rejected": -4.568088054656982, "step": 470 }, { "epoch": 0.9989528795811519, "step": 477, "total_flos": 0.0, "train_loss": 2.230338767889411, "train_runtime": 2802.8273, "train_samples_per_second": 21.812, "train_steps_per_second": 0.17 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }