{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989528795811519, "eval_steps": 200, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020942408376963353, "grad_norm": 12262.6455078125, "learning_rate": 0.0, "logits/chosen": -0.6038292050361633, "logits/rejected": -0.6174172163009644, "logps/chosen": -275.28570556640625, "logps/rejected": -222.9645233154297, "loss": 3043.0391, "rewards/accuracies": 0.46875, "rewards/chosen": -275.28570556640625, "rewards/margins": -52.3211669921875, "rewards/rejected": -222.9645233154297, "slic/ce_loss": 275.28570556640625, "slic/rank_loss": 105.09413146972656, "step": 1 }, { "epoch": 0.020942408376963352, "grad_norm": 11722.5625, "learning_rate": 9.375e-08, "logits/chosen": -0.6442743539810181, "logits/rejected": -0.6519261598587036, "logps/chosen": -290.2613525390625, "logps/rejected": -264.83160400390625, "loss": 3090.2092, "rewards/accuracies": 0.4626736044883728, "rewards/chosen": -290.2613525390625, "rewards/margins": -25.429737091064453, "rewards/rejected": -264.83160400390625, "slic/ce_loss": 290.2613525390625, "slic/rank_loss": 96.01480102539062, "step": 10 }, { "epoch": 0.041884816753926704, "grad_norm": 12801.0009765625, "learning_rate": 1.9791666666666664e-07, "logits/chosen": -0.6172284483909607, "logits/rejected": -0.631966769695282, "logps/chosen": -286.7103576660156, "logps/rejected": -259.05560302734375, "loss": 3052.1316, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": -286.7103576660156, "rewards/margins": -27.65475082397461, "rewards/rejected": -259.05560302734375, "slic/ce_loss": 286.7103576660156, "slic/rank_loss": 94.8061294555664, "step": 20 }, { "epoch": 0.06282722513089005, "grad_norm": 9744.6474609375, "learning_rate": 3.020833333333333e-07, "logits/chosen": -0.6371282935142517, "logits/rejected": -0.6436103582382202, "logps/chosen": -277.3744201660156, "logps/rejected": -255.4679412841797, "loss": 2954.9688, "rewards/accuracies": 0.4820312559604645, "rewards/chosen": -277.3744201660156, "rewards/margins": -21.906490325927734, "rewards/rejected": -255.4679412841797, "slic/ce_loss": 277.3744201660156, "slic/rank_loss": 91.99668884277344, "step": 30 }, { "epoch": 0.08376963350785341, "grad_norm": 8187.505859375, "learning_rate": 4.0625e-07, "logits/chosen": -0.6269849538803101, "logits/rejected": -0.6466041803359985, "logps/chosen": -279.5166320800781, "logps/rejected": -251.68496704101562, "loss": 3012.034, "rewards/accuracies": 0.47734373807907104, "rewards/chosen": -279.5166320800781, "rewards/margins": -27.8316707611084, "rewards/rejected": -251.68496704101562, "slic/ce_loss": 279.5166320800781, "slic/rank_loss": 96.98760223388672, "step": 40 }, { "epoch": 0.10471204188481675, "grad_norm": 7351.79052734375, "learning_rate": 4.999932966293553e-07, "logits/chosen": -0.6411020755767822, "logits/rejected": -0.657455563545227, "logps/chosen": -273.2268371582031, "logps/rejected": -256.38946533203125, "loss": 2900.1408, "rewards/accuracies": 0.484375, "rewards/chosen": -273.2268371582031, "rewards/margins": -16.837379455566406, "rewards/rejected": -256.38946533203125, "slic/ce_loss": 273.2268371582031, "slic/rank_loss": 89.29072570800781, "step": 50 }, { "epoch": 0.1256544502617801, "grad_norm": 6973.84375, "learning_rate": 4.991893270335525e-07, "logits/chosen": -0.6497636437416077, "logits/rejected": -0.6595814228057861, "logps/chosen": -261.78167724609375, "logps/rejected": -248.3544921875, "loss": 2815.4137, "rewards/accuracies": 0.50390625, "rewards/chosen": -261.78167724609375, "rewards/margins": -13.427162170410156, "rewards/rejected": -248.3544921875, "slic/ce_loss": 261.78167724609375, "slic/rank_loss": 90.14505767822266, "step": 60 }, { "epoch": 0.14659685863874344, "grad_norm": 7103.94580078125, "learning_rate": 4.970496218214204e-07, "logits/chosen": -0.6443999409675598, "logits/rejected": -0.6562803983688354, "logps/chosen": -261.08099365234375, "logps/rejected": -245.8149871826172, "loss": 2767.8164, "rewards/accuracies": 0.48828125, "rewards/chosen": -261.08099365234375, "rewards/margins": -15.265989303588867, "rewards/rejected": -245.8149871826172, "slic/ce_loss": 261.08099365234375, "slic/rank_loss": 84.89605712890625, "step": 70 }, { "epoch": 0.16753926701570682, "grad_norm": 6954.5859375, "learning_rate": 4.935856505068998e-07, "logits/chosen": -0.6128605008125305, "logits/rejected": -0.6215260028839111, "logps/chosen": -262.12835693359375, "logps/rejected": -246.1211395263672, "loss": 2764.8988, "rewards/accuracies": 0.47578126192092896, "rewards/chosen": -262.12835693359375, "rewards/margins": -16.00722885131836, "rewards/rejected": -246.1211395263672, "slic/ce_loss": 262.12835693359375, "slic/rank_loss": 83.48396301269531, "step": 80 }, { "epoch": 0.18848167539267016, "grad_norm": 6543.72314453125, "learning_rate": 4.8881598109976e-07, "logits/chosen": -0.6393535733222961, "logits/rejected": -0.642610490322113, "logps/chosen": -259.01385498046875, "logps/rejected": -244.76968383789062, "loss": 2757.5949, "rewards/accuracies": 0.5, "rewards/chosen": -259.01385498046875, "rewards/margins": -14.244140625, "rewards/rejected": -244.76968383789062, "slic/ce_loss": 259.01385498046875, "slic/rank_loss": 85.68550872802734, "step": 90 }, { "epoch": 0.2094240837696335, "grad_norm": 7359.39697265625, "learning_rate": 4.827661805750437e-07, "logits/chosen": -0.6121981739997864, "logits/rejected": -0.6247469782829285, "logps/chosen": -260.1445007324219, "logps/rejected": -240.54080200195312, "loss": 2780.1023, "rewards/accuracies": 0.500781238079071, "rewards/chosen": -260.1445007324219, "rewards/margins": -19.60370445251465, "rewards/rejected": -240.54080200195312, "slic/ce_loss": 260.1445007324219, "slic/rank_loss": 87.36830139160156, "step": 100 }, { "epoch": 0.23036649214659685, "grad_norm": 6633.2919921875, "learning_rate": 4.75468677825789e-07, "logits/chosen": -0.6110386252403259, "logits/rejected": -0.6201988458633423, "logps/chosen": -259.8690185546875, "logps/rejected": -244.67117309570312, "loss": 2769.8471, "rewards/accuracies": 0.47734373807907104, "rewards/chosen": -259.8690185546875, "rewards/margins": -15.197855949401855, "rewards/rejected": -244.67117309570312, "slic/ce_loss": 259.8690185546875, "slic/rank_loss": 86.36186981201172, "step": 110 }, { "epoch": 0.2513089005235602, "grad_norm": 6849.99609375, "learning_rate": 4.669625898336438e-07, "logits/chosen": -0.6245466470718384, "logits/rejected": -0.6278253197669983, "logps/chosen": -264.4799499511719, "logps/rejected": -248.22763061523438, "loss": 2824.259, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -264.4799499511719, "rewards/margins": -16.252330780029297, "rewards/rejected": -248.22763061523438, "slic/ce_loss": 264.4799499511719, "slic/rank_loss": 88.55240631103516, "step": 120 }, { "epoch": 0.27225130890052357, "grad_norm": 6854.18701171875, "learning_rate": 4.5729351198915705e-07, "logits/chosen": -0.6144854426383972, "logits/rejected": -0.6145707368850708, "logps/chosen": -263.3558044433594, "logps/rejected": -245.08395385742188, "loss": 2830.0254, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": -263.3558044433594, "rewards/margins": -18.271860122680664, "rewards/rejected": -245.08395385742188, "slic/ce_loss": 263.3558044433594, "slic/rank_loss": 90.39739227294922, "step": 130 }, { "epoch": 0.2931937172774869, "grad_norm": 7111.4072265625, "learning_rate": 4.4651327368569684e-07, "logits/chosen": -0.6158766150474548, "logits/rejected": -0.610289454460144, "logps/chosen": -265.9961853027344, "logps/rejected": -250.8537139892578, "loss": 2811.3402, "rewards/accuracies": 0.4984374940395355, "rewards/chosen": -265.9961853027344, "rewards/margins": -15.142511367797852, "rewards/rejected": -250.8537139892578, "slic/ce_loss": 265.9961853027344, "slic/rank_loss": 85.42132568359375, "step": 140 }, { "epoch": 0.31413612565445026, "grad_norm": 6560.322265625, "learning_rate": 4.346796604970912e-07, "logits/chosen": -0.6126202344894409, "logits/rejected": -0.6171335577964783, "logps/chosen": -262.45489501953125, "logps/rejected": -238.64248657226562, "loss": 2792.7324, "rewards/accuracies": 0.46015626192092896, "rewards/chosen": -262.45489501953125, "rewards/margins": -23.812393188476562, "rewards/rejected": -238.64248657226562, "slic/ce_loss": 262.45489501953125, "slic/rank_loss": 86.63667297363281, "step": 150 }, { "epoch": 0.33507853403141363, "grad_norm": 6536.52099609375, "learning_rate": 4.218561044282098e-07, "logits/chosen": -0.615364670753479, "logits/rejected": -0.6180033087730408, "logps/chosen": -260.1828308105469, "logps/rejected": -246.5723114013672, "loss": 2790.3223, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -260.1828308105469, "rewards/margins": -13.610522270202637, "rewards/rejected": -246.5723114013672, "slic/ce_loss": 260.1828308105469, "slic/rank_loss": 88.60743713378906, "step": 160 }, { "epoch": 0.35602094240837695, "grad_norm": 6896.39892578125, "learning_rate": 4.081113438988443e-07, "logits/chosen": -0.6077988147735596, "logits/rejected": -0.6157752871513367, "logps/chosen": -264.1897277832031, "logps/rejected": -232.72091674804688, "loss": 2870.3471, "rewards/accuracies": 0.46406251192092896, "rewards/chosen": -264.1897277832031, "rewards/margins": -31.468795776367188, "rewards/rejected": -232.72091674804688, "slic/ce_loss": 264.1897277832031, "slic/rank_loss": 94.60362243652344, "step": 170 }, { "epoch": 0.3769633507853403, "grad_norm": 6520.38671875, "learning_rate": 3.935190552834828e-07, "logits/chosen": -0.6066499352455139, "logits/rejected": -0.6182885766029358, "logps/chosen": -263.59375, "logps/rejected": -244.91696166992188, "loss": 2795.4867, "rewards/accuracies": 0.4867187440395355, "rewards/chosen": -263.59375, "rewards/margins": -18.676807403564453, "rewards/rejected": -244.91696166992188, "slic/ce_loss": 263.59375, "slic/rank_loss": 85.84205627441406, "step": 180 }, { "epoch": 0.39790575916230364, "grad_norm": 6230.771484375, "learning_rate": 3.781574579820464e-07, "logits/chosen": -0.6069104075431824, "logits/rejected": -0.62060546875, "logps/chosen": -261.4915466308594, "logps/rejected": -239.55990600585938, "loss": 2811.309, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -261.4915466308594, "rewards/margins": -21.9316349029541, "rewards/rejected": -239.55990600585938, "slic/ce_loss": 261.4915466308594, "slic/rank_loss": 89.92210388183594, "step": 190 }, { "epoch": 0.418848167539267, "grad_norm": 6762.1396484375, "learning_rate": 3.621088951385353e-07, "logits/chosen": -0.6015563011169434, "logits/rejected": -0.6054785251617432, "logps/chosen": -257.34716796875, "logps/rejected": -241.6367950439453, "loss": 2735.9918, "rewards/accuracies": 0.49531251192092896, "rewards/chosen": -257.34716796875, "rewards/margins": -15.71037483215332, "rewards/rejected": -241.6367950439453, "slic/ce_loss": 257.34716796875, "slic/rank_loss": 84.6518325805664, "step": 200 }, { "epoch": 0.418848167539267, "eval_logits/chosen": -0.6110028028488159, "eval_logits/rejected": -0.6186715364456177, "eval_logps/chosen": -262.1006164550781, "eval_logps/rejected": -246.28273010253906, "eval_loss": 345.5438232421875, "eval_rewards/accuracies": 0.4884999990463257, "eval_rewards/chosen": -262.1006164550781, "eval_rewards/margins": -15.81789779663086, "eval_rewards/rejected": -246.28273010253906, "eval_runtime": 42.8327, "eval_samples_per_second": 46.693, "eval_slic/ce_loss": 262.1006164550781, "eval_slic/rank_loss": 83.44320678710938, "eval_steps_per_second": 2.918, "step": 200 }, { "epoch": 0.4397905759162304, "grad_norm": 6237.728515625, "learning_rate": 3.454593922550693e-07, "logits/chosen": -0.6002607941627502, "logits/rejected": -0.600605309009552, "logps/chosen": -262.0538024902344, "logps/rejected": -250.4560546875, "loss": 2777.6777, "rewards/accuracies": 0.47968751192092896, "rewards/chosen": -262.0538024902344, "rewards/margins": -11.597768783569336, "rewards/rejected": -250.4560546875, "slic/ce_loss": 262.0538024902344, "slic/rank_loss": 85.15589904785156, "step": 210 }, { "epoch": 0.4607329842931937, "grad_norm": 6908.84033203125, "learning_rate": 3.2829819606729477e-07, "logits/chosen": -0.599699854850769, "logits/rejected": -0.6079216599464417, "logps/chosen": -268.8377380371094, "logps/rejected": -252.35330200195312, "loss": 2872.3611, "rewards/accuracies": 0.4820312559604645, "rewards/chosen": -268.8377380371094, "rewards/margins": -16.484455108642578, "rewards/rejected": -252.35330200195312, "slic/ce_loss": 268.8377380371094, "slic/rank_loss": 90.20738983154297, "step": 220 }, { "epoch": 0.4816753926701571, "grad_norm": 6286.37451171875, "learning_rate": 3.1071729615293424e-07, "logits/chosen": -0.593070924282074, "logits/rejected": -0.6033838987350464, "logps/chosen": -256.0763244628906, "logps/rejected": -239.1165771484375, "loss": 2713.8352, "rewards/accuracies": 0.484375, "rewards/chosen": -256.0763244628906, "rewards/margins": -16.959781646728516, "rewards/rejected": -239.1165771484375, "slic/ce_loss": 256.0763244628906, "slic/rank_loss": 83.153076171875, "step": 230 }, { "epoch": 0.5026178010471204, "grad_norm": 6890.95263671875, "learning_rate": 2.9281093183781403e-07, "logits/chosen": -0.5985504388809204, "logits/rejected": -0.6077064275741577, "logps/chosen": -262.922607421875, "logps/rejected": -244.4534454345703, "loss": 2804.6604, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -262.922607421875, "rewards/margins": -18.46915626525879, "rewards/rejected": -244.4534454345703, "slic/ce_loss": 262.922607421875, "slic/rank_loss": 87.6599349975586, "step": 240 }, { "epoch": 0.5235602094240838, "grad_norm": 6481.29931640625, "learning_rate": 2.7467508704251135e-07, "logits/chosen": -0.5857258439064026, "logits/rejected": -0.5922163128852844, "logps/chosen": -261.371826171875, "logps/rejected": -238.2184295654297, "loss": 2811.9553, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -261.371826171875, "rewards/margins": -23.153379440307617, "rewards/rejected": -238.2184295654297, "slic/ce_loss": 261.371826171875, "slic/rank_loss": 90.12258911132812, "step": 250 }, { "epoch": 0.5445026178010471, "grad_norm": 6607.6845703125, "learning_rate": 2.5640697577740815e-07, "logits/chosen": -0.5988560914993286, "logits/rejected": -0.5961240530014038, "logps/chosen": -261.5967712402344, "logps/rejected": -237.8933868408203, "loss": 2822.6381, "rewards/accuracies": 0.46484375, "rewards/chosen": -261.5967712402344, "rewards/margins": -23.703397750854492, "rewards/rejected": -237.8933868408203, "slic/ce_loss": 261.5967712402344, "slic/rank_loss": 91.23295593261719, "step": 260 }, { "epoch": 0.5654450261780105, "grad_norm": 6657.15087890625, "learning_rate": 2.381045210440644e-07, "logits/chosen": -0.582733154296875, "logits/rejected": -0.5935451984405518, "logps/chosen": -254.5479278564453, "logps/rejected": -237.6572265625, "loss": 2701.4529, "rewards/accuracies": 0.47968751192092896, "rewards/chosen": -254.5479278564453, "rewards/margins": -16.890687942504883, "rewards/rejected": -237.6572265625, "slic/ce_loss": 254.5479278564453, "slic/rank_loss": 83.13374328613281, "step": 270 }, { "epoch": 0.5863874345549738, "grad_norm": 6212.56103515625, "learning_rate": 2.1986582993616925e-07, "logits/chosen": -0.5971206426620483, "logits/rejected": -0.598262369632721, "logps/chosen": -253.74880981445312, "logps/rejected": -241.04623413085938, "loss": 2685.0725, "rewards/accuracies": 0.48906248807907104, "rewards/chosen": -253.74880981445312, "rewards/margins": -12.702553749084473, "rewards/rejected": -241.04623413085938, "slic/ce_loss": 253.74880981445312, "slic/rank_loss": 81.88532257080078, "step": 280 }, { "epoch": 0.6073298429319371, "grad_norm": 6822.04150390625, "learning_rate": 2.0178866775369774e-07, "logits/chosen": -0.5831255316734314, "logits/rejected": -0.5880999565124512, "logps/chosen": -268.67706298828125, "logps/rejected": -250.81631469726562, "loss": 2880.4166, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -268.67706298828125, "rewards/margins": -17.860719680786133, "rewards/rejected": -250.81631469726562, "slic/ce_loss": 268.67706298828125, "slic/rank_loss": 91.37500762939453, "step": 290 }, { "epoch": 0.6282722513089005, "grad_norm": 6906.6796875, "learning_rate": 1.839699339491937e-07, "logits/chosen": -0.5904260277748108, "logits/rejected": -0.5913136005401611, "logps/chosen": -255.6902618408203, "logps/rejected": -247.8364715576172, "loss": 2685.1258, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -255.6902618408203, "rewards/margins": -7.853767395019531, "rewards/rejected": -247.8364715576172, "slic/ce_loss": 255.6902618408203, "slic/rank_loss": 79.95047760009766, "step": 300 }, { "epoch": 0.6492146596858639, "grad_norm": 6676.84130859375, "learning_rate": 1.6650514271527465e-07, "logits/chosen": -0.5759958028793335, "logits/rejected": -0.5911142826080322, "logps/chosen": -258.6521301269531, "logps/rejected": -238.955322265625, "loss": 2770.6453, "rewards/accuracies": 0.4742187559604645, "rewards/chosen": -258.6521301269531, "rewards/margins": -19.696758270263672, "rewards/rejected": -238.955322265625, "slic/ce_loss": 258.6521301269531, "slic/rank_loss": 87.67857360839844, "step": 310 }, { "epoch": 0.6701570680628273, "grad_norm": 7249.5908203125, "learning_rate": 1.4948791099758052e-07, "logits/chosen": -0.6019054651260376, "logits/rejected": -0.5995901226997375, "logps/chosen": -263.072021484375, "logps/rejected": -240.22134399414062, "loss": 2814.36, "rewards/accuracies": 0.48515623807907104, "rewards/chosen": -263.072021484375, "rewards/margins": -22.85066795349121, "rewards/rejected": -240.22134399414062, "slic/ce_loss": 263.072021484375, "slic/rank_loss": 88.72297668457031, "step": 320 }, { "epoch": 0.6910994764397905, "grad_norm": 6414.8857421875, "learning_rate": 1.3300945667758012e-07, "logits/chosen": -0.5962297320365906, "logits/rejected": -0.5947962999343872, "logps/chosen": -258.00311279296875, "logps/rejected": -244.7356719970703, "loss": 2729.925, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -258.00311279296875, "rewards/margins": -13.2674560546875, "rewards/rejected": -244.7356719970703, "slic/ce_loss": 258.00311279296875, "slic/rank_loss": 83.2375259399414, "step": 330 }, { "epoch": 0.7120418848167539, "grad_norm": 5969.27587890625, "learning_rate": 1.1715810961514072e-07, "logits/chosen": -0.5996378660202026, "logits/rejected": -0.5939691662788391, "logps/chosen": -251.0337677001953, "logps/rejected": -234.95639038085938, "loss": 2683.643, "rewards/accuracies": 0.4859375059604645, "rewards/chosen": -251.0337677001953, "rewards/margins": -16.077373504638672, "rewards/rejected": -234.95639038085938, "slic/ce_loss": 251.0337677001953, "slic/rank_loss": 84.42159271240234, "step": 340 }, { "epoch": 0.7329842931937173, "grad_norm": 8791.7958984375, "learning_rate": 1.0201883817182949e-07, "logits/chosen": -0.6010726094245911, "logits/rejected": -0.6074205040931702, "logps/chosen": -265.9036865234375, "logps/rejected": -244.1355438232422, "loss": 2822.1586, "rewards/accuracies": 0.4546875059604645, "rewards/chosen": -265.9036865234375, "rewards/margins": -21.768173217773438, "rewards/rejected": -244.1355438232422, "slic/ce_loss": 265.9036865234375, "slic/rank_loss": 86.86607360839844, "step": 350 }, { "epoch": 0.7539267015706806, "grad_norm": 6849.009765625, "learning_rate": 8.76727937529367e-08, "logits/chosen": -0.6024104356765747, "logits/rejected": -0.6169945597648621, "logps/chosen": -250.9459991455078, "logps/rejected": -233.37088012695312, "loss": 2662.359, "rewards/accuracies": 0.5015624761581421, "rewards/chosen": -250.9459991455078, "rewards/margins": -17.57510757446289, "rewards/rejected": -233.37088012695312, "slic/ce_loss": 250.9459991455078, "slic/rank_loss": 81.84888458251953, "step": 360 }, { "epoch": 0.774869109947644, "grad_norm": 6163.64599609375, "learning_rate": 7.419687580962222e-08, "logits/chosen": -0.5869948863983154, "logits/rejected": -0.5933431386947632, "logps/chosen": -257.76495361328125, "logps/rejected": -240.93856811523438, "loss": 2751.2512, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -257.76495361328125, "rewards/margins": -16.826370239257812, "rewards/rejected": -240.93856811523438, "slic/ce_loss": 257.76495361328125, "slic/rank_loss": 86.14141845703125, "step": 370 }, { "epoch": 0.7958115183246073, "grad_norm": 6802.92919921875, "learning_rate": 6.166331963291519e-08, "logits/chosen": -0.598025918006897, "logits/rejected": -0.6036067008972168, "logps/chosen": -275.9155578613281, "logps/rejected": -248.60989379882812, "loss": 2926.8623, "rewards/accuracies": 0.47265625, "rewards/chosen": -275.9155578613281, "rewards/margins": -27.30564308166504, "rewards/rejected": -248.60989379882812, "slic/ce_loss": 275.9155578613281, "slic/rank_loss": 89.94223022460938, "step": 380 }, { "epoch": 0.8167539267015707, "grad_norm": 6247.5087890625, "learning_rate": 5.013930914912476e-08, "logits/chosen": -0.5993385314941406, "logits/rejected": -0.5995285511016846, "logps/chosen": -253.06851196289062, "logps/rejected": -245.85745239257812, "loss": 2641.3674, "rewards/accuracies": 0.5132812261581421, "rewards/chosen": -253.06851196289062, "rewards/margins": -7.211063385009766, "rewards/rejected": -245.85745239257812, "slic/ce_loss": 253.06851196289062, "slic/rank_loss": 77.10240936279297, "step": 390 }, { "epoch": 0.837696335078534, "grad_norm": 6252.97314453125, "learning_rate": 3.968661679220467e-08, "logits/chosen": -0.5878058075904846, "logits/rejected": -0.595999538898468, "logps/chosen": -265.92987060546875, "logps/rejected": -247.3778839111328, "loss": 2791.6219, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -265.92987060546875, "rewards/margins": -18.551965713500977, "rewards/rejected": -247.3778839111328, "slic/ce_loss": 265.92987060546875, "slic/rank_loss": 83.02286529541016, "step": 400 }, { "epoch": 0.837696335078534, "eval_logits/chosen": -0.6036794185638428, "eval_logits/rejected": -0.6097184419631958, "eval_logps/chosen": -260.79754638671875, "eval_logps/rejected": -247.10818481445312, "eval_loss": 341.8598937988281, "eval_rewards/accuracies": 0.4934999942779541, "eval_rewards/chosen": -260.79754638671875, "eval_rewards/margins": -13.689358711242676, "eval_rewards/rejected": -247.10818481445312, "eval_runtime": 42.51, "eval_samples_per_second": 47.048, "eval_slic/ce_loss": 260.79754638671875, "eval_slic/rank_loss": 81.0623550415039, "eval_steps_per_second": 2.94, "step": 400 }, { "epoch": 0.8586387434554974, "grad_norm": 5975.84033203125, "learning_rate": 3.036127238347164e-08, "logits/chosen": -0.6068440675735474, "logits/rejected": -0.6084403991699219, "logps/chosen": -263.7471008300781, "logps/rejected": -248.447021484375, "loss": 2812.4121, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -263.7471008300781, "rewards/margins": -15.300073623657227, "rewards/rejected": -248.447021484375, "slic/ce_loss": 263.7471008300781, "slic/rank_loss": 87.80433654785156, "step": 410 }, { "epoch": 0.8795811518324608, "grad_norm": 6574.27978515625, "learning_rate": 2.2213262793589482e-08, "logits/chosen": -0.6027593016624451, "logits/rejected": -0.6067181825637817, "logps/chosen": -262.3794860839844, "logps/rejected": -246.2481231689453, "loss": 2759.7773, "rewards/accuracies": 0.48515623807907104, "rewards/chosen": -262.3794860839844, "rewards/margins": -16.13137435913086, "rewards/rejected": -246.2481231689453, "slic/ce_loss": 262.3794860839844, "slic/rank_loss": 82.59269714355469, "step": 420 }, { "epoch": 0.900523560209424, "grad_norm": 6577.6103515625, "learning_rate": 1.5286263996730026e-08, "logits/chosen": -0.5887020826339722, "logits/rejected": -0.6053365468978882, "logps/chosen": -264.7728576660156, "logps/rejected": -240.94216918945312, "loss": 2829.2809, "rewards/accuracies": 0.48359376192092896, "rewards/chosen": -264.7728576660156, "rewards/margins": -23.83070945739746, "rewards/rejected": -240.94216918945312, "slic/ce_loss": 264.7728576660156, "slic/rank_loss": 88.88728332519531, "step": 430 }, { "epoch": 0.9214659685863874, "grad_norm": 6817.033203125, "learning_rate": 9.617406953185136e-09, "logits/chosen": -0.5980589985847473, "logits/rejected": -0.6065895557403564, "logps/chosen": -253.0465087890625, "logps/rejected": -241.0380859375, "loss": 2692.71, "rewards/accuracies": 0.50390625, "rewards/chosen": -253.0465087890625, "rewards/margins": -12.008459091186523, "rewards/rejected": -241.0380859375, "slic/ce_loss": 253.0465087890625, "slic/rank_loss": 83.54225158691406, "step": 440 }, { "epoch": 0.9424083769633508, "grad_norm": 6651.41357421875, "learning_rate": 5.2370785753763356e-09, "logits/chosen": -0.5972884893417358, "logits/rejected": -0.6050039529800415, "logps/chosen": -259.58685302734375, "logps/rejected": -238.9262237548828, "loss": 2777.9844, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -259.58685302734375, "rewards/margins": -20.660663604736328, "rewards/rejected": -238.9262237548828, "slic/ce_loss": 259.58685302734375, "slic/rank_loss": 87.66117095947266, "step": 450 }, { "epoch": 0.9633507853403142, "grad_norm": 7045.99609375, "learning_rate": 2.168758844148272e-09, "logits/chosen": -0.5794906616210938, "logits/rejected": -0.589801549911499, "logps/chosen": -275.762451171875, "logps/rejected": -259.4653015136719, "loss": 2896.2551, "rewards/accuracies": 0.4867187440395355, "rewards/chosen": -275.762451171875, "rewards/margins": -16.297168731689453, "rewards/rejected": -259.4653015136719, "slic/ce_loss": 275.762451171875, "slic/rank_loss": 86.26937866210938, "step": 460 }, { "epoch": 0.9842931937172775, "grad_norm": 7251.65869140625, "learning_rate": 4.288949484559934e-10, "logits/chosen": -0.5941784977912903, "logits/rejected": -0.5964524149894714, "logps/chosen": -265.3995056152344, "logps/rejected": -239.29824829101562, "loss": 2848.368, "rewards/accuracies": 0.4867187440395355, "rewards/chosen": -265.3995056152344, "rewards/margins": -26.10125160217285, "rewards/rejected": -239.29824829101562, "slic/ce_loss": 265.3995056152344, "slic/rank_loss": 90.64649963378906, "step": 470 }, { "epoch": 0.9989528795811519, "step": 477, "total_flos": 0.0, "train_loss": 2803.1413415552934, "train_runtime": 5510.6328, "train_samples_per_second": 11.094, "train_steps_per_second": 0.087 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }