{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989528795811519, "eval_steps": 200, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020942408376963353, "grad_norm": 6693.150390625, "learning_rate": 0.0, "logits/chosen": -0.5995081663131714, "logits/rejected": -0.6144353747367859, "logps/chosen": -1.1853606700897217, "logps/rejected": -1.4816904067993164, "loss": 20000.3594, "rewards/accuracies": 0.59375, "rewards/chosen": 1.3230741387815215e-05, "rewards/margins": 7.880535122239962e-06, "rewards/rejected": 5.350205356080551e-06, "step": 1 }, { "epoch": 0.020942408376963352, "grad_norm": 5701.76220703125, "learning_rate": 9.375e-08, "logits/chosen": -0.632120668888092, "logits/rejected": -0.6369706392288208, "logps/chosen": -1.1411508321762085, "logps/rejected": -1.3229453563690186, "loss": 19999.7205, "rewards/accuracies": 0.5173611044883728, "rewards/chosen": 2.9837883630534634e-06, "rewards/margins": 3.864423888444435e-06, "rewards/rejected": -8.806358096080658e-07, "step": 10 }, { "epoch": 0.041884816753926704, "grad_norm": 9201.447265625, "learning_rate": 1.9791666666666664e-07, "logits/chosen": -0.5988906621932983, "logits/rejected": -0.6298493146896362, "logps/chosen": -1.1650886535644531, "logps/rejected": -1.311722993850708, "loss": 19999.3156, "rewards/accuracies": 0.515625, "rewards/chosen": 1.1492621524666902e-05, "rewards/margins": 9.438592314836569e-06, "rewards/rejected": 2.05402784558828e-06, "step": 20 }, { "epoch": 0.06282722513089005, "grad_norm": 6738.72607421875, "learning_rate": 3.020833333333333e-07, "logits/chosen": -0.6149581670761108, "logits/rejected": -0.6070072054862976, "logps/chosen": -1.1174428462982178, "logps/rejected": -1.2004567384719849, "loss": 19997.9203, "rewards/accuracies": 0.546875, "rewards/chosen": 6.718002259731293e-05, "rewards/margins": 3.3566444471944124e-05, "rewards/rejected": 3.3613578125368804e-05, "step": 30 }, { "epoch": 0.08376963350785341, "grad_norm": 8354.3896484375, "learning_rate": 4.0625e-07, "logits/chosen": -0.6167833805084229, "logits/rejected": -0.6415922045707703, "logps/chosen": -1.1156193017959595, "logps/rejected": -1.2774651050567627, "loss": 19990.4047, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.00021335871133487672, "rewards/margins": 9.219734056387097e-05, "rewards/rejected": 0.00012116136349504814, "step": 40 }, { "epoch": 0.10471204188481675, "grad_norm": 6678.84423828125, "learning_rate": 4.999932966293553e-07, "logits/chosen": -0.6263213753700256, "logits/rejected": -0.6669066548347473, "logps/chosen": -1.0474059581756592, "logps/rejected": -1.251434326171875, "loss": 19978.1375, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0006399197736755013, "rewards/margins": 0.00036057684337720275, "rewards/rejected": 0.00027934290119446814, "step": 50 }, { "epoch": 0.1256544502617801, "grad_norm": 9391.8212890625, "learning_rate": 4.991893270335525e-07, "logits/chosen": -0.6445637345314026, "logits/rejected": -0.6592258810997009, "logps/chosen": -1.1116560697555542, "logps/rejected": -1.2951438426971436, "loss": 19958.8281, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 0.0007664122385904193, "rewards/margins": 0.00041008397238329053, "rewards/rejected": 0.0003563283826224506, "step": 60 }, { "epoch": 0.14659685863874344, "grad_norm": 9794.3427734375, "learning_rate": 4.970496218214204e-07, "logits/chosen": -0.70770663022995, "logits/rejected": -0.7134765386581421, "logps/chosen": -1.134606122970581, "logps/rejected": -1.3565254211425781, "loss": 19938.9969, "rewards/accuracies": 0.653124988079071, "rewards/chosen": 0.0003916005662176758, "rewards/margins": 0.000627793138846755, "rewards/rejected": -0.0002361925144214183, "step": 70 }, { "epoch": 0.16753926701570682, "grad_norm": 16482.955078125, "learning_rate": 4.935856505068998e-07, "logits/chosen": -0.6878141164779663, "logits/rejected": -0.6850681304931641, "logps/chosen": -1.1881755590438843, "logps/rejected": -1.5133380889892578, "loss": 19887.7562, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0005251518450677395, "rewards/margins": 0.0014378412161022425, "rewards/rejected": -0.001962993061169982, "step": 80 }, { "epoch": 0.18848167539267016, "grad_norm": 15171.9814453125, "learning_rate": 4.8881598109976e-07, "logits/chosen": -0.6935184001922607, "logits/rejected": -0.6993836164474487, "logps/chosen": -1.2852985858917236, "logps/rejected": -1.6855100393295288, "loss": 19835.8234, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0014577357796952128, "rewards/margins": 0.002161522628739476, "rewards/rejected": -0.0036192580591887236, "step": 90 }, { "epoch": 0.2094240837696335, "grad_norm": 43909.03125, "learning_rate": 4.827661805750437e-07, "logits/chosen": -0.6590100526809692, "logits/rejected": -0.6758258938789368, "logps/chosen": -1.3185642957687378, "logps/rejected": -1.6853891611099243, "loss": 19789.7562, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.00188564439304173, "rewards/margins": 0.002156579401344061, "rewards/rejected": -0.0040422240272164345, "step": 100 }, { "epoch": 0.23036649214659685, "grad_norm": 46707.671875, "learning_rate": 4.75468677825789e-07, "logits/chosen": -0.6606122255325317, "logits/rejected": -0.6832340955734253, "logps/chosen": -1.3893611431121826, "logps/rejected": -2.1219935417175293, "loss": 19730.8734, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.00285837659612298, "rewards/margins": 0.004579269327223301, "rewards/rejected": -0.007437646389007568, "step": 110 }, { "epoch": 0.2513089005235602, "grad_norm": 31155.9921875, "learning_rate": 4.669625898336438e-07, "logits/chosen": -0.6470817923545837, "logits/rejected": -0.6482914686203003, "logps/chosen": -1.5591058731079102, "logps/rejected": -1.9671964645385742, "loss": 19737.225, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.004161308519542217, "rewards/margins": 0.00257851486094296, "rewards/rejected": -0.006739823613315821, "step": 120 }, { "epoch": 0.27225130890052357, "grad_norm": 47056.62109375, "learning_rate": 4.5729351198915705e-07, "logits/chosen": -0.633264422416687, "logits/rejected": -0.6238052248954773, "logps/chosen": -1.7964448928833008, "logps/rejected": -2.325413227081299, "loss": 19654.5281, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.006049484945833683, "rewards/margins": 0.0038619127590209246, "rewards/rejected": -0.009911397472023964, "step": 130 }, { "epoch": 0.2931937172774869, "grad_norm": 49205.953125, "learning_rate": 4.4651327368569684e-07, "logits/chosen": -0.6288710832595825, "logits/rejected": -0.6355851888656616, "logps/chosen": -2.379579544067383, "logps/rejected": -2.7832019329071045, "loss": 19769.0641, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.011800874024629593, "rewards/margins": 0.002473782515153289, "rewards/rejected": -0.014274654909968376, "step": 140 }, { "epoch": 0.31413612565445026, "grad_norm": 26854.67578125, "learning_rate": 4.346796604970912e-07, "logits/chosen": -0.6340184807777405, "logits/rejected": -0.632671058177948, "logps/chosen": -2.1093215942382812, "logps/rejected": -3.087489604949951, "loss": 19566.3844, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009258531033992767, "rewards/margins": 0.008069212548434734, "rewards/rejected": -0.017327744513750076, "step": 150 }, { "epoch": 0.33507853403141363, "grad_norm": 57529.83984375, "learning_rate": 4.218561044282098e-07, "logits/chosen": -0.6213893294334412, "logits/rejected": -0.623960554599762, "logps/chosen": -2.0895004272460938, "logps/rejected": -2.9133753776550293, "loss": 19599.4, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009251957759261131, "rewards/margins": 0.006096340250223875, "rewards/rejected": -0.01534829568117857, "step": 160 }, { "epoch": 0.35602094240837695, "grad_norm": 37782.34765625, "learning_rate": 4.081113438988443e-07, "logits/chosen": -0.6065059900283813, "logits/rejected": -0.6017466187477112, "logps/chosen": -1.9792190790176392, "logps/rejected": -2.9278252124786377, "loss": 19481.5719, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.008601201698184013, "rewards/margins": 0.007898198440670967, "rewards/rejected": -0.01649939827620983, "step": 170 }, { "epoch": 0.3769633507853403, "grad_norm": 82435.9609375, "learning_rate": 3.935190552834828e-07, "logits/chosen": -0.5981167554855347, "logits/rejected": -0.6125859022140503, "logps/chosen": -2.3680925369262695, "logps/rejected": -3.2327022552490234, "loss": 19458.8375, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.011699057184159756, "rewards/margins": 0.007529159542173147, "rewards/rejected": -0.019228216260671616, "step": 180 }, { "epoch": 0.39790575916230364, "grad_norm": 63375.296875, "learning_rate": 3.781574579820464e-07, "logits/chosen": -0.6075039505958557, "logits/rejected": -0.6245567798614502, "logps/chosen": -2.516505718231201, "logps/rejected": -3.326948881149292, "loss": 19459.0375, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.013854382559657097, "rewards/margins": 0.006254198960959911, "rewards/rejected": -0.020108580589294434, "step": 190 }, { "epoch": 0.418848167539267, "grad_norm": 48348.94140625, "learning_rate": 3.621088951385353e-07, "logits/chosen": -0.613773763179779, "logits/rejected": -0.6296969652175903, "logps/chosen": -2.6673552989959717, "logps/rejected": -3.9382033348083496, "loss": 19287.1, "rewards/accuracies": 0.609375, "rewards/chosen": -0.015280306339263916, "rewards/margins": 0.010868425481021404, "rewards/rejected": -0.026148730888962746, "step": 200 }, { "epoch": 0.418848167539267, "eval_logits/chosen": -0.6205040216445923, "eval_logits/rejected": -0.6159693598747253, "eval_logps/chosen": -2.9446616172790527, "eval_logps/rejected": -3.935300588607788, "eval_loss": 2426.459228515625, "eval_rewards/accuracies": 0.628000020980835, "eval_rewards/chosen": -0.018280988559126854, "eval_rewards/margins": 0.008097735233604908, "eval_rewards/rejected": -0.026378722861409187, "eval_runtime": 79.1947, "eval_samples_per_second": 25.254, "eval_steps_per_second": 1.578, "step": 200 }, { "epoch": 0.4397905759162304, "grad_norm": 155612.59375, "learning_rate": 3.454593922550693e-07, "logits/chosen": -0.5876352190971375, "logits/rejected": -0.6093825101852417, "logps/chosen": -3.1378333568573, "logps/rejected": -4.437775611877441, "loss": 19445.9656, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.019884048029780388, "rewards/margins": 0.011111686006188393, "rewards/rejected": -0.03099573589861393, "step": 210 }, { "epoch": 0.4607329842931937, "grad_norm": 71028.4765625, "learning_rate": 3.2829819606729477e-07, "logits/chosen": -0.6457465291023254, "logits/rejected": -0.6453076601028442, "logps/chosen": -2.845641613006592, "logps/rejected": -3.9203414916992188, "loss": 19382.9562, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.016577597707509995, "rewards/margins": 0.009232145734131336, "rewards/rejected": -0.025809740647673607, "step": 220 }, { "epoch": 0.4816753926701571, "grad_norm": 115599.875, "learning_rate": 3.1071729615293424e-07, "logits/chosen": -0.6181890964508057, "logits/rejected": -0.6093004941940308, "logps/chosen": -2.9002511501312256, "logps/rejected": -3.8673794269561768, "loss": 19334.4938, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.01705673709511757, "rewards/margins": 0.008125408552587032, "rewards/rejected": -0.025182146579027176, "step": 230 }, { "epoch": 0.5026178010471204, "grad_norm": 211070.734375, "learning_rate": 2.9281093183781403e-07, "logits/chosen": -0.5894284844398499, "logits/rejected": -0.6112298965454102, "logps/chosen": -2.810957431793213, "logps/rejected": -3.9238739013671875, "loss": 19306.5266, "rewards/accuracies": 0.6875, "rewards/chosen": -0.016638856381177902, "rewards/margins": 0.009543554857373238, "rewards/rejected": -0.02618240937590599, "step": 240 }, { "epoch": 0.5235602094240838, "grad_norm": 65152.41796875, "learning_rate": 2.7467508704251135e-07, "logits/chosen": -0.6282380223274231, "logits/rejected": -0.6259430646896362, "logps/chosen": -2.4997780323028564, "logps/rejected": -4.1547417640686035, "loss": 19091.5047, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.013699686154723167, "rewards/margins": 0.013582974672317505, "rewards/rejected": -0.027282658964395523, "step": 250 }, { "epoch": 0.5445026178010471, "grad_norm": 151292.875, "learning_rate": 2.5640697577740815e-07, "logits/chosen": -0.6392425298690796, "logits/rejected": -0.6286466717720032, "logps/chosen": -2.742441415786743, "logps/rejected": -4.296938896179199, "loss": 19164.5906, "rewards/accuracies": 0.65625, "rewards/chosen": -0.015572240576148033, "rewards/margins": 0.014227310195565224, "rewards/rejected": -0.02979954704642296, "step": 260 }, { "epoch": 0.5654450261780105, "grad_norm": 97880.8359375, "learning_rate": 2.381045210440644e-07, "logits/chosen": -0.613749623298645, "logits/rejected": -0.615020215511322, "logps/chosen": -3.2430672645568848, "logps/rejected": -4.990499973297119, "loss": 19050.5672, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02098112925887108, "rewards/margins": 0.015692265704274178, "rewards/rejected": -0.036673396825790405, "step": 270 }, { "epoch": 0.5863874345549738, "grad_norm": 154077.0625, "learning_rate": 2.1986582993616925e-07, "logits/chosen": -0.6264781951904297, "logits/rejected": -0.6161786913871765, "logps/chosen": -3.209202289581299, "logps/rejected": -5.913137912750244, "loss": 18823.7281, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.020146917551755905, "rewards/margins": 0.02518734335899353, "rewards/rejected": -0.045334260910749435, "step": 280 }, { "epoch": 0.6073298429319371, "grad_norm": 200861.8125, "learning_rate": 2.0178866775369774e-07, "logits/chosen": -0.611282229423523, "logits/rejected": -0.5838695168495178, "logps/chosen": -3.689143657684326, "logps/rejected": -5.111196041107178, "loss": 18970.7062, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.025332489982247353, "rewards/margins": 0.012581191956996918, "rewards/rejected": -0.03791367635130882, "step": 290 }, { "epoch": 0.6282722513089005, "grad_norm": 166784.578125, "learning_rate": 1.839699339491937e-07, "logits/chosen": -0.6036723852157593, "logits/rejected": -0.5822815895080566, "logps/chosen": -4.102442264556885, "logps/rejected": -5.181426048278809, "loss": 19116.6344, "rewards/accuracies": 0.625, "rewards/chosen": -0.028961803764104843, "rewards/margins": 0.010249540209770203, "rewards/rejected": -0.039211343973875046, "step": 300 }, { "epoch": 0.6492146596858639, "grad_norm": 61378.703125, "learning_rate": 1.6650514271527465e-07, "logits/chosen": -0.5674837231636047, "logits/rejected": -0.5671803951263428, "logps/chosen": -3.5371127128601074, "logps/rejected": -4.755034923553467, "loss": 19014.2906, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.023985305801033974, "rewards/margins": 0.010895274579524994, "rewards/rejected": -0.034880585968494415, "step": 310 }, { "epoch": 0.6701570680628273, "grad_norm": 69563.6875, "learning_rate": 1.4948791099758052e-07, "logits/chosen": -0.5789169073104858, "logits/rejected": -0.5831333994865417, "logps/chosen": -3.350994825363159, "logps/rejected": -5.1058349609375, "loss": 18803.8, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.02192312851548195, "rewards/margins": 0.016279790550470352, "rewards/rejected": -0.0382029190659523, "step": 320 }, { "epoch": 0.6910994764397905, "grad_norm": 116812.1875, "learning_rate": 1.3300945667758012e-07, "logits/chosen": -0.5629091262817383, "logits/rejected": -0.5619910955429077, "logps/chosen": -3.25724720954895, "logps/rejected": -5.660012245178223, "loss": 18933.75, "rewards/accuracies": 0.671875, "rewards/chosen": -0.02022087574005127, "rewards/margins": 0.022823648527264595, "rewards/rejected": -0.043044526129961014, "step": 330 }, { "epoch": 0.7120418848167539, "grad_norm": 136348.578125, "learning_rate": 1.1715810961514072e-07, "logits/chosen": -0.5339993238449097, "logits/rejected": -0.5259178876876831, "logps/chosen": -3.9198012351989746, "logps/rejected": -5.808166027069092, "loss": 18915.2109, "rewards/accuracies": 0.6875, "rewards/chosen": -0.026918912306427956, "rewards/margins": 0.017473099753260612, "rewards/rejected": -0.04439201205968857, "step": 340 }, { "epoch": 0.7329842931937173, "grad_norm": 125149.328125, "learning_rate": 1.0201883817182949e-07, "logits/chosen": -0.5391268134117126, "logits/rejected": -0.5409469604492188, "logps/chosen": -3.6542420387268066, "logps/rejected": -5.775312423706055, "loss": 18757.6156, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.024714555591344833, "rewards/margins": 0.019216233864426613, "rewards/rejected": -0.043930791318416595, "step": 350 }, { "epoch": 0.7539267015706806, "grad_norm": 223470.640625, "learning_rate": 8.76727937529367e-08, "logits/chosen": -0.49748390913009644, "logits/rejected": -0.5006095767021179, "logps/chosen": -3.7832627296447754, "logps/rejected": -5.971843719482422, "loss": 18893.1125, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.026226039975881577, "rewards/margins": 0.01982385478913784, "rewards/rejected": -0.046049896627664566, "step": 360 }, { "epoch": 0.774869109947644, "grad_norm": 241752.5, "learning_rate": 7.419687580962222e-08, "logits/chosen": -0.542875349521637, "logits/rejected": -0.534200131893158, "logps/chosen": -3.9745116233825684, "logps/rejected": -6.106555461883545, "loss": 18608.9672, "rewards/accuracies": 0.671875, "rewards/chosen": -0.027682432904839516, "rewards/margins": 0.019355323165655136, "rewards/rejected": -0.0470377579331398, "step": 370 }, { "epoch": 0.7958115183246073, "grad_norm": 114267.7890625, "learning_rate": 6.166331963291519e-08, "logits/chosen": -0.4814079701900482, "logits/rejected": -0.481245756149292, "logps/chosen": -3.6965994834899902, "logps/rejected": -6.086516857147217, "loss": 18694.2781, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.02523530088365078, "rewards/margins": 0.022404661402106285, "rewards/rejected": -0.04763996601104736, "step": 380 }, { "epoch": 0.8167539267015707, "grad_norm": 589834.9375, "learning_rate": 5.013930914912476e-08, "logits/chosen": -0.47561925649642944, "logits/rejected": -0.47512689232826233, "logps/chosen": -4.354737281799316, "logps/rejected": -6.447417259216309, "loss": 18879.7594, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.0320982001721859, "rewards/margins": 0.019259590655565262, "rewards/rejected": -0.05135778710246086, "step": 390 }, { "epoch": 0.837696335078534, "grad_norm": 178425.046875, "learning_rate": 3.968661679220467e-08, "logits/chosen": -0.4660119116306305, "logits/rejected": -0.4752522110939026, "logps/chosen": -4.376452922821045, "logps/rejected": -5.880141258239746, "loss": 18745.8219, "rewards/accuracies": 0.640625, "rewards/chosen": -0.032034579664468765, "rewards/margins": 0.014202708378434181, "rewards/rejected": -0.046237289905548096, "step": 400 }, { "epoch": 0.837696335078534, "eval_logits/chosen": -0.48531287908554077, "eval_logits/rejected": -0.483078271150589, "eval_logps/chosen": -4.361822128295898, "eval_logps/rejected": -6.55994176864624, "eval_loss": 2344.3515625, "eval_rewards/accuracies": 0.6880000233650208, "eval_rewards/chosen": -0.032452598214149475, "eval_rewards/margins": 0.020172545686364174, "eval_rewards/rejected": -0.0526251420378685, "eval_runtime": 79.0642, "eval_samples_per_second": 25.296, "eval_steps_per_second": 1.581, "step": 400 }, { "epoch": 0.8586387434554974, "grad_norm": 215014.171875, "learning_rate": 3.036127238347164e-08, "logits/chosen": -0.4601953625679016, "logits/rejected": -0.4607599377632141, "logps/chosen": -4.512561798095703, "logps/rejected": -6.529176235198975, "loss": 18725.1625, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.03354647010564804, "rewards/margins": 0.018701782450079918, "rewards/rejected": -0.05224825069308281, "step": 410 }, { "epoch": 0.8795811518324608, "grad_norm": 102427.7265625, "learning_rate": 2.2213262793589482e-08, "logits/chosen": -0.4713996350765228, "logits/rejected": -0.4640986919403076, "logps/chosen": -4.065049648284912, "logps/rejected": -6.243768215179443, "loss": 18609.1844, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.029407471418380737, "rewards/margins": 0.020186370238661766, "rewards/rejected": -0.049593839794397354, "step": 420 }, { "epoch": 0.900523560209424, "grad_norm": 232973.828125, "learning_rate": 1.5286263996730026e-08, "logits/chosen": -0.4657178819179535, "logits/rejected": -0.4797271192073822, "logps/chosen": -3.879307985305786, "logps/rejected": -7.149705410003662, "loss": 18596.3594, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.02748905122280121, "rewards/margins": 0.029860844835639, "rewards/rejected": -0.05734989792108536, "step": 430 }, { "epoch": 0.9214659685863874, "grad_norm": 75597.1640625, "learning_rate": 9.617406953185136e-09, "logits/chosen": -0.4645307660102844, "logits/rejected": -0.4688239097595215, "logps/chosen": -4.340611457824707, "logps/rejected": -6.387178421020508, "loss": 18715.6813, "rewards/accuracies": 0.640625, "rewards/chosen": -0.032030027359724045, "rewards/margins": 0.018920475617051125, "rewards/rejected": -0.05095050483942032, "step": 440 }, { "epoch": 0.9424083769633508, "grad_norm": 114222.5078125, "learning_rate": 5.2370785753763356e-09, "logits/chosen": -0.4637104570865631, "logits/rejected": -0.46644240617752075, "logps/chosen": -4.310400485992432, "logps/rejected": -6.484769344329834, "loss": 18713.0891, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.03145883232355118, "rewards/margins": 0.020065244287252426, "rewards/rejected": -0.0515240803360939, "step": 450 }, { "epoch": 0.9633507853403142, "grad_norm": 294656.09375, "learning_rate": 2.168758844148272e-09, "logits/chosen": -0.4725240170955658, "logits/rejected": -0.48559778928756714, "logps/chosen": -4.068756103515625, "logps/rejected": -6.292796611785889, "loss": 18791.9016, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.02896204963326454, "rewards/margins": 0.02072622999548912, "rewards/rejected": -0.049688275903463364, "step": 460 }, { "epoch": 0.9842931937172775, "grad_norm": 169383.0625, "learning_rate": 4.288949484559934e-10, "logits/chosen": -0.4306113123893738, "logits/rejected": -0.4224150776863098, "logps/chosen": -4.323697566986084, "logps/rejected": -6.879971981048584, "loss": 18644.775, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.031131744384765625, "rewards/margins": 0.024372201412916183, "rewards/rejected": -0.05550394579768181, "step": 470 }, { "epoch": 0.9989528795811519, "step": 477, "total_flos": 0.0, "train_loss": 19262.642230083857, "train_runtime": 5980.3723, "train_samples_per_second": 10.223, "train_steps_per_second": 0.08 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }