{ "best_metric": 0.675000011920929, "best_model_checkpoint": "./outputs/tinyllama-1.1b-dpo-pku-saferlhf/checkpoint-1200", "epoch": 0.9997600191984641, "eval_steps": 200, "global_step": 2083, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004799616030717543, "grad_norm": 57.0, "learning_rate": 2.3923444976076555e-08, "logits/chosen": -2.688718318939209, "logits/rejected": -2.5538744926452637, "logps/chosen": -212.6398162841797, "logps/rejected": -186.61505126953125, "loss": 0.6966, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.005152043420821428, "rewards/margins": -0.006588623858988285, "rewards/rejected": 0.0014365792740136385, "step": 10 }, { "epoch": 0.009599232061435085, "grad_norm": 59.75, "learning_rate": 4.784688995215311e-08, "logits/chosen": -2.728940486907959, "logits/rejected": -2.616565227508545, "logps/chosen": -223.5636749267578, "logps/rejected": -203.41867065429688, "loss": 0.6899, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.002993463072925806, "rewards/margins": 0.006973244249820709, "rewards/rejected": -0.003979781176894903, "step": 20 }, { "epoch": 0.014398848092152628, "grad_norm": 50.75, "learning_rate": 7.177033492822967e-08, "logits/chosen": -2.716870069503784, "logits/rejected": -2.6400887966156006, "logps/chosen": -237.99618530273438, "logps/rejected": -219.1649627685547, "loss": 0.6938, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0003619740600697696, "rewards/margins": -0.0006011867080815136, "rewards/rejected": 0.00023921244428493083, "step": 30 }, { "epoch": 0.01919846412287017, "grad_norm": 57.25, "learning_rate": 9.569377990430622e-08, "logits/chosen": -2.7444612979888916, "logits/rejected": -2.5919036865234375, "logps/chosen": -251.178466796875, "logps/rejected": -196.35256958007812, "loss": 0.6926, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0019455172587186098, "rewards/margins": 0.001769614638760686, "rewards/rejected": -0.0037151314318180084, "step": 40 }, { "epoch": 0.023998080153587713, "grad_norm": 48.25, "learning_rate": 1.1961722488038278e-07, "logits/chosen": -2.6667444705963135, "logits/rejected": -2.6048452854156494, "logps/chosen": -234.8287353515625, "logps/rejected": -199.68902587890625, "loss": 0.6949, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.0009633477893657982, "rewards/margins": -0.002945653162896633, "rewards/rejected": 0.001982305431738496, "step": 50 }, { "epoch": 0.028797696184305256, "grad_norm": 63.5, "learning_rate": 1.4354066985645933e-07, "logits/chosen": -2.705540180206299, "logits/rejected": -2.599553346633911, "logps/chosen": -223.2880859375, "logps/rejected": -215.03759765625, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.002534763887524605, "rewards/margins": 0.00042680976912379265, "rewards/rejected": -0.002961573889479041, "step": 60 }, { "epoch": 0.033597312215022795, "grad_norm": 51.25, "learning_rate": 1.6746411483253589e-07, "logits/chosen": -2.726954698562622, "logits/rejected": -2.5858073234558105, "logps/chosen": -245.69790649414062, "logps/rejected": -205.90469360351562, "loss": 0.6925, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.001567687257193029, "rewards/margins": 0.0018620832124724984, "rewards/rejected": -0.0034297697711735964, "step": 70 }, { "epoch": 0.03839692824574034, "grad_norm": 52.25, "learning_rate": 1.9138755980861244e-07, "logits/chosen": -2.72399640083313, "logits/rejected": -2.6148579120635986, "logps/chosen": -235.31991577148438, "logps/rejected": -201.12049865722656, "loss": 0.6933, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0001770513626979664, "rewards/margins": 0.0003844931779894978, "rewards/rejected": -0.00020744054927490652, "step": 80 }, { "epoch": 0.04319654427645788, "grad_norm": 53.0, "learning_rate": 2.15311004784689e-07, "logits/chosen": -2.7387874126434326, "logits/rejected": -2.5575790405273438, "logps/chosen": -251.2541961669922, "logps/rejected": -189.22152709960938, "loss": 0.6945, "rewards/accuracies": 0.5, "rewards/chosen": -0.001186860492452979, "rewards/margins": -0.002072554547339678, "rewards/rejected": 0.0008856941130943596, "step": 90 }, { "epoch": 0.04799616030717543, "grad_norm": 50.25, "learning_rate": 2.3923444976076555e-07, "logits/chosen": -2.707674503326416, "logits/rejected": -2.5784828662872314, "logps/chosen": -228.6029510498047, "logps/rejected": -206.3776092529297, "loss": 0.6919, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0017306295922026038, "rewards/margins": 0.0031413964461535215, "rewards/rejected": -0.001410767319612205, "step": 100 }, { "epoch": 0.052795776337892966, "grad_norm": 52.25, "learning_rate": 2.631578947368421e-07, "logits/chosen": -2.7312660217285156, "logits/rejected": -2.5520169734954834, "logps/chosen": -237.2713165283203, "logps/rejected": -178.83563232421875, "loss": 0.6954, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 8.01489659352228e-05, "rewards/margins": -0.004063433036208153, "rewards/rejected": 0.004143581725656986, "step": 110 }, { "epoch": 0.05759539236861051, "grad_norm": 54.75, "learning_rate": 2.8708133971291866e-07, "logits/chosen": -2.7264270782470703, "logits/rejected": -2.602839946746826, "logps/chosen": -234.1689910888672, "logps/rejected": -204.75241088867188, "loss": 0.6945, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0038609784096479416, "rewards/margins": -0.0021540005691349506, "rewards/rejected": -0.0017069776076823473, "step": 120 }, { "epoch": 0.06239500839932805, "grad_norm": 60.25, "learning_rate": 3.110047846889952e-07, "logits/chosen": -2.71527361869812, "logits/rejected": -2.5875349044799805, "logps/chosen": -252.4326934814453, "logps/rejected": -210.2029266357422, "loss": 0.6909, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0007449972326867282, "rewards/margins": 0.005101869348436594, "rewards/rejected": -0.004356871824711561, "step": 130 }, { "epoch": 0.06719462443004559, "grad_norm": 49.75, "learning_rate": 3.3492822966507177e-07, "logits/chosen": -2.6849746704101562, "logits/rejected": -2.6195476055145264, "logps/chosen": -235.63906860351562, "logps/rejected": -218.46603393554688, "loss": 0.6935, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0014313453575596213, "rewards/margins": -0.00015073138638399541, "rewards/rejected": 0.0015820765402168036, "step": 140 }, { "epoch": 0.07199424046076314, "grad_norm": 51.25, "learning_rate": 3.588516746411483e-07, "logits/chosen": -2.702357769012451, "logits/rejected": -2.6205286979675293, "logps/chosen": -232.359619140625, "logps/rejected": -245.7252197265625, "loss": 0.6947, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.002868877723813057, "rewards/margins": -0.0025208499282598495, "rewards/rejected": 0.005389728117734194, "step": 150 }, { "epoch": 0.07679385649148068, "grad_norm": 69.0, "learning_rate": 3.827751196172249e-07, "logits/chosen": -2.6653263568878174, "logits/rejected": -2.5433461666107178, "logps/chosen": -243.0439453125, "logps/rejected": -200.38685607910156, "loss": 0.6959, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.004858389962464571, "rewards/margins": -0.004746051970869303, "rewards/rejected": -0.00011233799159526825, "step": 160 }, { "epoch": 0.08159347252219823, "grad_norm": 54.0, "learning_rate": 4.066985645933014e-07, "logits/chosen": -2.7417194843292236, "logits/rejected": -2.5782225131988525, "logps/chosen": -256.54278564453125, "logps/rejected": -199.0166473388672, "loss": 0.6918, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0022998370695859194, "rewards/margins": 0.003148593008518219, "rewards/rejected": -0.0008487561717629433, "step": 170 }, { "epoch": 0.08639308855291576, "grad_norm": 53.5, "learning_rate": 4.30622009569378e-07, "logits/chosen": -2.7641491889953613, "logits/rejected": -2.6625149250030518, "logps/chosen": -242.5579071044922, "logps/rejected": -201.333251953125, "loss": 0.6908, "rewards/accuracies": 0.5625, "rewards/chosen": -0.000439296942204237, "rewards/margins": 0.00535095389932394, "rewards/rejected": -0.005790251307189465, "step": 180 }, { "epoch": 0.09119270458363331, "grad_norm": 56.0, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.691822052001953, "logits/rejected": -2.587761640548706, "logps/chosen": -240.88916015625, "logps/rejected": -198.99119567871094, "loss": 0.6915, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.002601384650915861, "rewards/margins": 0.003762087319046259, "rewards/rejected": -0.0011607027845457196, "step": 190 }, { "epoch": 0.09599232061435085, "grad_norm": 45.25, "learning_rate": 4.784688995215311e-07, "logits/chosen": -2.7268319129943848, "logits/rejected": -2.6072897911071777, "logps/chosen": -228.60128784179688, "logps/rejected": -190.16201782226562, "loss": 0.6899, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0014531847555190325, "rewards/margins": 0.00695295725017786, "rewards/rejected": -0.005499773193150759, "step": 200 }, { "epoch": 0.09599232061435085, "eval_logits/chosen": -2.7156012058258057, "eval_logits/rejected": -2.595405340194702, "eval_logps/chosen": -233.07533264160156, "eval_logps/rejected": -205.5474853515625, "eval_loss": 0.6897569298744202, "eval_rewards/accuracies": 0.5680000185966492, "eval_rewards/chosen": 0.005765980575233698, "eval_rewards/margins": 0.007436447311192751, "eval_rewards/rejected": -0.0016704658046364784, "eval_runtime": 21.4199, "eval_samples_per_second": 46.686, "eval_steps_per_second": 11.671, "step": 200 }, { "epoch": 0.1007919366450684, "grad_norm": 50.0, "learning_rate": 4.999996487062011e-07, "logits/chosen": -2.6748883724212646, "logits/rejected": -2.5881714820861816, "logps/chosen": -242.6162872314453, "logps/rejected": -214.3368377685547, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.0026280046440660954, "rewards/margins": 0.000607747002504766, "rewards/rejected": 0.002020257292315364, "step": 210 }, { "epoch": 0.10559155267578593, "grad_norm": 49.5, "learning_rate": 4.999574946449064e-07, "logits/chosen": -2.7096612453460693, "logits/rejected": -2.5823137760162354, "logps/chosen": -226.0032196044922, "logps/rejected": -185.87234497070312, "loss": 0.6908, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.007502657826989889, "rewards/margins": 0.005469072610139847, "rewards/rejected": 0.002033584751188755, "step": 220 }, { "epoch": 0.11039116870650348, "grad_norm": 53.25, "learning_rate": 4.998450953980164e-07, "logits/chosen": -2.674795389175415, "logits/rejected": -2.562544345855713, "logps/chosen": -231.25247192382812, "logps/rejected": -223.4167938232422, "loss": 0.6922, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.005688765086233616, "rewards/margins": 0.0028063193894922733, "rewards/rejected": 0.0028824463952332735, "step": 230 }, { "epoch": 0.11519078473722102, "grad_norm": 49.5, "learning_rate": 4.996624825529257e-07, "logits/chosen": -2.752612590789795, "logits/rejected": -2.641317367553711, "logps/chosen": -216.5712432861328, "logps/rejected": -192.3323211669922, "loss": 0.69, "rewards/accuracies": 0.53125, "rewards/chosen": 0.01013671699911356, "rewards/margins": 0.006918230559676886, "rewards/rejected": 0.003218486439436674, "step": 240 }, { "epoch": 0.11999040076793857, "grad_norm": 51.5, "learning_rate": 4.994097074290524e-07, "logits/chosen": -2.7131876945495605, "logits/rejected": -2.591782331466675, "logps/chosen": -228.76925659179688, "logps/rejected": -200.34799194335938, "loss": 0.6902, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00667478796094656, "rewards/margins": 0.00645422050729394, "rewards/rejected": 0.00022056761372368783, "step": 250 }, { "epoch": 0.1247900167986561, "grad_norm": 53.0, "learning_rate": 4.990868410634162e-07, "logits/chosen": -2.7187414169311523, "logits/rejected": -2.6327602863311768, "logps/chosen": -225.66043090820312, "logps/rejected": -192.34942626953125, "loss": 0.6881, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.009334566071629524, "rewards/margins": 0.010765586979687214, "rewards/rejected": -0.0014310205588117242, "step": 260 }, { "epoch": 0.12958963282937366, "grad_norm": 55.0, "learning_rate": 4.986939741906753e-07, "logits/chosen": -2.7310328483581543, "logits/rejected": -2.6244540214538574, "logps/chosen": -214.0838623046875, "logps/rejected": -191.58615112304688, "loss": 0.685, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.012120475992560387, "rewards/margins": 0.016924794763326645, "rewards/rejected": -0.004804318305104971, "step": 270 }, { "epoch": 0.13438924886009118, "grad_norm": 61.75, "learning_rate": 4.982312172176264e-07, "logits/chosen": -2.7920923233032227, "logits/rejected": -2.5907645225524902, "logps/chosen": -273.3122253417969, "logps/rejected": -205.17733764648438, "loss": 0.6891, "rewards/accuracies": 0.5625, "rewards/chosen": 0.02231917902827263, "rewards/margins": 0.008881422691047192, "rewards/rejected": 0.013437752611935139, "step": 280 }, { "epoch": 0.13918886489080873, "grad_norm": 57.25, "learning_rate": 4.976987001921786e-07, "logits/chosen": -2.710538625717163, "logits/rejected": -2.5878021717071533, "logps/chosen": -235.57437133789062, "logps/rejected": -204.8675537109375, "loss": 0.6868, "rewards/accuracies": 0.625, "rewards/chosen": 0.013490339741110802, "rewards/margins": 0.01348956674337387, "rewards/rejected": 7.734633982181549e-07, "step": 290 }, { "epoch": 0.14398848092152627, "grad_norm": 52.75, "learning_rate": 4.97096572766805e-07, "logits/chosen": -2.727212905883789, "logits/rejected": -2.582718849182129, "logps/chosen": -240.856201171875, "logps/rejected": -190.96470642089844, "loss": 0.6864, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.01539058517664671, "rewards/margins": 0.014497148804366589, "rewards/rejected": 0.0008934367215260863, "step": 300 }, { "epoch": 0.14878809695224382, "grad_norm": 49.0, "learning_rate": 4.964250041564868e-07, "logits/chosen": -2.7062602043151855, "logits/rejected": -2.5759198665618896, "logps/chosen": -232.9503173828125, "logps/rejected": -198.68533325195312, "loss": 0.6863, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.010409911163151264, "rewards/margins": 0.01460896898061037, "rewards/rejected": -0.0041990578174591064, "step": 310 }, { "epoch": 0.15358771298296137, "grad_norm": 48.0, "learning_rate": 4.956841830911587e-07, "logits/chosen": -2.688969850540161, "logits/rejected": -2.5635781288146973, "logps/chosen": -244.30337524414062, "logps/rejected": -198.82345581054688, "loss": 0.6871, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.01625187136232853, "rewards/margins": 0.013046267442405224, "rewards/rejected": 0.0032056006602942944, "step": 320 }, { "epoch": 0.1583873290136789, "grad_norm": 56.25, "learning_rate": 4.948743177626708e-07, "logits/chosen": -2.708862543106079, "logits/rejected": -2.5964908599853516, "logps/chosen": -218.1621856689453, "logps/rejected": -196.46609497070312, "loss": 0.691, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.015645433217287064, "rewards/margins": 0.005278537981212139, "rewards/rejected": 0.010366896167397499, "step": 330 }, { "epoch": 0.16318694504439646, "grad_norm": 53.5, "learning_rate": 4.939956357662805e-07, "logits/chosen": -2.664097309112549, "logits/rejected": -2.504223108291626, "logps/chosen": -233.2242431640625, "logps/rejected": -178.89642333984375, "loss": 0.6869, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.010349711403250694, "rewards/margins": 0.013576941564679146, "rewards/rejected": -0.0032272294629365206, "step": 340 }, { "epoch": 0.16798656107511398, "grad_norm": 53.25, "learning_rate": 4.930483840366915e-07, "logits/chosen": -2.6505606174468994, "logits/rejected": -2.508861541748047, "logps/chosen": -253.9485321044922, "logps/rejected": -195.91908264160156, "loss": 0.6851, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.024759415537118912, "rewards/margins": 0.017002228647470474, "rewards/rejected": 0.0077571868896484375, "step": 350 }, { "epoch": 0.17278617710583152, "grad_norm": 46.75, "learning_rate": 4.920328287786586e-07, "logits/chosen": -2.661841869354248, "logits/rejected": -2.5565028190612793, "logps/chosen": -229.78305053710938, "logps/rejected": -194.48904418945312, "loss": 0.6866, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.021085303276777267, "rewards/margins": 0.014185063540935516, "rewards/rejected": 0.006900241132825613, "step": 360 }, { "epoch": 0.17758579313654907, "grad_norm": 45.5, "learning_rate": 4.90949255392176e-07, "logits/chosen": -2.70994234085083, "logits/rejected": -2.547222852706909, "logps/chosen": -244.82565307617188, "logps/rejected": -202.6025390625, "loss": 0.6828, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.022651832550764084, "rewards/margins": 0.021922901272773743, "rewards/rejected": 0.0007289334898814559, "step": 370 }, { "epoch": 0.18238540916726662, "grad_norm": 48.75, "learning_rate": 4.897979683922727e-07, "logits/chosen": -2.733055591583252, "logits/rejected": -2.6267523765563965, "logps/chosen": -218.7659149169922, "logps/rejected": -180.62173461914062, "loss": 0.6845, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.01779359206557274, "rewards/margins": 0.01852073147892952, "rewards/rejected": -0.0007271372596733272, "step": 380 }, { "epoch": 0.18718502519798416, "grad_norm": 48.75, "learning_rate": 4.885792913234339e-07, "logits/chosen": -2.6706607341766357, "logits/rejected": -2.6117610931396484, "logps/chosen": -223.50021362304688, "logps/rejected": -208.219482421875, "loss": 0.6879, "rewards/accuracies": 0.59375, "rewards/chosen": 0.025855297222733498, "rewards/margins": 0.011695639230310917, "rewards/rejected": 0.014159657061100006, "step": 390 }, { "epoch": 0.1919846412287017, "grad_norm": 49.5, "learning_rate": 4.872935666686766e-07, "logits/chosen": -2.6978952884674072, "logits/rejected": -2.5849671363830566, "logps/chosen": -233.3878631591797, "logps/rejected": -212.00723266601562, "loss": 0.6868, "rewards/accuracies": 0.59375, "rewards/chosen": 0.025096680968999863, "rewards/margins": 0.013743218965828419, "rewards/rejected": 0.01135346107184887, "step": 400 }, { "epoch": 0.1919846412287017, "eval_logits/chosen": -2.7157444953918457, "eval_logits/rejected": -2.595482587814331, "eval_logps/chosen": -232.87948608398438, "eval_logps/rejected": -205.45095825195312, "eval_loss": 0.6850579977035522, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": 0.025349698960781097, "eval_rewards/margins": 0.017367491498589516, "eval_rewards/rejected": 0.007982207462191582, "eval_runtime": 21.4159, "eval_samples_per_second": 46.694, "eval_steps_per_second": 11.674, "step": 400 }, { "epoch": 0.19678425725941925, "grad_norm": 50.75, "learning_rate": 4.859411557533018e-07, "logits/chosen": -2.7110087871551514, "logits/rejected": -2.599547863006592, "logps/chosen": -229.89578247070312, "logps/rejected": -196.5421905517578, "loss": 0.685, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.023681003600358963, "rewards/margins": 0.01766652800142765, "rewards/rejected": 0.006014474667608738, "step": 410 }, { "epoch": 0.2015838732901368, "grad_norm": 47.75, "learning_rate": 4.845224386433521e-07, "logits/chosen": -2.6937224864959717, "logits/rejected": -2.616425037384033, "logps/chosen": -207.7985382080078, "logps/rejected": -210.020751953125, "loss": 0.6884, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.02355768159031868, "rewards/margins": 0.01056084968149662, "rewards/rejected": 0.012996832840144634, "step": 420 }, { "epoch": 0.20638348932085432, "grad_norm": 42.75, "learning_rate": 4.830378140388015e-07, "logits/chosen": -2.802743434906006, "logits/rejected": -2.6532254219055176, "logps/chosen": -238.71115112304688, "logps/rejected": -192.9730987548828, "loss": 0.6816, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.025352749973535538, "rewards/margins": 0.02443523518741131, "rewards/rejected": 0.0009175121667794883, "step": 430 }, { "epoch": 0.21118310535157186, "grad_norm": 52.5, "learning_rate": 4.814876991615104e-07, "logits/chosen": -2.682868719100952, "logits/rejected": -2.5881507396698, "logps/chosen": -226.88131713867188, "logps/rejected": -197.5751495361328, "loss": 0.6858, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.024431098252534866, "rewards/margins": 0.016206270083785057, "rewards/rejected": 0.008224830962717533, "step": 440 }, { "epoch": 0.2159827213822894, "grad_norm": 55.0, "learning_rate": 4.798725296379735e-07, "logits/chosen": -2.711108684539795, "logits/rejected": -2.626420497894287, "logps/chosen": -221.24533081054688, "logps/rejected": -193.32937622070312, "loss": 0.6844, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.031023338437080383, "rewards/margins": 0.01875515654683113, "rewards/rejected": 0.012268180958926678, "step": 450 }, { "epoch": 0.22078233741300696, "grad_norm": 48.5, "learning_rate": 4.781927593768969e-07, "logits/chosen": -2.7570741176605225, "logits/rejected": -2.6272220611572266, "logps/chosen": -232.4735107421875, "logps/rejected": -199.38890075683594, "loss": 0.6839, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.026766937226057053, "rewards/margins": 0.019803114235401154, "rewards/rejected": 0.006963823921978474, "step": 460 }, { "epoch": 0.2255819534437245, "grad_norm": 46.5, "learning_rate": 4.764488604416364e-07, "logits/chosen": -2.7485814094543457, "logits/rejected": -2.579071044921875, "logps/chosen": -257.55096435546875, "logps/rejected": -215.48782348632812, "loss": 0.6799, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.03846278041601181, "rewards/margins": 0.02814416028559208, "rewards/rejected": 0.010318620130419731, "step": 470 }, { "epoch": 0.23038156947444205, "grad_norm": 49.5, "learning_rate": 4.7464132291753457e-07, "logits/chosen": -2.693459987640381, "logits/rejected": -2.601459503173828, "logps/chosen": -213.8591766357422, "logps/rejected": -188.70779418945312, "loss": 0.6871, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.02118738368153572, "rewards/margins": 0.013606322929263115, "rewards/rejected": 0.007581062614917755, "step": 480 }, { "epoch": 0.2351811855051596, "grad_norm": 50.0, "learning_rate": 4.7277065477419236e-07, "logits/chosen": -2.6836752891540527, "logits/rejected": -2.5498709678649902, "logps/chosen": -229.8844451904297, "logps/rejected": -181.79513549804688, "loss": 0.68, "rewards/accuracies": 0.625, "rewards/chosen": 0.02958494983613491, "rewards/margins": 0.027864400297403336, "rewards/rejected": 0.0017205558251589537, "step": 490 }, { "epoch": 0.23998080153587714, "grad_norm": 50.0, "learning_rate": 4.7083738172271575e-07, "logits/chosen": -2.6776702404022217, "logits/rejected": -2.5478641986846924, "logps/chosen": -241.678466796875, "logps/rejected": -201.00245666503906, "loss": 0.6835, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.03041999600827694, "rewards/margins": 0.020813625305891037, "rewards/rejected": 0.009606371633708477, "step": 500 }, { "epoch": 0.24478041756659466, "grad_norm": 47.5, "learning_rate": 4.6884204706797537e-07, "logits/chosen": -2.67305850982666, "logits/rejected": -2.5395994186401367, "logps/chosen": -246.15304565429688, "logps/rejected": -191.6173553466797, "loss": 0.6802, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.03808588907122612, "rewards/margins": 0.027713218703866005, "rewards/rejected": 0.010372666642069817, "step": 510 }, { "epoch": 0.2495800335973122, "grad_norm": 56.25, "learning_rate": 4.6678521155592266e-07, "logits/chosen": -2.715430498123169, "logits/rejected": -2.5766196250915527, "logps/chosen": -257.3648681640625, "logps/rejected": -218.2578125, "loss": 0.6819, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03974943235516548, "rewards/margins": 0.0246734581887722, "rewards/rejected": 0.01507597416639328, "step": 520 }, { "epoch": 0.2543796496280298, "grad_norm": 42.75, "learning_rate": 4.646674532160041e-07, "logits/chosen": -2.7444615364074707, "logits/rejected": -2.642268180847168, "logps/chosen": -234.83529663085938, "logps/rejected": -207.768798828125, "loss": 0.6895, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.02697952464222908, "rewards/margins": 0.008664881810545921, "rewards/rejected": 0.018314644694328308, "step": 530 }, { "epoch": 0.2591792656587473, "grad_norm": 52.5, "learning_rate": 4.624893671987185e-07, "logits/chosen": -2.710597515106201, "logits/rejected": -2.618180513381958, "logps/chosen": -220.21383666992188, "logps/rejected": -187.06417846679688, "loss": 0.6782, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.030816808342933655, "rewards/margins": 0.031680621206760406, "rewards/rejected": -0.0008638119325041771, "step": 540 }, { "epoch": 0.2639788816894648, "grad_norm": 51.5, "learning_rate": 4.602515656083629e-07, "logits/chosen": -2.7750511169433594, "logits/rejected": -2.689318895339966, "logps/chosen": -234.1312255859375, "logps/rejected": -218.6779327392578, "loss": 0.682, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.032838549464941025, "rewards/margins": 0.02399415522813797, "rewards/rejected": 0.008844394236803055, "step": 550 }, { "epoch": 0.26877849772018236, "grad_norm": 48.5, "learning_rate": 4.5795467733101356e-07, "logits/chosen": -2.716984510421753, "logits/rejected": -2.536345958709717, "logps/chosen": -238.9961395263672, "logps/rejected": -209.2313690185547, "loss": 0.6839, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.029773468151688576, "rewards/margins": 0.020234117284417152, "rewards/rejected": 0.009539352729916573, "step": 560 }, { "epoch": 0.2735781137508999, "grad_norm": 45.0, "learning_rate": 4.555993478577911e-07, "logits/chosen": -2.7671806812286377, "logits/rejected": -2.5658230781555176, "logps/chosen": -245.57693481445312, "logps/rejected": -186.72381591796875, "loss": 0.6743, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04541153460741043, "rewards/margins": 0.03957425057888031, "rewards/rejected": 0.0058372789062559605, "step": 570 }, { "epoch": 0.27837772978161746, "grad_norm": 46.75, "learning_rate": 4.531862391034591e-07, "logits/chosen": -2.6841483116149902, "logits/rejected": -2.5884292125701904, "logps/chosen": -234.3633270263672, "logps/rejected": -198.0540008544922, "loss": 0.6779, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.03814179450273514, "rewards/margins": 0.03233181685209274, "rewards/rejected": 0.005809984169900417, "step": 580 }, { "epoch": 0.283177345812335, "grad_norm": 48.5, "learning_rate": 4.5071602922040734e-07, "logits/chosen": -2.762327194213867, "logits/rejected": -2.6249194145202637, "logps/chosen": -237.1390380859375, "logps/rejected": -201.12574768066406, "loss": 0.6779, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0438932366669178, "rewards/margins": 0.032640643417835236, "rewards/rejected": 0.011252591386437416, "step": 590 }, { "epoch": 0.28797696184305255, "grad_norm": 51.25, "learning_rate": 4.4818941240807133e-07, "logits/chosen": -2.751591920852661, "logits/rejected": -2.6312174797058105, "logps/chosen": -235.29855346679688, "logps/rejected": -211.7731475830078, "loss": 0.6741, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.043742720037698746, "rewards/margins": 0.040415119379758835, "rewards/rejected": 0.0033275973983108997, "step": 600 }, { "epoch": 0.28797696184305255, "eval_logits/chosen": -2.7159337997436523, "eval_logits/rejected": -2.5955567359924316, "eval_logps/chosen": -232.7647247314453, "eval_logps/rejected": -205.43447875976562, "eval_loss": 0.6805809140205383, "eval_rewards/accuracies": 0.640999972820282, "eval_rewards/chosen": 0.03682754188776016, "eval_rewards/margins": 0.02719729021191597, "eval_rewards/rejected": 0.009630252607166767, "eval_runtime": 21.3954, "eval_samples_per_second": 46.739, "eval_steps_per_second": 11.685, "step": 600 }, { "epoch": 0.2927765778737701, "grad_norm": 46.75, "learning_rate": 4.456070987178426e-07, "logits/chosen": -2.7190473079681396, "logits/rejected": -2.549043893814087, "logps/chosen": -218.71493530273438, "logps/rejected": -175.1071014404297, "loss": 0.6771, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.029196638613939285, "rewards/margins": 0.034055858850479126, "rewards/rejected": -0.004859219305217266, "step": 610 }, { "epoch": 0.29757619390448764, "grad_norm": 43.5, "learning_rate": 4.429698138535241e-07, "logits/chosen": -2.689408779144287, "logits/rejected": -2.5913164615631104, "logps/chosen": -238.9366912841797, "logps/rejected": -217.3396453857422, "loss": 0.6825, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.04081985726952553, "rewards/margins": 0.02339627407491207, "rewards/rejected": 0.01742357760667801, "step": 620 }, { "epoch": 0.3023758099352052, "grad_norm": 45.25, "learning_rate": 4.402782989673867e-07, "logits/chosen": -2.7332329750061035, "logits/rejected": -2.5742244720458984, "logps/chosen": -241.5056610107422, "logps/rejected": -199.6383514404297, "loss": 0.6788, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.044023603200912476, "rewards/margins": 0.03084336593747139, "rewards/rejected": 0.013180236332118511, "step": 630 }, { "epoch": 0.30717542596592273, "grad_norm": 48.5, "learning_rate": 4.3753331045188415e-07, "logits/chosen": -2.651803970336914, "logits/rejected": -2.60718035697937, "logps/chosen": -223.8908233642578, "logps/rejected": -215.56411743164062, "loss": 0.6881, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.027814963832497597, "rewards/margins": 0.012350986711680889, "rewards/rejected": 0.015463980846107006, "step": 640 }, { "epoch": 0.3119750419966403, "grad_norm": 47.75, "learning_rate": 4.3473561972708517e-07, "logits/chosen": -2.7187139987945557, "logits/rejected": -2.5601890087127686, "logps/chosen": -232.9247283935547, "logps/rejected": -204.0401611328125, "loss": 0.6835, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.030253728851675987, "rewards/margins": 0.021437767893075943, "rewards/rejected": 0.008815961889922619, "step": 650 }, { "epoch": 0.3167746580273578, "grad_norm": 45.75, "learning_rate": 4.3188601302388276e-07, "logits/chosen": -2.6520533561706543, "logits/rejected": -2.577056407928467, "logps/chosen": -217.80337524414062, "logps/rejected": -217.4649658203125, "loss": 0.6781, "rewards/accuracies": 0.6875, "rewards/chosen": 0.043406371027231216, "rewards/margins": 0.03220932558178902, "rewards/rejected": 0.011197047308087349, "step": 660 }, { "epoch": 0.32157427405807537, "grad_norm": 45.0, "learning_rate": 4.289852911630406e-07, "logits/chosen": -2.746192455291748, "logits/rejected": -2.5919899940490723, "logps/chosen": -261.55328369140625, "logps/rejected": -208.4563446044922, "loss": 0.6746, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.05559698864817619, "rewards/margins": 0.039354514330625534, "rewards/rejected": 0.016242478042840958, "step": 670 }, { "epoch": 0.3263738900887929, "grad_norm": 56.5, "learning_rate": 4.2603426933013955e-07, "logits/chosen": -2.7136101722717285, "logits/rejected": -2.5737733840942383, "logps/chosen": -235.10751342773438, "logps/rejected": -190.7623291015625, "loss": 0.6826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03037920594215393, "rewards/margins": 0.02374129556119442, "rewards/rejected": 0.0066379099152982235, "step": 680 }, { "epoch": 0.33117350611951046, "grad_norm": 49.75, "learning_rate": 4.2303377684648734e-07, "logits/chosen": -2.693387746810913, "logits/rejected": -2.6150875091552734, "logps/chosen": -229.60073852539062, "logps/rejected": -227.20849609375, "loss": 0.6781, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.047377780079841614, "rewards/margins": 0.03262994438409805, "rewards/rejected": 0.014747830107808113, "step": 690 }, { "epoch": 0.33597312215022795, "grad_norm": 58.25, "learning_rate": 4.199846569360557e-07, "logits/chosen": -2.7111198902130127, "logits/rejected": -2.6025779247283936, "logps/chosen": -236.203857421875, "logps/rejected": -210.2013702392578, "loss": 0.6878, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.03395792096853256, "rewards/margins": 0.013354765251278877, "rewards/rejected": 0.020603153854608536, "step": 700 }, { "epoch": 0.3407727381809455, "grad_norm": 51.0, "learning_rate": 4.1688776648851034e-07, "logits/chosen": -2.7076306343078613, "logits/rejected": -2.5523858070373535, "logps/chosen": -228.8372039794922, "logps/rejected": -181.23193359375, "loss": 0.6765, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03608817234635353, "rewards/margins": 0.03536154329776764, "rewards/rejected": 0.0007266284665092826, "step": 710 }, { "epoch": 0.34557235421166305, "grad_norm": 49.25, "learning_rate": 4.1374397581840034e-07, "logits/chosen": -2.7360334396362305, "logits/rejected": -2.5981593132019043, "logps/chosen": -227.6597137451172, "logps/rejected": -183.78594970703125, "loss": 0.676, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.040428828448057175, "rewards/margins": 0.03674127534031868, "rewards/rejected": 0.0036875568330287933, "step": 720 }, { "epoch": 0.3503719702423806, "grad_norm": 53.5, "learning_rate": 4.105541684205751e-07, "logits/chosen": -2.6906344890594482, "logits/rejected": -2.5774295330047607, "logps/chosen": -218.3501434326172, "logps/rejected": -195.3230743408203, "loss": 0.6776, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.037394892424345016, "rewards/margins": 0.03364209085702896, "rewards/rejected": 0.0037527973763644695, "step": 730 }, { "epoch": 0.35517158627309814, "grad_norm": 49.25, "learning_rate": 4.073192407218971e-07, "logits/chosen": -2.7351787090301514, "logits/rejected": -2.5966103076934814, "logps/chosen": -241.0888214111328, "logps/rejected": -191.10189819335938, "loss": 0.6704, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0579073540866375, "rewards/margins": 0.04853241890668869, "rewards/rejected": 0.009374936111271381, "step": 740 }, { "epoch": 0.3599712023038157, "grad_norm": 50.5, "learning_rate": 4.040401018293204e-07, "logits/chosen": -2.664130687713623, "logits/rejected": -2.594024181365967, "logps/chosen": -221.1615753173828, "logps/rejected": -228.0982666015625, "loss": 0.6793, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0396769680082798, "rewards/margins": 0.03049297071993351, "rewards/rejected": 0.009183998219668865, "step": 750 }, { "epoch": 0.36477081833453323, "grad_norm": 52.25, "learning_rate": 4.0071767327440536e-07, "logits/chosen": -2.6687798500061035, "logits/rejected": -2.6370534896850586, "logps/chosen": -235.759521484375, "logps/rejected": -229.422119140625, "loss": 0.6776, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.05410134792327881, "rewards/margins": 0.034135472029447556, "rewards/rejected": 0.019965868443250656, "step": 760 }, { "epoch": 0.3695704343652508, "grad_norm": 47.25, "learning_rate": 3.9735288875434254e-07, "logits/chosen": -2.741582155227661, "logits/rejected": -2.554959774017334, "logps/chosen": -242.64480590820312, "logps/rejected": -185.625244140625, "loss": 0.6808, "rewards/accuracies": 0.625, "rewards/chosen": 0.034398965537548065, "rewards/margins": 0.02690746821463108, "rewards/rejected": 0.007491500116884708, "step": 770 }, { "epoch": 0.3743700503959683, "grad_norm": 55.0, "learning_rate": 3.939466938695565e-07, "logits/chosen": -2.660132884979248, "logits/rejected": -2.5671591758728027, "logps/chosen": -253.7039337158203, "logps/rejected": -221.8881072998047, "loss": 0.6792, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.04830535501241684, "rewards/margins": 0.031025772914290428, "rewards/rejected": 0.01727958396077156, "step": 780 }, { "epoch": 0.37916966642668587, "grad_norm": 46.25, "learning_rate": 3.905000458579657e-07, "logits/chosen": -2.672783851623535, "logits/rejected": -2.598494291305542, "logps/chosen": -210.40975952148438, "logps/rejected": -224.6177978515625, "loss": 0.6821, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.03782086446881294, "rewards/margins": 0.02480388432741165, "rewards/rejected": 0.01301698386669159, "step": 790 }, { "epoch": 0.3839692824574034, "grad_norm": 50.75, "learning_rate": 3.870139133259709e-07, "logits/chosen": -2.6891722679138184, "logits/rejected": -2.5445141792297363, "logps/chosen": -259.342529296875, "logps/rejected": -206.988525390625, "loss": 0.6767, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.05699441581964493, "rewards/margins": 0.03583725541830063, "rewards/rejected": 0.0211571604013443, "step": 800 }, { "epoch": 0.3839692824574034, "eval_logits/chosen": -2.7158119678497314, "eval_logits/rejected": -2.595388650894165, "eval_logps/chosen": -232.6337127685547, "eval_logps/rejected": -205.41590881347656, "eval_loss": 0.6753210425376892, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.04992655664682388, "eval_rewards/margins": 0.03843830153346062, "eval_rewards/rejected": 0.01148825604468584, "eval_runtime": 21.4126, "eval_samples_per_second": 46.702, "eval_steps_per_second": 11.675, "step": 800 }, { "epoch": 0.38876889848812096, "grad_norm": 55.5, "learning_rate": 3.8348927597624964e-07, "logits/chosen": -2.740044116973877, "logits/rejected": -2.6301164627075195, "logps/chosen": -231.58377075195312, "logps/rejected": -210.11886596679688, "loss": 0.6786, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.052413731813430786, "rewards/margins": 0.031818680465221405, "rewards/rejected": 0.020595049485564232, "step": 810 }, { "epoch": 0.3935685145188385, "grad_norm": 49.25, "learning_rate": 3.7992712433243114e-07, "logits/chosen": -2.717849016189575, "logits/rejected": -2.5538547039031982, "logps/chosen": -233.3022003173828, "logps/rejected": -178.08187866210938, "loss": 0.6776, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.034468021243810654, "rewards/margins": 0.03307095915079117, "rewards/rejected": 0.001397057669237256, "step": 820 }, { "epoch": 0.39836813054955605, "grad_norm": 47.5, "learning_rate": 3.7632845946073135e-07, "logits/chosen": -2.7453646659851074, "logits/rejected": -2.5826191902160645, "logps/chosen": -246.8603515625, "logps/rejected": -179.30404663085938, "loss": 0.6707, "rewards/accuracies": 0.6875, "rewards/chosen": 0.045634619891643524, "rewards/margins": 0.047946564853191376, "rewards/rejected": -0.002311945194378495, "step": 830 }, { "epoch": 0.4031677465802736, "grad_norm": 48.0, "learning_rate": 3.7269429268862507e-07, "logits/chosen": -2.710023880004883, "logits/rejected": -2.6359734535217285, "logps/chosen": -208.82150268554688, "logps/rejected": -196.3325653076172, "loss": 0.6799, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.04252176731824875, "rewards/margins": 0.02919645607471466, "rewards/rejected": 0.01332530565559864, "step": 840 }, { "epoch": 0.40796736261099115, "grad_norm": 57.0, "learning_rate": 3.6902564532063336e-07, "logits/chosen": -2.7001442909240723, "logits/rejected": -2.6420705318450928, "logps/chosen": -214.80667114257812, "logps/rejected": -198.60533142089844, "loss": 0.6792, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.04096178710460663, "rewards/margins": 0.03064887225627899, "rewards/rejected": 0.010312914848327637, "step": 850 }, { "epoch": 0.41276697864170864, "grad_norm": 50.0, "learning_rate": 3.653235483513084e-07, "logits/chosen": -2.736861228942871, "logits/rejected": -2.618833541870117, "logps/chosen": -248.7901153564453, "logps/rejected": -216.1083526611328, "loss": 0.6743, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.056495171040296555, "rewards/margins": 0.041087765246629715, "rewards/rejected": 0.015407413244247437, "step": 860 }, { "epoch": 0.4175665946724262, "grad_norm": 48.0, "learning_rate": 3.615890421754944e-07, "logits/chosen": -2.724944591522217, "logits/rejected": -2.6574723720550537, "logps/chosen": -223.84408569335938, "logps/rejected": -194.44735717773438, "loss": 0.6796, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.042622704058885574, "rewards/margins": 0.02971811592578888, "rewards/rejected": 0.012904593721032143, "step": 870 }, { "epoch": 0.42236621070314373, "grad_norm": 61.0, "learning_rate": 3.5782317629594706e-07, "logits/chosen": -2.706808567047119, "logits/rejected": -2.6081411838531494, "logps/chosen": -241.17495727539062, "logps/rejected": -212.4481658935547, "loss": 0.6819, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.047704242169857025, "rewards/margins": 0.025270383805036545, "rewards/rejected": 0.022433852776885033, "step": 880 }, { "epoch": 0.4271658267338613, "grad_norm": 47.5, "learning_rate": 3.5402700902839313e-07, "logits/chosen": -2.6064233779907227, "logits/rejected": -2.556283473968506, "logps/chosen": -206.00344848632812, "logps/rejected": -208.06936645507812, "loss": 0.6791, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.036233410239219666, "rewards/margins": 0.030693132430315018, "rewards/rejected": 0.005540275014936924, "step": 890 }, { "epoch": 0.4319654427645788, "grad_norm": 46.75, "learning_rate": 3.5020160720411403e-07, "logits/chosen": -2.722177267074585, "logits/rejected": -2.592517137527466, "logps/chosen": -234.11703491210938, "logps/rejected": -216.63766479492188, "loss": 0.6759, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.05363321304321289, "rewards/margins": 0.03814633563160896, "rewards/rejected": 0.015486878342926502, "step": 900 }, { "epoch": 0.43676505879529637, "grad_norm": 52.75, "learning_rate": 3.46348045870135e-07, "logits/chosen": -2.6586787700653076, "logits/rejected": -2.601860523223877, "logps/chosen": -215.1962432861328, "logps/rejected": -208.45068359375, "loss": 0.6834, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.04313874989748001, "rewards/margins": 0.022245222702622414, "rewards/rejected": 0.020893529057502747, "step": 910 }, { "epoch": 0.4415646748260139, "grad_norm": 45.75, "learning_rate": 3.4246740798710725e-07, "logits/chosen": -2.672468900680542, "logits/rejected": -2.5783610343933105, "logps/chosen": -216.0347442626953, "logps/rejected": -195.738037109375, "loss": 0.6819, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03864717856049538, "rewards/margins": 0.02574675716459751, "rewards/rejected": 0.01290042232722044, "step": 920 }, { "epoch": 0.44636429085673146, "grad_norm": 50.25, "learning_rate": 3.3856078412496417e-07, "logits/chosen": -2.729473829269409, "logits/rejected": -2.588343858718872, "logps/chosen": -238.8496551513672, "logps/rejected": -190.5778045654297, "loss": 0.6685, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.048866622149944305, "rewards/margins": 0.05237164348363876, "rewards/rejected": -0.003505019936710596, "step": 930 }, { "epoch": 0.451163906887449, "grad_norm": 49.5, "learning_rate": 3.3462927215644066e-07, "logits/chosen": -2.747483968734741, "logits/rejected": -2.640693187713623, "logps/chosen": -262.37213134765625, "logps/rejected": -216.82742309570312, "loss": 0.6748, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.060678768903017044, "rewards/margins": 0.04027427360415459, "rewards/rejected": 0.020404506474733353, "step": 940 }, { "epoch": 0.45596352291816655, "grad_norm": 42.75, "learning_rate": 3.3067397694853937e-07, "logits/chosen": -2.6840896606445312, "logits/rejected": -2.554112434387207, "logps/chosen": -233.49899291992188, "logps/rejected": -191.6746063232422, "loss": 0.6713, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.051722604781389236, "rewards/margins": 0.04748953878879547, "rewards/rejected": 0.0042330720461905, "step": 950 }, { "epoch": 0.4607631389488841, "grad_norm": 46.0, "learning_rate": 3.2669601005203155e-07, "logits/chosen": -2.717355251312256, "logits/rejected": -2.615908145904541, "logps/chosen": -204.6543426513672, "logps/rejected": -185.7799072265625, "loss": 0.6734, "rewards/accuracies": 0.65625, "rewards/chosen": 0.04618459939956665, "rewards/margins": 0.042539265006780624, "rewards/rejected": 0.003645337652415037, "step": 960 }, { "epoch": 0.46556275497960165, "grad_norm": 48.5, "learning_rate": 3.2269648938907973e-07, "logits/chosen": -2.6776490211486816, "logits/rejected": -2.560394287109375, "logps/chosen": -214.96142578125, "logps/rejected": -182.30978393554688, "loss": 0.6781, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0470888614654541, "rewards/margins": 0.033395569771528244, "rewards/rejected": 0.013693295419216156, "step": 970 }, { "epoch": 0.4703623710103192, "grad_norm": 49.0, "learning_rate": 3.186765389390695e-07, "logits/chosen": -2.7659125328063965, "logits/rejected": -2.6198360919952393, "logps/chosen": -251.896240234375, "logps/rejected": -194.74826049804688, "loss": 0.6773, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.043909598141908646, "rewards/margins": 0.03449582681059837, "rewards/rejected": 0.009413773193955421, "step": 980 }, { "epoch": 0.47516198704103674, "grad_norm": 50.25, "learning_rate": 3.146372884227393e-07, "logits/chosen": -2.7383854389190674, "logits/rejected": -2.633877992630005, "logps/chosen": -249.55557250976562, "logps/rejected": -215.5314178466797, "loss": 0.6781, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.05108872801065445, "rewards/margins": 0.03366169333457947, "rewards/rejected": 0.01742703653872013, "step": 990 }, { "epoch": 0.4799616030717543, "grad_norm": 45.75, "learning_rate": 3.105798729846969e-07, "logits/chosen": -2.6620967388153076, "logits/rejected": -2.5416641235351562, "logps/chosen": -214.88015747070312, "logps/rejected": -182.47698974609375, "loss": 0.676, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.04558812081813812, "rewards/margins": 0.0367765799164772, "rewards/rejected": 0.00881153903901577, "step": 1000 }, { "epoch": 0.4799616030717543, "eval_logits/chosen": -2.716266393661499, "eval_logits/rejected": -2.595984935760498, "eval_logps/chosen": -232.63925170898438, "eval_logps/rejected": -205.39816284179688, "eval_loss": 0.6766188740730286, "eval_rewards/accuracies": 0.6570000052452087, "eval_rewards/chosen": 0.04937145859003067, "eval_rewards/margins": 0.03610716760158539, "eval_rewards/rejected": 0.013264299370348454, "eval_runtime": 21.4065, "eval_samples_per_second": 46.715, "eval_steps_per_second": 11.679, "step": 1000 }, { "epoch": 0.48476121910247183, "grad_norm": 47.0, "learning_rate": 3.065054328744109e-07, "logits/chosen": -2.6782500743865967, "logits/rejected": -2.5327606201171875, "logps/chosen": -249.7314910888672, "logps/rejected": -209.0973663330078, "loss": 0.6751, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.05049954727292061, "rewards/margins": 0.03927897661924362, "rewards/rejected": 0.011220571584999561, "step": 1010 }, { "epoch": 0.4895608351331893, "grad_norm": 51.75, "learning_rate": 3.024151131257687e-07, "logits/chosen": -2.7015366554260254, "logits/rejected": -2.5806756019592285, "logps/chosen": -245.3987274169922, "logps/rejected": -191.32785034179688, "loss": 0.6728, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.04835500195622444, "rewards/margins": 0.04381892830133438, "rewards/rejected": 0.00453607365489006, "step": 1020 }, { "epoch": 0.49436045116390687, "grad_norm": 50.75, "learning_rate": 2.9831006323528886e-07, "logits/chosen": -2.7741270065307617, "logits/rejected": -2.5906481742858887, "logps/chosen": -254.17239379882812, "logps/rejected": -197.8710479736328, "loss": 0.673, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.059526920318603516, "rewards/margins": 0.04361771419644356, "rewards/rejected": 0.01590920425951481, "step": 1030 }, { "epoch": 0.4991600671946244, "grad_norm": 48.0, "learning_rate": 2.941914368390798e-07, "logits/chosen": -2.692235231399536, "logits/rejected": -2.610217332839966, "logps/chosen": -218.3246307373047, "logps/rejected": -205.70993041992188, "loss": 0.6816, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.032597918063402176, "rewards/margins": 0.025554979220032692, "rewards/rejected": 0.007042936980724335, "step": 1040 }, { "epoch": 0.503959683225342, "grad_norm": 56.75, "learning_rate": 2.900603913886357e-07, "logits/chosen": -2.672635555267334, "logits/rejected": -2.5501255989074707, "logps/chosen": -244.4874267578125, "logps/rejected": -211.55068969726562, "loss": 0.6746, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.04713388532400131, "rewards/margins": 0.04026245325803757, "rewards/rejected": 0.006871436722576618, "step": 1050 }, { "epoch": 0.5087592992560596, "grad_norm": 49.25, "learning_rate": 2.859180878255588e-07, "logits/chosen": -2.682440996170044, "logits/rejected": -2.611323833465576, "logps/chosen": -232.0714874267578, "logps/rejected": -215.4655303955078, "loss": 0.6815, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.04437769576907158, "rewards/margins": 0.02631448581814766, "rewards/rejected": 0.018063215538859367, "step": 1060 }, { "epoch": 0.5135589152867771, "grad_norm": 50.25, "learning_rate": 2.8176569025530234e-07, "logits/chosen": -2.7059943675994873, "logits/rejected": -2.602865219116211, "logps/chosen": -232.48147583007812, "logps/rejected": -204.5135040283203, "loss": 0.6773, "rewards/accuracies": 0.625, "rewards/chosen": 0.05438702180981636, "rewards/margins": 0.034470170736312866, "rewards/rejected": 0.019916851073503494, "step": 1070 }, { "epoch": 0.5183585313174947, "grad_norm": 50.25, "learning_rate": 2.7760436562002346e-07, "logits/chosen": -2.6945126056671143, "logits/rejected": -2.516050338745117, "logps/chosen": -265.76055908203125, "logps/rejected": -181.6714324951172, "loss": 0.6745, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.05421316623687744, "rewards/margins": 0.04068455100059509, "rewards/rejected": 0.0135286133736372, "step": 1080 }, { "epoch": 0.5231581473482121, "grad_norm": 53.5, "learning_rate": 2.734352833706392e-07, "logits/chosen": -2.7844748497009277, "logits/rejected": -2.654388904571533, "logps/chosen": -249.6466827392578, "logps/rejected": -214.0853271484375, "loss": 0.6748, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.06085364893078804, "rewards/margins": 0.04021080583333969, "rewards/rejected": 0.020642835646867752, "step": 1090 }, { "epoch": 0.5279577633789296, "grad_norm": 47.75, "learning_rate": 2.6925961513817733e-07, "logits/chosen": -2.6918578147888184, "logits/rejected": -2.627488374710083, "logps/chosen": -199.40310668945312, "logps/rejected": -200.98233032226562, "loss": 0.676, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.043318361043930054, "rewards/margins": 0.03646283596754074, "rewards/rejected": 0.006855523679405451, "step": 1100 }, { "epoch": 0.5327573794096472, "grad_norm": 50.0, "learning_rate": 2.6507853440451484e-07, "logits/chosen": -2.7055044174194336, "logits/rejected": -2.61013126373291, "logps/chosen": -227.26321411132812, "logps/rejected": -205.27490234375, "loss": 0.6744, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.04602568596601486, "rewards/margins": 0.04149339720606804, "rewards/rejected": 0.00453228922560811, "step": 1110 }, { "epoch": 0.5375569954403647, "grad_norm": 42.0, "learning_rate": 2.608932161725958e-07, "logits/chosen": -2.7049965858459473, "logits/rejected": -2.570584774017334, "logps/chosen": -232.04818725585938, "logps/rejected": -203.71127319335938, "loss": 0.6741, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.04708060622215271, "rewards/margins": 0.04201260581612587, "rewards/rejected": 0.005067999474704266, "step": 1120 }, { "epoch": 0.5423566114710823, "grad_norm": 56.5, "learning_rate": 2.5670483663622247e-07, "logits/chosen": -2.6920132637023926, "logits/rejected": -2.583217144012451, "logps/chosen": -239.03427124023438, "logps/rejected": -200.62376403808594, "loss": 0.6751, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.04974411427974701, "rewards/margins": 0.039146848022937775, "rewards/rejected": 0.010597268119454384, "step": 1130 }, { "epoch": 0.5471562275017998, "grad_norm": 43.5, "learning_rate": 2.5251457284951056e-07, "logits/chosen": -2.709200859069824, "logits/rejected": -2.6131153106689453, "logps/chosen": -227.12826538085938, "logps/rejected": -191.06222534179688, "loss": 0.6735, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.051633380353450775, "rewards/margins": 0.04308091849088669, "rewards/rejected": 0.008552461862564087, "step": 1140 }, { "epoch": 0.5519558435325174, "grad_norm": 45.5, "learning_rate": 2.4832360239610414e-07, "logits/chosen": -2.702821731567383, "logits/rejected": -2.5874671936035156, "logps/chosen": -228.0370635986328, "logps/rejected": -201.68345642089844, "loss": 0.6721, "rewards/accuracies": 0.65625, "rewards/chosen": 0.052544206380844116, "rewards/margins": 0.04553115367889404, "rewards/rejected": 0.007013053633272648, "step": 1150 }, { "epoch": 0.5567554595632349, "grad_norm": 52.25, "learning_rate": 2.441331030582407e-07, "logits/chosen": -2.720001697540283, "logits/rejected": -2.630744457244873, "logps/chosen": -223.72116088867188, "logps/rejected": -205.20474243164062, "loss": 0.6789, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05077257752418518, "rewards/margins": 0.030812978744506836, "rewards/rejected": 0.019959593191742897, "step": 1160 }, { "epoch": 0.5615550755939525, "grad_norm": 48.5, "learning_rate": 2.39944252485761e-07, "logits/chosen": -2.7418465614318848, "logits/rejected": -2.5958893299102783, "logps/chosen": -245.22238159179688, "logps/rejected": -192.0289764404297, "loss": 0.6723, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.05651768296957016, "rewards/margins": 0.04533126950263977, "rewards/rejected": 0.011186418123543262, "step": 1170 }, { "epoch": 0.56635469162467, "grad_norm": 53.25, "learning_rate": 2.3575822786515529e-07, "logits/chosen": -2.6802361011505127, "logits/rejected": -2.5656845569610596, "logps/chosen": -231.7133026123047, "logps/rejected": -209.87765502929688, "loss": 0.6792, "rewards/accuracies": 0.625, "rewards/chosen": 0.04587788134813309, "rewards/margins": 0.03062388300895691, "rewards/rejected": 0.015253995545208454, "step": 1180 }, { "epoch": 0.5711543076553875, "grad_norm": 54.0, "learning_rate": 2.3157620558874106e-07, "logits/chosen": -2.723170757293701, "logits/rejected": -2.5895724296569824, "logps/chosen": -242.99923706054688, "logps/rejected": -193.60450744628906, "loss": 0.6776, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.052822746336460114, "rewards/margins": 0.03466617316007614, "rewards/rejected": 0.018156569451093674, "step": 1190 }, { "epoch": 0.5759539236861051, "grad_norm": 42.75, "learning_rate": 2.2739936092406286e-07, "logits/chosen": -2.674161434173584, "logits/rejected": -2.576936721801758, "logps/chosen": -226.0552215576172, "logps/rejected": -213.71524047851562, "loss": 0.6774, "rewards/accuracies": 0.65625, "rewards/chosen": 0.05344771221280098, "rewards/margins": 0.034632958471775055, "rewards/rejected": 0.018814753741025925, "step": 1200 }, { "epoch": 0.5759539236861051, "eval_logits/chosen": -2.71634578704834, "eval_logits/rejected": -2.595935583114624, "eval_logps/chosen": -232.62538146972656, "eval_logps/rejected": -205.43319702148438, "eval_loss": 0.6742354035377502, "eval_rewards/accuracies": 0.675000011920929, "eval_rewards/chosen": 0.050759363919496536, "eval_rewards/margins": 0.041003111749887466, "eval_rewards/rejected": 0.009756244719028473, "eval_runtime": 21.4174, "eval_samples_per_second": 46.691, "eval_steps_per_second": 11.673, "step": 1200 }, { "epoch": 0.5807535397168226, "grad_norm": 48.25, "learning_rate": 2.232288676836087e-07, "logits/chosen": -2.617983341217041, "logits/rejected": -2.5485970973968506, "logps/chosen": -238.41134643554688, "logps/rejected": -203.52255249023438, "loss": 0.6683, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.06037604808807373, "rewards/margins": 0.05402814596891403, "rewards/rejected": 0.006347896996885538, "step": 1210 }, { "epoch": 0.5855531557475402, "grad_norm": 50.25, "learning_rate": 2.1906589789493518e-07, "logits/chosen": -2.710653066635132, "logits/rejected": -2.5681469440460205, "logps/chosen": -217.65231323242188, "logps/rejected": -183.86453247070312, "loss": 0.6765, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.04624713212251663, "rewards/margins": 0.036256637424230576, "rewards/rejected": 0.009990494698286057, "step": 1220 }, { "epoch": 0.5903527717782577, "grad_norm": 50.25, "learning_rate": 2.1491162147129428e-07, "logits/chosen": -2.71733021736145, "logits/rejected": -2.6050782203674316, "logps/chosen": -232.63601684570312, "logps/rejected": -208.39132690429688, "loss": 0.6763, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.051035962998867035, "rewards/margins": 0.036614201962947845, "rewards/rejected": 0.014421762898564339, "step": 1230 }, { "epoch": 0.5951523878089753, "grad_norm": 48.0, "learning_rate": 2.107672058828544e-07, "logits/chosen": -2.722160816192627, "logits/rejected": -2.608168363571167, "logps/chosen": -226.43807983398438, "logps/rejected": -192.34970092773438, "loss": 0.6717, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.05640612170100212, "rewards/margins": 0.0462309755384922, "rewards/rejected": 0.010175148025155067, "step": 1240 }, { "epoch": 0.5999520038396928, "grad_norm": 54.5, "learning_rate": 2.0663381582860825e-07, "logits/chosen": -2.7216320037841797, "logits/rejected": -2.643075942993164, "logps/chosen": -226.637451171875, "logps/rejected": -209.5499725341797, "loss": 0.6759, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.046819452196359634, "rewards/margins": 0.03698267415165901, "rewards/rejected": 0.00983678363263607, "step": 1250 }, { "epoch": 0.6047516198704104, "grad_norm": 45.25, "learning_rate": 2.025126129090588e-07, "logits/chosen": -2.776801586151123, "logits/rejected": -2.626488447189331, "logps/chosen": -221.3492431640625, "logps/rejected": -179.89920043945312, "loss": 0.6709, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.051989030092954636, "rewards/margins": 0.04751256853342056, "rewards/rejected": 0.004476464353501797, "step": 1260 }, { "epoch": 0.6095512359011279, "grad_norm": 43.5, "learning_rate": 1.9840475529977655e-07, "logits/chosen": -2.71726655960083, "logits/rejected": -2.6046361923217773, "logps/chosen": -227.9778594970703, "logps/rejected": -195.9775390625, "loss": 0.6739, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.044441260397434235, "rewards/margins": 0.04231487214565277, "rewards/rejected": 0.0021263775415718555, "step": 1270 }, { "epoch": 0.6143508519318455, "grad_norm": 49.75, "learning_rate": 1.9431139742591896e-07, "logits/chosen": -2.7021281719207764, "logits/rejected": -2.58604097366333, "logps/chosen": -207.78173828125, "logps/rejected": -187.71017456054688, "loss": 0.6787, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.03924870118498802, "rewards/margins": 0.031981147825717926, "rewards/rejected": 0.0072675542905926704, "step": 1280 }, { "epoch": 0.619150467962563, "grad_norm": 52.0, "learning_rate": 1.9023368963780455e-07, "logits/chosen": -2.721538543701172, "logits/rejected": -2.6105265617370605, "logps/chosen": -232.59326171875, "logps/rejected": -196.3831024169922, "loss": 0.6749, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.04439551383256912, "rewards/margins": 0.03949584811925888, "rewards/rejected": 0.004899662919342518, "step": 1290 }, { "epoch": 0.6239500839932806, "grad_norm": 47.25, "learning_rate": 1.861727778876314e-07, "logits/chosen": -2.7027573585510254, "logits/rejected": -2.5897445678710938, "logps/chosen": -207.4355926513672, "logps/rejected": -173.4372100830078, "loss": 0.6804, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.030175339430570602, "rewards/margins": 0.028225919231772423, "rewards/rejected": 0.001949421362951398, "step": 1300 }, { "epoch": 0.6287497000239981, "grad_norm": 46.5, "learning_rate": 1.821298034074315e-07, "logits/chosen": -2.7313363552093506, "logits/rejected": -2.6595630645751953, "logps/chosen": -222.908447265625, "logps/rejected": -203.55274963378906, "loss": 0.685, "rewards/accuracies": 0.5625, "rewards/chosen": 0.03836590051651001, "rewards/margins": 0.019602758809924126, "rewards/rejected": 0.018763139843940735, "step": 1310 }, { "epoch": 0.6335493160547156, "grad_norm": 48.5, "learning_rate": 1.7810590238835276e-07, "logits/chosen": -2.6614937782287598, "logits/rejected": -2.6302168369293213, "logps/chosen": -224.0082244873047, "logps/rejected": -239.5669708251953, "loss": 0.6785, "rewards/accuracies": 0.59375, "rewards/chosen": 0.049693018198013306, "rewards/margins": 0.03309093788266182, "rewards/rejected": 0.016602078452706337, "step": 1320 }, { "epoch": 0.6383489320854332, "grad_norm": 48.75, "learning_rate": 1.7410220566135603e-07, "logits/chosen": -2.733497142791748, "logits/rejected": -2.613424777984619, "logps/chosen": -227.0560302734375, "logps/rejected": -196.37181091308594, "loss": 0.6739, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.05607549101114273, "rewards/margins": 0.041366271674633026, "rewards/rejected": 0.014709214679896832, "step": 1330 }, { "epoch": 0.6431485481161507, "grad_norm": 46.75, "learning_rate": 1.7011983837942021e-07, "logits/chosen": -2.7072107791900635, "logits/rejected": -2.5902278423309326, "logps/chosen": -233.81179809570312, "logps/rejected": -206.1332244873047, "loss": 0.6712, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0620468370616436, "rewards/margins": 0.04747987538576126, "rewards/rejected": 0.014566963538527489, "step": 1340 }, { "epoch": 0.6479481641468683, "grad_norm": 45.5, "learning_rate": 1.6615991970134158e-07, "logits/chosen": -2.741150379180908, "logits/rejected": -2.6298651695251465, "logps/chosen": -221.12841796875, "logps/rejected": -191.54258728027344, "loss": 0.6773, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.04875689372420311, "rewards/margins": 0.03473493829369545, "rewards/rejected": 0.014021962881088257, "step": 1350 }, { "epoch": 0.6527477801775858, "grad_norm": 46.75, "learning_rate": 1.622235624772183e-07, "logits/chosen": -2.6976001262664795, "logits/rejected": -2.5869300365448, "logps/chosen": -232.46533203125, "logps/rejected": -209.5470428466797, "loss": 0.6755, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05371447280049324, "rewards/margins": 0.0387248769402504, "rewards/rejected": 0.014989593997597694, "step": 1360 }, { "epoch": 0.6575473962083034, "grad_norm": 46.5, "learning_rate": 1.5831187293570825e-07, "logits/chosen": -2.722553014755249, "logits/rejected": -2.602963924407959, "logps/chosen": -272.6893615722656, "logps/rejected": -217.13632202148438, "loss": 0.6784, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.06017666310071945, "rewards/margins": 0.03393205627799034, "rewards/rejected": 0.026244616135954857, "step": 1370 }, { "epoch": 0.6623470122390209, "grad_norm": 50.25, "learning_rate": 1.5442595037314648e-07, "logits/chosen": -2.7165400981903076, "logits/rejected": -2.5861897468566895, "logps/chosen": -240.0535888671875, "logps/rejected": -186.63296508789062, "loss": 0.6701, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.06224694103002548, "rewards/margins": 0.049013856798410416, "rewards/rejected": 0.013233085162937641, "step": 1380 }, { "epoch": 0.6671466282697385, "grad_norm": 48.25, "learning_rate": 1.5056688684461232e-07, "logits/chosen": -2.7177727222442627, "logits/rejected": -2.5875582695007324, "logps/chosen": -241.77590942382812, "logps/rejected": -202.8583221435547, "loss": 0.6715, "rewards/accuracies": 0.71875, "rewards/chosen": 0.05555950850248337, "rewards/margins": 0.04663746803998947, "rewards/rejected": 0.008922042325139046, "step": 1390 }, { "epoch": 0.6719462443004559, "grad_norm": 52.0, "learning_rate": 1.4673576685703026e-07, "logits/chosen": -2.71079158782959, "logits/rejected": -2.6190669536590576, "logps/chosen": -240.28317260742188, "logps/rejected": -208.4333953857422, "loss": 0.6746, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.05780891329050064, "rewards/margins": 0.04101189970970154, "rewards/rejected": 0.016797009855508804, "step": 1400 }, { "epoch": 0.6719462443004559, "eval_logits/chosen": -2.7162351608276367, "eval_logits/rejected": -2.595787763595581, "eval_logps/chosen": -232.603759765625, "eval_logps/rejected": -205.423583984375, "eval_loss": 0.6737259030342102, "eval_rewards/accuracies": 0.6549999713897705, "eval_rewards/chosen": 0.052920494228601456, "eval_rewards/margins": 0.04220106825232506, "eval_rewards/rejected": 0.010719424113631248, "eval_runtime": 21.4584, "eval_samples_per_second": 46.602, "eval_steps_per_second": 11.65, "step": 1400 }, { "epoch": 0.6767458603311735, "grad_norm": 45.25, "learning_rate": 1.429336670643929e-07, "logits/chosen": -2.6878199577331543, "logits/rejected": -2.5576183795928955, "logps/chosen": -217.6654052734375, "logps/rejected": -194.15072631835938, "loss": 0.6747, "rewards/accuracies": 0.65625, "rewards/chosen": 0.04295631870627403, "rewards/margins": 0.04018958657979965, "rewards/rejected": 0.0027667314279824495, "step": 1410 }, { "epoch": 0.681545476361891, "grad_norm": 44.25, "learning_rate": 1.3916165596519013e-07, "logits/chosen": -2.721832036972046, "logits/rejected": -2.5464541912078857, "logps/chosen": -230.22433471679688, "logps/rejected": -185.80426025390625, "loss": 0.673, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.0356689877808094, "rewards/margins": 0.04347275570034981, "rewards/rejected": -0.007803765125572681, "step": 1420 }, { "epoch": 0.6863450923926085, "grad_norm": 50.75, "learning_rate": 1.354207936021309e-07, "logits/chosen": -2.7223222255706787, "logits/rejected": -2.565199375152588, "logps/chosen": -222.4684295654297, "logps/rejected": -181.96670532226562, "loss": 0.6756, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0483887605369091, "rewards/margins": 0.0376238189637661, "rewards/rejected": 0.010764943435788155, "step": 1430 }, { "epoch": 0.6911447084233261, "grad_norm": 56.0, "learning_rate": 1.317121312642406e-07, "logits/chosen": -2.712290048599243, "logits/rejected": -2.5553765296936035, "logps/chosen": -231.23538208007812, "logps/rejected": -199.30477905273438, "loss": 0.6737, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.05653299763798714, "rewards/margins": 0.04253407567739487, "rewards/rejected": 0.013998927548527718, "step": 1440 }, { "epoch": 0.6959443244540436, "grad_norm": 54.0, "learning_rate": 1.280367111914195e-07, "logits/chosen": -2.635277509689331, "logits/rejected": -2.543097972869873, "logps/chosen": -245.3563690185547, "logps/rejected": -227.46142578125, "loss": 0.6802, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.055252768099308014, "rewards/margins": 0.029850680381059647, "rewards/rejected": 0.02540207840502262, "step": 1450 }, { "epoch": 0.7007439404847612, "grad_norm": 58.75, "learning_rate": 1.243955662815429e-07, "logits/chosen": -2.7177271842956543, "logits/rejected": -2.5459141731262207, "logps/chosen": -247.24038696289062, "logps/rejected": -206.8587188720703, "loss": 0.6777, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.05328672379255295, "rewards/margins": 0.03413508087396622, "rewards/rejected": 0.01915164105594158, "step": 1460 }, { "epoch": 0.7055435565154787, "grad_norm": 52.25, "learning_rate": 1.207897198001878e-07, "logits/chosen": -2.747087001800537, "logits/rejected": -2.646921396255493, "logps/chosen": -230.12109375, "logps/rejected": -196.12423706054688, "loss": 0.6732, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.050879109650850296, "rewards/margins": 0.04289903864264488, "rewards/rejected": 0.007980065420269966, "step": 1470 }, { "epoch": 0.7103431725461963, "grad_norm": 50.0, "learning_rate": 1.1722018509306586e-07, "logits/chosen": -2.708061456680298, "logits/rejected": -2.556723117828369, "logps/chosen": -247.2672119140625, "logps/rejected": -188.83200073242188, "loss": 0.6704, "rewards/accuracies": 0.625, "rewards/chosen": 0.04851624369621277, "rewards/margins": 0.04957341402769089, "rewards/rejected": -0.001057169632986188, "step": 1480 }, { "epoch": 0.7151427885769138, "grad_norm": 46.5, "learning_rate": 1.1368796530124442e-07, "logits/chosen": -2.671211004257202, "logits/rejected": -2.543172597885132, "logps/chosen": -249.50686645507812, "logps/rejected": -195.772216796875, "loss": 0.6652, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.06298129260540009, "rewards/margins": 0.05997220426797867, "rewards/rejected": 0.0030090927612036467, "step": 1490 }, { "epoch": 0.7199424046076314, "grad_norm": 48.25, "learning_rate": 1.1019405307923557e-07, "logits/chosen": -2.719313144683838, "logits/rejected": -2.598017454147339, "logps/chosen": -244.72067260742188, "logps/rejected": -204.39401245117188, "loss": 0.6727, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.06084643676877022, "rewards/margins": 0.04462161287665367, "rewards/rejected": 0.016224823892116547, "step": 1500 }, { "epoch": 0.7247420206383489, "grad_norm": 52.0, "learning_rate": 1.0673943031603133e-07, "logits/chosen": -2.7169058322906494, "logits/rejected": -2.6187710762023926, "logps/chosen": -221.7633819580078, "logps/rejected": -206.36257934570312, "loss": 0.6743, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.04818148910999298, "rewards/margins": 0.04159141331911087, "rewards/rejected": 0.006590074393898249, "step": 1510 }, { "epoch": 0.7295416366690665, "grad_norm": 51.0, "learning_rate": 1.0332506785916522e-07, "logits/chosen": -2.690253496170044, "logits/rejected": -2.5843067169189453, "logps/chosen": -238.80850219726562, "logps/rejected": -208.9747314453125, "loss": 0.6821, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.04689895361661911, "rewards/margins": 0.025392215698957443, "rewards/rejected": 0.021506736055016518, "step": 1520 }, { "epoch": 0.734341252699784, "grad_norm": 45.75, "learning_rate": 9.995192524187637e-08, "logits/chosen": -2.633424997329712, "logits/rejected": -2.576991081237793, "logps/chosen": -220.3133087158203, "logps/rejected": -207.2361297607422, "loss": 0.6806, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04313893988728523, "rewards/margins": 0.028983239084482193, "rewards/rejected": 0.01415570080280304, "step": 1530 }, { "epoch": 0.7391408687305016, "grad_norm": 47.25, "learning_rate": 9.662095041345317e-08, "logits/chosen": -2.6786739826202393, "logits/rejected": -2.547990322113037, "logps/chosen": -245.1093292236328, "logps/rejected": -216.21823120117188, "loss": 0.6736, "rewards/accuracies": 0.625, "rewards/chosen": 0.05186639353632927, "rewards/margins": 0.042230743914842606, "rewards/rejected": 0.00963564682751894, "step": 1540 }, { "epoch": 0.7439404847612191, "grad_norm": 52.25, "learning_rate": 9.333307947283256e-08, "logits/chosen": -2.7363951206207275, "logits/rejected": -2.621778964996338, "logps/chosen": -240.5101318359375, "logps/rejected": -212.27685546875, "loss": 0.677, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.04980180412530899, "rewards/margins": 0.035508893430233, "rewards/rejected": 0.014292912557721138, "step": 1550 }, { "epoch": 0.7487401007919366, "grad_norm": 45.75, "learning_rate": 9.008923640552978e-08, "logits/chosen": -2.688732147216797, "logits/rejected": -2.5987465381622314, "logps/chosen": -210.93844604492188, "logps/rejected": -178.03103637695312, "loss": 0.6756, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.03177894279360771, "rewards/margins": 0.03770405799150467, "rewards/rejected": -0.005925112869590521, "step": 1560 }, { "epoch": 0.7535397168226542, "grad_norm": 49.25, "learning_rate": 8.689033282397165e-08, "logits/chosen": -2.717036485671997, "logits/rejected": -2.59865140914917, "logps/chosen": -228.39285278320312, "logps/rejected": -202.19210815429688, "loss": 0.6765, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.04258617386221886, "rewards/margins": 0.03610239177942276, "rewards/rejected": 0.006483784876763821, "step": 1570 }, { "epoch": 0.7583393328533717, "grad_norm": 44.25, "learning_rate": 8.373726771130768e-08, "logits/chosen": -2.7102208137512207, "logits/rejected": -2.585137128829956, "logps/chosen": -240.1363983154297, "logps/rejected": -198.0478057861328, "loss": 0.6694, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.058793772011995316, "rewards/margins": 0.051263321191072464, "rewards/rejected": 0.007530451752245426, "step": 1580 }, { "epoch": 0.7631389488840893, "grad_norm": 54.5, "learning_rate": 8.063092716877015e-08, "logits/chosen": -2.654996871948242, "logits/rejected": -2.5496888160705566, "logps/chosen": -253.6233673095703, "logps/rejected": -211.02017211914062, "loss": 0.6746, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.05824859067797661, "rewards/margins": 0.04030389338731766, "rewards/rejected": 0.0179446954280138, "step": 1590 }, { "epoch": 0.7679385649148068, "grad_norm": 46.75, "learning_rate": 7.757218416665445e-08, "logits/chosen": -2.745260715484619, "logits/rejected": -2.5834543704986572, "logps/chosen": -229.595703125, "logps/rejected": -188.58816528320312, "loss": 0.6678, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05159440636634827, "rewards/margins": 0.054445721209049225, "rewards/rejected": -0.00285131623968482, "step": 1600 }, { "epoch": 0.7679385649148068, "eval_logits/chosen": -2.7164273262023926, "eval_logits/rejected": -2.5960192680358887, "eval_logps/chosen": -232.62025451660156, "eval_logps/rejected": -205.42721557617188, "eval_loss": 0.6742997169494629, "eval_rewards/accuracies": 0.6629999876022339, "eval_rewards/chosen": 0.051273249089717865, "eval_rewards/margins": 0.04091595113277435, "eval_rewards/rejected": 0.010357297956943512, "eval_runtime": 21.4386, "eval_samples_per_second": 46.645, "eval_steps_per_second": 11.661, "step": 1600 }, { "epoch": 0.7727381809455244, "grad_norm": 53.75, "learning_rate": 7.456189829898954e-08, "logits/chosen": -2.722618818283081, "logits/rejected": -2.5636143684387207, "logps/chosen": -236.95559692382812, "logps/rejected": -187.6465301513672, "loss": 0.6711, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05168156698346138, "rewards/margins": 0.04733755439519882, "rewards/rejected": 0.00434401398524642, "step": 1610 }, { "epoch": 0.7775377969762419, "grad_norm": 47.5, "learning_rate": 7.160091554196731e-08, "logits/chosen": -2.7647414207458496, "logits/rejected": -2.6293978691101074, "logps/chosen": -234.0639190673828, "logps/rejected": -196.3838653564453, "loss": 0.673, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.051941949874162674, "rewards/margins": 0.043656349182128906, "rewards/rejected": 0.008285606279969215, "step": 1620 }, { "epoch": 0.7823374130069595, "grad_norm": 46.75, "learning_rate": 6.86900680161994e-08, "logits/chosen": -2.6866321563720703, "logits/rejected": -2.612730026245117, "logps/chosen": -240.7451171875, "logps/rejected": -228.06771850585938, "loss": 0.6801, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.05514489486813545, "rewards/margins": 0.029478853568434715, "rewards/rejected": 0.025666039437055588, "step": 1630 }, { "epoch": 0.787137029037677, "grad_norm": 41.75, "learning_rate": 6.583017375286726e-08, "logits/chosen": -2.695338726043701, "logits/rejected": -2.5727577209472656, "logps/chosen": -229.021728515625, "logps/rejected": -195.28890991210938, "loss": 0.6761, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.050636857748031616, "rewards/margins": 0.03745696693658829, "rewards/rejected": 0.01317988894879818, "step": 1640 }, { "epoch": 0.7919366450683946, "grad_norm": 48.0, "learning_rate": 6.302203646383239e-08, "logits/chosen": -2.71480131149292, "logits/rejected": -2.6168365478515625, "logps/chosen": -241.9342041015625, "logps/rejected": -196.26498413085938, "loss": 0.6745, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.047247517853975296, "rewards/margins": 0.04049244523048401, "rewards/rejected": 0.006755062844604254, "step": 1650 }, { "epoch": 0.7967362610991121, "grad_norm": 50.0, "learning_rate": 6.02664453157703e-08, "logits/chosen": -2.764697790145874, "logits/rejected": -2.6510090827941895, "logps/chosen": -232.6202850341797, "logps/rejected": -214.68887329101562, "loss": 0.6789, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.04804060235619545, "rewards/margins": 0.031984902918338776, "rewards/rejected": 0.016055695712566376, "step": 1660 }, { "epoch": 0.8015358771298297, "grad_norm": 39.25, "learning_rate": 5.756417470839195e-08, "logits/chosen": -2.7477545738220215, "logits/rejected": -2.6470861434936523, "logps/chosen": -226.798828125, "logps/rejected": -196.84617614746094, "loss": 0.6756, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04336618259549141, "rewards/margins": 0.03806891292333603, "rewards/rejected": 0.005297265015542507, "step": 1670 }, { "epoch": 0.8063354931605472, "grad_norm": 46.0, "learning_rate": 5.491598405681558e-08, "logits/chosen": -2.7781832218170166, "logits/rejected": -2.594255208969116, "logps/chosen": -244.9146728515625, "logps/rejected": -191.92153930664062, "loss": 0.6696, "rewards/accuracies": 0.625, "rewards/chosen": 0.05770384520292282, "rewards/margins": 0.05054632946848869, "rewards/rejected": 0.007157514337450266, "step": 1680 }, { "epoch": 0.8111351091912647, "grad_norm": 46.25, "learning_rate": 5.232261757814924e-08, "logits/chosen": -2.676637649536133, "logits/rejected": -2.5312628746032715, "logps/chosen": -239.4225616455078, "logps/rejected": -201.6903076171875, "loss": 0.6665, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.06439922004938126, "rewards/margins": 0.0572139136493206, "rewards/rejected": 0.007185307331383228, "step": 1690 }, { "epoch": 0.8159347252219823, "grad_norm": 48.0, "learning_rate": 4.978480408234465e-08, "logits/chosen": -2.6256635189056396, "logits/rejected": -2.5944604873657227, "logps/chosen": -213.68728637695312, "logps/rejected": -203.4552459716797, "loss": 0.6819, "rewards/accuracies": 0.5625, "rewards/chosen": 0.035785894840955734, "rewards/margins": 0.025840366259217262, "rewards/rejected": 0.00994553230702877, "step": 1700 }, { "epoch": 0.8207343412526998, "grad_norm": 49.0, "learning_rate": 4.730325676738089e-08, "logits/chosen": -2.7068257331848145, "logits/rejected": -2.5982439517974854, "logps/chosen": -227.2494354248047, "logps/rejected": -194.36875915527344, "loss": 0.6773, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.041812531650066376, "rewards/margins": 0.0348488949239254, "rewards/rejected": 0.006963637657463551, "step": 1710 }, { "epoch": 0.8255339572834173, "grad_norm": 47.0, "learning_rate": 4.487867301883527e-08, "logits/chosen": -2.6358511447906494, "logits/rejected": -2.5312399864196777, "logps/chosen": -222.84445190429688, "logps/rejected": -203.24679565429688, "loss": 0.6697, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.05580927059054375, "rewards/margins": 0.04995386302471161, "rewards/rejected": 0.005855409428477287, "step": 1720 }, { "epoch": 0.8303335733141348, "grad_norm": 45.75, "learning_rate": 4.2511734213898085e-08, "logits/chosen": -2.7530338764190674, "logits/rejected": -2.6062283515930176, "logps/chosen": -251.0270233154297, "logps/rejected": -206.8340301513672, "loss": 0.6759, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0477396696805954, "rewards/margins": 0.038779519498348236, "rewards/rejected": 0.008960146456956863, "step": 1730 }, { "epoch": 0.8351331893448524, "grad_norm": 59.75, "learning_rate": 4.020310552988632e-08, "logits/chosen": -2.726264476776123, "logits/rejected": -2.5524630546569824, "logps/chosen": -247.4801788330078, "logps/rejected": -204.47012329101562, "loss": 0.67, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0596492774784565, "rewards/margins": 0.04989113658666611, "rewards/rejected": 0.009758138097822666, "step": 1740 }, { "epoch": 0.8399328053755699, "grad_norm": 48.5, "learning_rate": 3.795343575730975e-08, "logits/chosen": -2.7325968742370605, "logits/rejected": -2.602531671524048, "logps/chosen": -251.66342163085938, "logps/rejected": -216.50717163085938, "loss": 0.6805, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.052908021956682205, "rewards/margins": 0.029805365949869156, "rewards/rejected": 0.0231026578694582, "step": 1750 }, { "epoch": 0.8447324214062875, "grad_norm": 50.75, "learning_rate": 3.576335711754236e-08, "logits/chosen": -2.7325785160064697, "logits/rejected": -2.6831870079040527, "logps/chosen": -234.35336303710938, "logps/rejected": -216.2118682861328, "loss": 0.6801, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.04432320594787598, "rewards/margins": 0.028889168053865433, "rewards/rejected": 0.015434036031365395, "step": 1760 }, { "epoch": 0.849532037437005, "grad_norm": 52.25, "learning_rate": 3.363348508515015e-08, "logits/chosen": -2.7496337890625, "logits/rejected": -2.6104989051818848, "logps/chosen": -236.09640502929688, "logps/rejected": -209.546875, "loss": 0.6741, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05187439173460007, "rewards/margins": 0.041690729558467865, "rewards/rejected": 0.010183664970099926, "step": 1770 }, { "epoch": 0.8543316534677226, "grad_norm": 52.0, "learning_rate": 3.156441821492506e-08, "logits/chosen": -2.709050416946411, "logits/rejected": -2.584873676300049, "logps/chosen": -234.66232299804688, "logps/rejected": -206.11624145507812, "loss": 0.6738, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.05997660011053085, "rewards/margins": 0.04241427406668663, "rewards/rejected": 0.017562326043844223, "step": 1780 }, { "epoch": 0.8591312694984401, "grad_norm": 48.0, "learning_rate": 2.955673797367411e-08, "logits/chosen": -2.7106432914733887, "logits/rejected": -2.567945718765259, "logps/chosen": -238.9900665283203, "logps/rejected": -189.52700805664062, "loss": 0.6738, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.052786171436309814, "rewards/margins": 0.04187396913766861, "rewards/rejected": 0.01091220136731863, "step": 1790 }, { "epoch": 0.8639308855291576, "grad_norm": 49.75, "learning_rate": 2.7611008576810674e-08, "logits/chosen": -2.724682331085205, "logits/rejected": -2.603818416595459, "logps/chosen": -215.18359375, "logps/rejected": -187.88693237304688, "loss": 0.6833, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.03471002355217934, "rewards/margins": 0.02330438420176506, "rewards/rejected": 0.011405635625123978, "step": 1800 }, { "epoch": 0.8639308855291576, "eval_logits/chosen": -2.716348886489868, "eval_logits/rejected": -2.595895290374756, "eval_logps/chosen": -232.60520935058594, "eval_logps/rejected": -205.4226837158203, "eval_loss": 0.673865795135498, "eval_rewards/accuracies": 0.6620000004768372, "eval_rewards/chosen": 0.05277761444449425, "eval_rewards/margins": 0.041967809200286865, "eval_rewards/rejected": 0.010809808038175106, "eval_runtime": 21.4412, "eval_samples_per_second": 46.639, "eval_steps_per_second": 11.66, "step": 1800 }, { "epoch": 0.8687305015598752, "grad_norm": 53.5, "learning_rate": 2.5727776829793767e-08, "logits/chosen": -2.740374803543091, "logits/rejected": -2.5743534564971924, "logps/chosen": -236.7235565185547, "logps/rejected": -174.8079833984375, "loss": 0.6725, "rewards/accuracies": 0.625, "rewards/chosen": 0.045521851629018784, "rewards/margins": 0.04434143006801605, "rewards/rejected": 0.0011804220266640186, "step": 1810 }, { "epoch": 0.8735301175905927, "grad_norm": 52.75, "learning_rate": 2.390757197446025e-08, "logits/chosen": -2.721191883087158, "logits/rejected": -2.5652270317077637, "logps/chosen": -240.0341339111328, "logps/rejected": -187.92848205566406, "loss": 0.6739, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.05156964808702469, "rewards/margins": 0.041832335293293, "rewards/rejected": 0.009737305343151093, "step": 1820 }, { "epoch": 0.8783297336213103, "grad_norm": 50.0, "learning_rate": 2.2150905540292585e-08, "logits/chosen": -2.736666679382324, "logits/rejected": -2.5938286781311035, "logps/chosen": -227.4267578125, "logps/rejected": -202.85935974121094, "loss": 0.6757, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.054040707647800446, "rewards/margins": 0.03809930384159088, "rewards/rejected": 0.015941400080919266, "step": 1830 }, { "epoch": 0.8831293496520278, "grad_norm": 46.25, "learning_rate": 2.0458271200664624e-08, "logits/chosen": -2.6549439430236816, "logits/rejected": -2.612755537033081, "logps/chosen": -210.38265991210938, "logps/rejected": -196.44735717773438, "loss": 0.6762, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.04778756946325302, "rewards/margins": 0.0375523678958416, "rewards/rejected": 0.01023520715534687, "step": 1840 }, { "epoch": 0.8879289656827454, "grad_norm": 48.0, "learning_rate": 1.8830144634105206e-08, "logits/chosen": -2.7017998695373535, "logits/rejected": -2.5428473949432373, "logps/chosen": -245.57290649414062, "logps/rejected": -186.67721557617188, "loss": 0.6703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.061512064188718796, "rewards/margins": 0.04928315803408623, "rewards/rejected": 0.012228906154632568, "step": 1850 }, { "epoch": 0.8927285817134629, "grad_norm": 46.5, "learning_rate": 1.7266983390618994e-08, "logits/chosen": -2.6695878505706787, "logits/rejected": -2.551301956176758, "logps/chosen": -227.3214111328125, "logps/rejected": -186.639404296875, "loss": 0.6687, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0619601309299469, "rewards/margins": 0.05267205834388733, "rewards/rejected": 0.009288066066801548, "step": 1860 }, { "epoch": 0.8975281977441805, "grad_norm": 44.5, "learning_rate": 1.5769226763101885e-08, "logits/chosen": -2.60475492477417, "logits/rejected": -2.5645339488983154, "logps/chosen": -228.18612670898438, "logps/rejected": -207.1147003173828, "loss": 0.6772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.038809359073638916, "rewards/margins": 0.03571794182062149, "rewards/rejected": 0.0030914172530174255, "step": 1870 }, { "epoch": 0.902327813774898, "grad_norm": 47.0, "learning_rate": 1.4337295663887084e-08, "logits/chosen": -2.763521194458008, "logits/rejected": -2.614365339279175, "logps/chosen": -238.0912628173828, "logps/rejected": -187.44529724121094, "loss": 0.6705, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.06117742136120796, "rewards/margins": 0.04880157858133316, "rewards/rejected": 0.012375839985907078, "step": 1880 }, { "epoch": 0.9071274298056156, "grad_norm": 52.75, "learning_rate": 1.2971592506456796e-08, "logits/chosen": -2.6662535667419434, "logits/rejected": -2.585869550704956, "logps/chosen": -203.06375122070312, "logps/rejected": -187.27978515625, "loss": 0.6761, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.04065801948308945, "rewards/margins": 0.03694169595837593, "rewards/rejected": 0.0037163265515118837, "step": 1890 }, { "epoch": 0.9119270458363331, "grad_norm": 46.75, "learning_rate": 1.1672501092352544e-08, "logits/chosen": -2.7174623012542725, "logits/rejected": -2.5767555236816406, "logps/chosen": -239.62747192382812, "logps/rejected": -203.18661499023438, "loss": 0.6694, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.05886067822575569, "rewards/margins": 0.05083751678466797, "rewards/rejected": 0.008023159578442574, "step": 1900 }, { "epoch": 0.9167266618670507, "grad_norm": 50.5, "learning_rate": 1.0440386503315967e-08, "logits/chosen": -2.638658046722412, "logits/rejected": -2.565709114074707, "logps/chosen": -222.6656494140625, "logps/rejected": -242.17562866210938, "loss": 0.6788, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.049609534442424774, "rewards/margins": 0.0319090262055397, "rewards/rejected": 0.01770050823688507, "step": 1910 }, { "epoch": 0.9215262778977682, "grad_norm": 49.25, "learning_rate": 9.275594998690573e-09, "logits/chosen": -2.688535690307617, "logits/rejected": -2.516364574432373, "logps/chosen": -248.7972412109375, "logps/rejected": -191.66940307617188, "loss": 0.6685, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0669298768043518, "rewards/margins": 0.052894193679094315, "rewards/rejected": 0.014035684056580067, "step": 1920 }, { "epoch": 0.9263258939284857, "grad_norm": 50.75, "learning_rate": 8.178453918112782e-09, "logits/chosen": -2.695676326751709, "logits/rejected": -2.564342737197876, "logps/chosen": -223.0631561279297, "logps/rejected": -179.12435913085938, "loss": 0.6699, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.04940425604581833, "rewards/margins": 0.049749527126550674, "rewards/rejected": -0.00034527387470006943, "step": 1930 }, { "epoch": 0.9311255099592033, "grad_norm": 64.5, "learning_rate": 7.149271589520167e-09, "logits/chosen": -2.655266523361206, "logits/rejected": -2.529818058013916, "logps/chosen": -209.69650268554688, "logps/rejected": -191.54139709472656, "loss": 0.6801, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.03373004496097565, "rewards/margins": 0.029740754514932632, "rewards/rejected": 0.0039892946369946, "step": 1940 }, { "epoch": 0.9359251259899208, "grad_norm": 40.75, "learning_rate": 6.188337242502784e-09, "logits/chosen": -2.6980903148651123, "logits/rejected": -2.5514559745788574, "logps/chosen": -236.17300415039062, "logps/rejected": -191.43814086914062, "loss": 0.6731, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.051222801208496094, "rewards/margins": 0.04344618320465088, "rewards/rejected": 0.007776615209877491, "step": 1950 }, { "epoch": 0.9407247420206384, "grad_norm": 43.5, "learning_rate": 5.295920927021108e-09, "logits/chosen": -2.7303788661956787, "logits/rejected": -2.624377727508545, "logps/chosen": -233.98922729492188, "logps/rejected": -197.11575317382812, "loss": 0.6741, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.05416171997785568, "rewards/margins": 0.04171646013855934, "rewards/rejected": 0.01244526170194149, "step": 1960 }, { "epoch": 0.9455243580513559, "grad_norm": 53.0, "learning_rate": 4.472273437514357e-09, "logits/chosen": -2.7538888454437256, "logits/rejected": -2.6263771057128906, "logps/chosen": -257.5274353027344, "logps/rejected": -210.43649291992188, "loss": 0.6698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0656302273273468, "rewards/margins": 0.050034552812576294, "rewards/rejected": 0.015595669858157635, "step": 1970 }, { "epoch": 0.9503239740820735, "grad_norm": 46.75, "learning_rate": 3.7176262424202522e-09, "logits/chosen": -2.710458278656006, "logits/rejected": -2.611675262451172, "logps/chosen": -221.28195190429688, "logps/rejected": -202.12786865234375, "loss": 0.6825, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.03624237701296806, "rewards/margins": 0.023813677951693535, "rewards/rejected": 0.012428699992597103, "step": 1980 }, { "epoch": 0.955123590112791, "grad_norm": 51.25, "learning_rate": 3.0321914191255292e-09, "logits/chosen": -2.684296131134033, "logits/rejected": -2.57779598236084, "logps/chosen": -241.6301727294922, "logps/rejected": -214.57138061523438, "loss": 0.6789, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.044890034943819046, "rewards/margins": 0.03138625621795654, "rewards/rejected": 0.013503775000572205, "step": 1990 }, { "epoch": 0.9599232061435086, "grad_norm": 45.25, "learning_rate": 2.416161594366417e-09, "logits/chosen": -2.744062900543213, "logits/rejected": -2.644768238067627, "logps/chosen": -218.0794677734375, "logps/rejected": -204.07879638671875, "loss": 0.6743, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.05610308051109314, "rewards/margins": 0.04027719795703888, "rewards/rejected": 0.015825878828763962, "step": 2000 }, { "epoch": 0.9599232061435086, "eval_logits/chosen": -2.716303825378418, "eval_logits/rejected": -2.5958354473114014, "eval_logps/chosen": -232.603271484375, "eval_logps/rejected": -205.41600036621094, "eval_loss": 0.6740667223930359, "eval_rewards/accuracies": 0.6449999809265137, "eval_rewards/chosen": 0.05297102406620979, "eval_rewards/margins": 0.04149361699819565, "eval_rewards/rejected": 0.011477403342723846, "eval_runtime": 21.4581, "eval_samples_per_second": 46.602, "eval_steps_per_second": 11.651, "step": 2000 }, { "epoch": 0.9647228221742261, "grad_norm": 46.75, "learning_rate": 1.8697098900948283e-09, "logits/chosen": -2.670266628265381, "logits/rejected": -2.5685760974884033, "logps/chosen": -225.6419219970703, "logps/rejected": -218.05624389648438, "loss": 0.6796, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.04892081022262573, "rewards/margins": 0.0300876684486866, "rewards/rejected": 0.01883314736187458, "step": 2010 }, { "epoch": 0.9695224382049437, "grad_norm": 46.5, "learning_rate": 1.3929898748261948e-09, "logits/chosen": -2.751359224319458, "logits/rejected": -2.6107022762298584, "logps/chosen": -228.06881713867188, "logps/rejected": -205.7585906982422, "loss": 0.6756, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.042551226913928986, "rewards/margins": 0.03797770291566849, "rewards/rejected": 0.004573523066937923, "step": 2020 }, { "epoch": 0.9743220542356611, "grad_norm": 50.0, "learning_rate": 9.861355204825172e-10, "logits/chosen": -2.7200140953063965, "logits/rejected": -2.597716808319092, "logps/chosen": -256.5636291503906, "logps/rejected": -198.85813903808594, "loss": 0.6759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05483978986740112, "rewards/margins": 0.03850018233060837, "rewards/rejected": 0.016339603811502457, "step": 2030 }, { "epoch": 0.9791216702663786, "grad_norm": 47.25, "learning_rate": 6.492611647420932e-10, "logits/chosen": -2.6937568187713623, "logits/rejected": -2.550854206085205, "logps/chosen": -227.3085479736328, "logps/rejected": -193.7042694091797, "loss": 0.6746, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0502045638859272, "rewards/margins": 0.040242839604616165, "rewards/rejected": 0.009961729869246483, "step": 2040 }, { "epoch": 0.9839212862970962, "grad_norm": 50.0, "learning_rate": 3.8246147890763636e-10, "logits/chosen": -2.7479116916656494, "logits/rejected": -2.6015403270721436, "logps/chosen": -243.7687530517578, "logps/rejected": -200.82359313964844, "loss": 0.6735, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.05925138667225838, "rewards/margins": 0.042620036751031876, "rewards/rejected": 0.016631346195936203, "step": 2050 }, { "epoch": 0.9887209023278137, "grad_norm": 44.25, "learning_rate": 1.8581144130089266e-10, "logits/chosen": -2.674731731414795, "logits/rejected": -2.6031594276428223, "logps/chosen": -220.9689483642578, "logps/rejected": -206.5128173828125, "loss": 0.6804, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.046965621411800385, "rewards/margins": 0.028399232774972916, "rewards/rejected": 0.018566394224762917, "step": 2060 }, { "epoch": 0.9935205183585313, "grad_norm": 45.25, "learning_rate": 5.936631619152255e-11, "logits/chosen": -2.718005895614624, "logits/rejected": -2.6375200748443604, "logps/chosen": -232.222900390625, "logps/rejected": -203.7240753173828, "loss": 0.6763, "rewards/accuracies": 0.625, "rewards/chosen": 0.05353207513689995, "rewards/margins": 0.03654414042830467, "rewards/rejected": 0.016987936571240425, "step": 2070 }, { "epoch": 0.9983201343892488, "grad_norm": 49.25, "learning_rate": 3.1616382663024467e-12, "logits/chosen": -2.7572877407073975, "logits/rejected": -2.611027956008911, "logps/chosen": -232.747314453125, "logps/rejected": -208.55935668945312, "loss": 0.6791, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.04861464723944664, "rewards/margins": 0.03146715834736824, "rewards/rejected": 0.0171474888920784, "step": 2080 }, { "epoch": 0.9997600191984641, "step": 2083, "total_flos": 0.0, "train_loss": 0.6792905164692074, "train_runtime": 2163.7863, "train_samples_per_second": 15.405, "train_steps_per_second": 0.963 } ], "logging_steps": 10, "max_steps": 2083, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }