{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010471204188481676, "grad_norm": 45.70638656616211, "kl": 0.03365863859653473, "learning_rate": 0.0, "logits/chosen": -133350016.0, "logits/rejected": -100751848.0, "logps/chosen": -199.38916015625, "logps/rejected": -248.57103704637098, "loss": 1.9996, "rewards/chosen": -0.006603976993849783, "rewards/margins": 0.002509254980413562, "rewards/rejected": -0.009113231974263345, "step": 1 }, { "epoch": 0.010471204188481676, "grad_norm": 45.63622283935547, "kl": 0.05001102015376091, "learning_rate": 4.6875e-08, "logits/chosen": -114836760.0, "logits/rejected": -115496552.0, "logps/chosen": -280.943407960199, "logps/rejected": -255.2594489981785, "loss": 2.0019, "rewards/chosen": -0.001783605436385172, "rewards/margins": -0.0038228675079356074, "rewards/rejected": 0.0020392620715504353, "step": 10 }, { "epoch": 0.020942408376963352, "grad_norm": 49.609336853027344, "kl": 0.050704918801784515, "learning_rate": 9.895833333333332e-08, "logits/chosen": -125602032.0, "logits/rejected": -116276880.0, "logps/chosen": -279.09286115269464, "logps/rejected": -261.840047998366, "loss": 1.9992, "rewards/chosen": 0.0030849476060467564, "rewards/margins": 0.0017726313628812977, "rewards/rejected": 0.0013123162431654587, "step": 20 }, { "epoch": 0.031413612565445025, "grad_norm": 45.7076301574707, "kl": 0.0796850323677063, "learning_rate": 1.5104166666666664e-07, "logits/chosen": -117311640.0, "logits/rejected": -114144304.0, "logps/chosen": -294.7149727852853, "logps/rejected": -242.79242162052117, "loss": 1.9982, "rewards/chosen": 0.009787982648557372, "rewards/margins": 0.002934079104880057, "rewards/rejected": 0.006853903543677315, "step": 30 }, { "epoch": 0.041884816753926704, "grad_norm": 53.927669525146484, "kl": 0.20030224323272705, "learning_rate": 2.03125e-07, "logits/chosen": -119643952.0, "logits/rejected": -120977776.0, "logps/chosen": -306.0683379120879, "logps/rejected": -278.4179383748056, "loss": 1.9963, "rewards/chosen": 0.03329542031280661, "rewards/margins": 0.007960547439639757, "rewards/rejected": 0.025334872873166856, "step": 40 }, { "epoch": 0.05235602094240838, "grad_norm": 51.90446090698242, "kl": 0.34236329793930054, "learning_rate": 2.552083333333333e-07, "logits/chosen": -119124032.0, "logits/rejected": -115569560.0, "logps/chosen": -311.1760096153846, "logps/rejected": -268.8574900793651, "loss": 1.9864, "rewards/chosen": 0.07745902428260217, "rewards/margins": 0.02584011520367111, "rewards/rejected": 0.051618909078931054, "step": 50 }, { "epoch": 0.06282722513089005, "grad_norm": 51.83867263793945, "kl": 0.503300666809082, "learning_rate": 3.0729166666666665e-07, "logits/chosen": -114966488.0, "logits/rejected": -120589360.0, "logps/chosen": -299.37085962145113, "logps/rejected": -272.62553212074306, "loss": 1.9757, "rewards/chosen": 0.12960234199788667, "rewards/margins": 0.050474802944834835, "rewards/rejected": 0.07912753905305184, "step": 60 }, { "epoch": 0.07329842931937172, "grad_norm": 55.578922271728516, "kl": 0.31499481201171875, "learning_rate": 3.59375e-07, "logits/chosen": -111468848.0, "logits/rejected": -121516712.0, "logps/chosen": -281.68463625401927, "logps/rejected": -274.1886635638298, "loss": 1.9691, "rewards/chosen": 0.17197007304985806, "rewards/margins": 0.07172654231262758, "rewards/rejected": 0.10024353073723048, "step": 70 }, { "epoch": 0.08376963350785341, "grad_norm": 53.44794464111328, "kl": 0.14020584523677826, "learning_rate": 4.114583333333333e-07, "logits/chosen": -124242176.0, "logits/rejected": -114853336.0, "logps/chosen": -309.7781393568147, "logps/rejected": -254.96207137161085, "loss": 1.919, "rewards/chosen": 0.2425672171857054, "rewards/margins": 0.16257205424617438, "rewards/rejected": 0.079995162939531, "step": 80 }, { "epoch": 0.09424083769633508, "grad_norm": 45.9337043762207, "kl": 0.010224603116512299, "learning_rate": 4.6354166666666664e-07, "logits/chosen": -114247248.0, "logits/rejected": -121002304.0, "logps/chosen": -255.63410433070865, "logps/rejected": -255.94968507751938, "loss": 1.9103, "rewards/chosen": 0.2659489669199065, "rewards/margins": 0.1896964983080636, "rewards/rejected": 0.0762524686118429, "step": 90 }, { "epoch": 0.10471204188481675, "grad_norm": 47.694000244140625, "kl": 0.0, "learning_rate": 4.999849525959245e-07, "logits/chosen": -116696832.0, "logits/rejected": -135091520.0, "logps/chosen": -299.64565284653463, "logps/rejected": -257.15984328635017, "loss": 1.8452, "rewards/chosen": 0.3131070278658725, "rewards/margins": 0.35410608100594115, "rewards/rejected": -0.04099905314006862, "step": 100 }, { "epoch": 0.11518324607329843, "grad_norm": 45.92950439453125, "kl": 0.0, "learning_rate": 4.997174935782199e-07, "logits/chosen": -114301056.0, "logits/rejected": -125180608.0, "logps/chosen": -289.91893468118195, "logps/rejected": -249.99958300627944, "loss": 1.8212, "rewards/chosen": 0.0881744610206521, "rewards/margins": 0.3989718005393923, "rewards/rejected": -0.31079733951874017, "step": 110 }, { "epoch": 0.1256544502617801, "grad_norm": 44.388973236083984, "kl": 0.0, "learning_rate": 4.9911605954668e-07, "logits/chosen": -128287936.0, "logits/rejected": -123292968.0, "logps/chosen": -274.05433947772656, "logps/rejected": -291.2209608505564, "loss": 1.7655, "rewards/chosen": 0.06379634663805983, "rewards/margins": 0.5666303574066409, "rewards/rejected": -0.502834010768581, "step": 120 }, { "epoch": 0.13612565445026178, "grad_norm": 43.125389099121094, "kl": 0.0, "learning_rate": 4.981814548660135e-07, "logits/chosen": -118625360.0, "logits/rejected": -142319376.0, "logps/chosen": -286.44763163349916, "logps/rejected": -260.8665758862629, "loss": 1.688, "rewards/chosen": 0.16187965810595462, "rewards/margins": 0.7612699556618077, "rewards/rejected": -0.599390297555853, "step": 130 }, { "epoch": 0.14659685863874344, "grad_norm": 43.5549201965332, "kl": 0.0, "learning_rate": 4.969149294871417e-07, "logits/chosen": -134174704.0, "logits/rejected": -129014448.0, "logps/chosen": -270.30858126996804, "logps/rejected": -286.30975248470946, "loss": 1.6923, "rewards/chosen": -0.13654317642553165, "rewards/margins": 0.839808220327622, "rewards/rejected": -0.9763513967531536, "step": 140 }, { "epoch": 0.15706806282722513, "grad_norm": 38.105342864990234, "kl": 0.0, "learning_rate": 4.953181772754997e-07, "logits/chosen": -140920352.0, "logits/rejected": -132664600.0, "logps/chosen": -272.86307251908397, "logps/rejected": -266.89385, "loss": 1.6369, "rewards/chosen": -0.0188656086230096, "rewards/margins": 1.1427980632519903, "rewards/rejected": -1.161663671875, "step": 150 }, { "epoch": 0.16753926701570682, "grad_norm": 38.238868713378906, "kl": 0.0, "learning_rate": 4.93393333745642e-07, "logits/chosen": -130763216.0, "logits/rejected": -132181120.0, "logps/chosen": -267.7964184253247, "logps/rejected": -265.75249435240966, "loss": 1.6156, "rewards/chosen": 0.03252483962418197, "rewards/margins": 1.1381634585530223, "rewards/rejected": -1.1056386189288403, "step": 160 }, { "epoch": 0.17801047120418848, "grad_norm": 149.7418212890625, "kl": 0.0, "learning_rate": 4.9114297320518e-07, "logits/chosen": -148455008.0, "logits/rejected": -148213216.0, "logps/chosen": -292.17505877742946, "logps/rejected": -289.26684190031153, "loss": 1.6163, "rewards/chosen": -0.49163005195067594, "rewards/margins": 1.4294290544185801, "rewards/rejected": -1.9210591063692561, "step": 170 }, { "epoch": 0.18848167539267016, "grad_norm": 43.059410095214844, "kl": 0.0, "learning_rate": 4.885701053118751e-07, "logits/chosen": -147940400.0, "logits/rejected": -144332512.0, "logps/chosen": -282.0602057573416, "logps/rejected": -282.34577409162716, "loss": 1.5876, "rewards/chosen": 0.034519776310397446, "rewards/margins": 1.3361885912602889, "rewards/rejected": -1.3016688149498914, "step": 180 }, { "epoch": 0.19895287958115182, "grad_norm": 40.49046325683594, "kl": 0.0, "learning_rate": 4.856781710484872e-07, "logits/chosen": -139495344.0, "logits/rejected": -144202096.0, "logps/chosen": -280.0632974481659, "logps/rejected": -290.6424196018377, "loss": 1.5551, "rewards/chosen": 0.12813351523172722, "rewards/margins": 1.4480502681389824, "rewards/rejected": -1.319916752907255, "step": 190 }, { "epoch": 0.2094240837696335, "grad_norm": 40.46213912963867, "kl": 0.0, "learning_rate": 4.824710381207655e-07, "logits/chosen": -144665424.0, "logits/rejected": -151805984.0, "logps/chosen": -291.33322447749197, "logps/rejected": -285.26674107142856, "loss": 1.5841, "rewards/chosen": 0.006562507420874102, "rewards/margins": 1.4862394700443544, "rewards/rejected": -1.4796769626234803, "step": 200 }, { "epoch": 0.2094240837696335, "eval_kl": 0.0, "eval_logits/chosen": -151004736.0, "eval_logits/rejected": -149476768.0, "eval_logps/chosen": -289.55475, "eval_logps/rejected": -284.09784375, "eval_loss": 0.39710375666618347, "eval_rewards/chosen": -0.16988844299316405, "eval_rewards/margins": 1.544695541381836, "eval_rewards/rejected": -1.714583984375, "eval_runtime": 92.5994, "eval_samples_per_second": 43.197, "eval_steps_per_second": 1.35, "step": 200 }, { "epoch": 0.2198952879581152, "grad_norm": 36.308265686035156, "kl": 0.0, "learning_rate": 4.789529957847353e-07, "logits/chosen": -152687040.0, "logits/rejected": -144599248.0, "logps/chosen": -300.46617366412215, "logps/rejected": -276.6361, "loss": 1.5542, "rewards/chosen": -0.020091176215018935, "rewards/margins": 1.684735972222481, "rewards/rejected": -1.7048271484375, "step": 210 }, { "epoch": 0.23036649214659685, "grad_norm": 42.04066467285156, "kl": 0.0, "learning_rate": 4.751287491101977e-07, "logits/chosen": -153300528.0, "logits/rejected": -141240576.0, "logps/chosen": -284.2961228649068, "logps/rejected": -269.83765723270443, "loss": 1.626, "rewards/chosen": -0.18326281908876407, "rewards/margins": 1.4757443193374349, "rewards/rejected": -1.6590071384261988, "step": 220 }, { "epoch": 0.24083769633507854, "grad_norm": 36.196807861328125, "kl": 0.0, "learning_rate": 4.710034126881159e-07, "logits/chosen": -159471936.0, "logits/rejected": -135821408.0, "logps/chosen": -301.3033342430859, "logps/rejected": -293.3502055227656, "loss": 1.5672, "rewards/chosen": 0.1034569663945938, "rewards/margins": 1.6865190209305867, "rewards/rejected": -1.5830620545359928, "step": 230 }, { "epoch": 0.2513089005235602, "grad_norm": 36.31395721435547, "kl": 0.0, "learning_rate": 4.665825037903035e-07, "logits/chosen": -151562448.0, "logits/rejected": -142325232.0, "logps/chosen": -277.93509244992293, "logps/rejected": -272.8287688193344, "loss": 1.5525, "rewards/chosen": 0.3289636691288882, "rewards/margins": 1.6360129685110691, "rewards/rejected": -1.307049299382181, "step": 240 }, { "epoch": 0.2617801047120419, "grad_norm": 39.01026916503906, "kl": 0.0, "learning_rate": 4.618719349905619e-07, "logits/chosen": -156142944.0, "logits/rejected": -141863280.0, "logps/chosen": -294.4734971374046, "logps/rejected": -273.881775, "loss": 1.525, "rewards/chosen": 0.3023297957791627, "rewards/margins": 1.7884680770291626, "rewards/rejected": -1.48613828125, "step": 250 }, { "epoch": 0.27225130890052357, "grad_norm": 40.547950744628906, "kl": 0.0, "learning_rate": 4.568780062571374e-07, "logits/chosen": -152771744.0, "logits/rejected": -151975136.0, "logps/chosen": -278.8975861378205, "logps/rejected": -288.9916634908537, "loss": 1.5231, "rewards/chosen": 0.07649397850036621, "rewards/margins": 1.9374570090596268, "rewards/rejected": -1.8609630305592606, "step": 260 }, { "epoch": 0.28272251308900526, "grad_norm": 38.78754806518555, "kl": 0.0, "learning_rate": 4.516073965270717e-07, "logits/chosen": -147246848.0, "logits/rejected": -140734832.0, "logps/chosen": -275.6838321596244, "logps/rejected": -295.4493467238689, "loss": 1.5248, "rewards/chosen": 0.03273308743520149, "rewards/margins": 1.9670623063070813, "rewards/rejected": -1.93432921887188, "step": 270 }, { "epoch": 0.2931937172774869, "grad_norm": 38.30440902709961, "kl": 0.0, "learning_rate": 4.460671547737158e-07, "logits/chosen": -139242080.0, "logits/rejected": -144891984.0, "logps/chosen": -307.5467996382637, "logps/rejected": -275.4886018237082, "loss": 1.5128, "rewards/chosen": -0.2156752313448302, "rewards/margins": 1.9168140977062718, "rewards/rejected": -2.132489329051102, "step": 280 }, { "epoch": 0.3036649214659686, "grad_norm": 52.507747650146484, "kl": 0.0, "learning_rate": 4.40264690579353e-07, "logits/chosen": -153254432.0, "logits/rejected": -148096032.0, "logps/chosen": -296.4355230564024, "logps/rejected": -277.0494791666667, "loss": 1.5223, "rewards/chosen": -0.014792419061428162, "rewards/margins": 2.2441466881976866, "rewards/rejected": -2.2589391072591147, "step": 290 }, { "epoch": 0.31413612565445026, "grad_norm": 42.4326286315918, "kl": 0.0, "learning_rate": 4.3420776422553916e-07, "logits/chosen": -146678224.0, "logits/rejected": -144266464.0, "logps/chosen": -289.21781823394497, "logps/rejected": -280.0624001597444, "loss": 1.5308, "rewards/chosen": 0.00835518734899865, "rewards/margins": 2.1249945409457442, "rewards/rejected": -2.1166393535967454, "step": 300 }, { "epoch": 0.32460732984293195, "grad_norm": 48.941036224365234, "kl": 0.0, "learning_rate": 4.279044763144141e-07, "logits/chosen": -139984352.0, "logits/rejected": -148744560.0, "logps/chosen": -271.40555111821084, "logps/rejected": -308.63379204892965, "loss": 1.5119, "rewards/chosen": -0.12764440176966854, "rewards/margins": 2.093687366674521, "rewards/rejected": -2.22133176844419, "step": 310 }, { "epoch": 0.33507853403141363, "grad_norm": 30.565563201904297, "kl": 0.0, "learning_rate": 4.213632569348639e-07, "logits/chosen": -160167840.0, "logits/rejected": -138572496.0, "logps/chosen": -290.84521412884334, "logps/rejected": -290.7674309045226, "loss": 1.5273, "rewards/chosen": -0.013026183032012091, "rewards/margins": 2.366796105493951, "rewards/rejected": -2.379822288525963, "step": 320 }, { "epoch": 0.34554973821989526, "grad_norm": 37.133567810058594, "kl": 0.0, "learning_rate": 4.145928543880249e-07, "logits/chosen": -143369248.0, "logits/rejected": -145142368.0, "logps/chosen": -288.8722847551343, "logps/rejected": -284.5384949768161, "loss": 1.4706, "rewards/chosen": 0.3644241646379468, "rewards/margins": 2.3581714100373867, "rewards/rejected": -1.9937472453994398, "step": 330 }, { "epoch": 0.35602094240837695, "grad_norm": 42.86240005493164, "kl": 0.0, "learning_rate": 4.076023234872057e-07, "logits/chosen": -134226816.0, "logits/rejected": -150647472.0, "logps/chosen": -290.2114297253635, "logps/rejected": -284.6018816187595, "loss": 1.4998, "rewards/chosen": 0.00031860425514627927, "rewards/margins": 2.320413741535309, "rewards/rejected": -2.3200951372801626, "step": 340 }, { "epoch": 0.36649214659685864, "grad_norm": 43.323272705078125, "kl": 0.0, "learning_rate": 4.004010134478771e-07, "logits/chosen": -153940128.0, "logits/rejected": -142368416.0, "logps/chosen": -277.28628954475306, "logps/rejected": -281.2114319620253, "loss": 1.4997, "rewards/chosen": 0.20580647315508055, "rewards/margins": 2.183485568845844, "rewards/rejected": -1.9776790956907635, "step": 350 }, { "epoch": 0.3769633507853403, "grad_norm": 46.848731994628906, "kl": 0.0, "learning_rate": 3.9299855538392534e-07, "logits/chosen": -146128720.0, "logits/rejected": -144031184.0, "logps/chosen": -288.1243632445141, "logps/rejected": -282.6408197040498, "loss": 1.4687, "rewards/chosen": 0.38341842699200385, "rewards/margins": 2.226742892164317, "rewards/rejected": -1.843324465172313, "step": 360 }, { "epoch": 0.387434554973822, "grad_norm": 41.83399963378906, "kl": 0.0, "learning_rate": 3.8540484942689075e-07, "logits/chosen": -145012880.0, "logits/rejected": -147854064.0, "logps/chosen": -280.8901771496815, "logps/rejected": -297.4892398389571, "loss": 1.4964, "rewards/chosen": 0.17396898937832778, "rewards/margins": 2.185319656196771, "rewards/rejected": -2.0113506668184433, "step": 370 }, { "epoch": 0.39790575916230364, "grad_norm": 41.95068359375, "kl": 0.0, "learning_rate": 3.77630051485419e-07, "logits/chosen": -158335904.0, "logits/rejected": -135963312.0, "logps/chosen": -299.00375645280235, "logps/rejected": -290.30250726744185, "loss": 1.5113, "rewards/chosen": -0.136355295997102, "rewards/margins": 2.5501342781437826, "rewards/rejected": -2.6864895741408845, "step": 380 }, { "epoch": 0.4083769633507853, "grad_norm": 52.55752944946289, "kl": 0.0, "learning_rate": 3.696845596626342e-07, "logits/chosen": -136675232.0, "logits/rejected": -139554592.0, "logps/chosen": -267.1172371031746, "logps/rejected": -287.2497836538462, "loss": 1.4728, "rewards/chosen": 0.29143521747891865, "rewards/margins": 2.468798123127957, "rewards/rejected": -2.1773629056490384, "step": 390 }, { "epoch": 0.418848167539267, "grad_norm": 42.524559020996094, "kl": 0.0, "learning_rate": 3.61579000349597e-07, "logits/chosen": -145115344.0, "logits/rejected": -138730784.0, "logps/chosen": -291.6719227828746, "logps/rejected": -286.7621056309904, "loss": 1.404, "rewards/chosen": 0.3901278061239727, "rewards/margins": 2.5747456987457777, "rewards/rejected": -2.184617892621805, "step": 400 }, { "epoch": 0.418848167539267, "eval_kl": 0.0, "eval_logits/chosen": -143785152.0, "eval_logits/rejected": -142386976.0, "eval_logps/chosen": -288.1983125, "eval_logps/rejected": -290.82553125, "eval_loss": 0.3773096799850464, "eval_rewards/chosen": -0.03424349975585937, "eval_rewards/margins": 2.3531070861816406, "eval_rewards/rejected": -2.3873505859375, "eval_runtime": 92.585, "eval_samples_per_second": 43.204, "eval_steps_per_second": 1.35, "step": 400 }, { "epoch": 0.4293193717277487, "grad_norm": 42.05149459838867, "kl": 0.0, "learning_rate": 3.5332421401344837e-07, "logits/chosen": -123967896.0, "logits/rejected": -151420832.0, "logps/chosen": -290.8375683922559, "logps/rejected": -284.5394497084548, "loss": 1.5018, "rewards/chosen": -0.1862513866488781, "rewards/margins": 2.4929647368820076, "rewards/rejected": -2.679216123530886, "step": 410 }, { "epoch": 0.4397905759162304, "grad_norm": 45.76744842529297, "kl": 0.0, "learning_rate": 3.4493124069924635e-07, "logits/chosen": -141672128.0, "logits/rejected": -143752144.0, "logps/chosen": -296.493825, "logps/rejected": -276.59630248091605, "loss": 1.489, "rewards/chosen": -0.059238671875, "rewards/margins": 2.682879686009065, "rewards/rejected": -2.742118357884065, "step": 420 }, { "epoch": 0.450261780104712, "grad_norm": 48.224552154541016, "kl": 0.0, "learning_rate": 3.3641130526488335e-07, "logits/chosen": -128212800.0, "logits/rejected": -138041008.0, "logps/chosen": -262.0190345368917, "logps/rejected": -302.6821539657854, "loss": 1.5488, "rewards/chosen": 0.1527433649898511, "rewards/margins": 2.138388427805753, "rewards/rejected": -1.985645062815902, "step": 430 }, { "epoch": 0.4607329842931937, "grad_norm": 38.78076171875, "kl": 0.0, "learning_rate": 3.2777580236883473e-07, "logits/chosen": -137593360.0, "logits/rejected": -143156048.0, "logps/chosen": -262.82073682108626, "logps/rejected": -282.15691896024464, "loss": 1.4863, "rewards/chosen": 0.48399031276520066, "rewards/margins": 2.222597083598534, "rewards/rejected": -1.7386067708333333, "step": 440 }, { "epoch": 0.4712041884816754, "grad_norm": 32.755828857421875, "kl": 0.0, "learning_rate": 3.1903628123081196e-07, "logits/chosen": -145392688.0, "logits/rejected": -133878232.0, "logps/chosen": -279.9309734083851, "logps/rejected": -278.34424135220127, "loss": 1.4144, "rewards/chosen": 0.4770013560419497, "rewards/margins": 2.7351111625106013, "rewards/rejected": -2.2581098064686516, "step": 450 }, { "epoch": 0.4816753926701571, "grad_norm": 42.82301330566406, "kl": 0.0, "learning_rate": 3.1020443018570556e-07, "logits/chosen": -127823832.0, "logits/rejected": -148714048.0, "logps/chosen": -277.72029728084414, "logps/rejected": -274.04221573795184, "loss": 1.4946, "rewards/chosen": 0.2742920664997844, "rewards/margins": 2.2432816111039826, "rewards/rejected": -1.968989544604198, "step": 460 }, { "epoch": 0.49214659685863876, "grad_norm": 35.25486755371094, "kl": 0.0, "learning_rate": 3.0129206105147343e-07, "logits/chosen": -128224592.0, "logits/rejected": -141323744.0, "logps/chosen": -297.7064896003263, "logps/rejected": -277.0822245127436, "loss": 1.5014, "rewards/chosen": -0.1534212246221197, "rewards/margins": 2.4885884590126253, "rewards/rejected": -2.642009683634745, "step": 470 }, { "epoch": 0.5026178010471204, "grad_norm": 49.871315002441406, "kl": 0.0, "learning_rate": 2.923110933318805e-07, "logits/chosen": -138666448.0, "logits/rejected": -125876032.0, "logps/chosen": -282.11912313432833, "logps/rejected": -271.0550204918033, "loss": 1.5235, "rewards/chosen": -0.26497638759328357, "rewards/margins": 2.5066780853831507, "rewards/rejected": -2.7716544729764343, "step": 480 }, { "epoch": 0.5130890052356021, "grad_norm": 39.03130340576172, "kl": 0.0, "learning_rate": 2.832735382752194e-07, "logits/chosen": -144244752.0, "logits/rejected": -139237664.0, "logps/chosen": -280.307546898928, "logps/rejected": -291.85860247208933, "loss": 1.5082, "rewards/chosen": -0.07937168746410796, "rewards/margins": 2.5424585728902582, "rewards/rejected": -2.621830260354366, "step": 490 }, { "epoch": 0.5235602094240838, "grad_norm": 31.400175094604492, "kl": 0.0, "learning_rate": 2.741914828103307e-07, "logits/chosen": -134795200.0, "logits/rejected": -140993584.0, "logps/chosen": -274.8821624803768, "logps/rejected": -273.449115474339, "loss": 1.4409, "rewards/chosen": 0.3371262003900118, "rewards/margins": 2.429343961677434, "rewards/rejected": -2.092217761287422, "step": 500 }, { "epoch": 0.5340314136125655, "grad_norm": 39.398651123046875, "kl": 0.0, "learning_rate": 2.650770733814065e-07, "logits/chosen": -139524336.0, "logits/rejected": -137561184.0, "logps/chosen": -281.1636513157895, "logps/rejected": -272.26749138591117, "loss": 1.4701, "rewards/chosen": 0.545446806547174, "rewards/margins": 2.443065145583353, "rewards/rejected": -1.8976183390361792, "step": 510 }, { "epoch": 0.5445026178010471, "grad_norm": 40.88848114013672, "kl": 0.0, "learning_rate": 2.55942499703198e-07, "logits/chosen": -147061424.0, "logits/rejected": -143406240.0, "logps/chosen": -285.4889, "logps/rejected": -274.23685591603055, "loss": 1.4519, "rewards/chosen": 0.4090586181640625, "rewards/margins": 2.4761641167327646, "rewards/rejected": -2.067105498568702, "step": 520 }, { "epoch": 0.5549738219895288, "grad_norm": 41.25908660888672, "kl": 0.0, "learning_rate": 2.467999784583527e-07, "logits/chosen": -131054160.0, "logits/rejected": -139775840.0, "logps/chosen": -270.7588608226837, "logps/rejected": -278.70869170489294, "loss": 1.4667, "rewards/chosen": 0.1376024556997866, "rewards/margins": 2.5865234959926955, "rewards/rejected": -2.448921040292909, "step": 530 }, { "epoch": 0.5654450261780105, "grad_norm": 58.745155334472656, "kl": 0.0, "learning_rate": 2.3766173695868388e-07, "logits/chosen": -139035088.0, "logits/rejected": -133750928.0, "logps/chosen": -290.9726024119449, "logps/rejected": -290.5950209330144, "loss": 1.5503, "rewards/chosen": -0.12228842205146918, "rewards/margins": 2.3166335900099346, "rewards/rejected": -2.4389220120614037, "step": 540 }, { "epoch": 0.5759162303664922, "grad_norm": 66.44160461425781, "kl": 0.0, "learning_rate": 2.285399967922253e-07, "logits/chosen": -140837504.0, "logits/rejected": -148332576.0, "logps/chosen": -269.52528454472844, "logps/rejected": -282.80800840978594, "loss": 1.4314, "rewards/chosen": -0.13683468998430637, "rewards/margins": 2.8514840770383425, "rewards/rejected": -2.988318767022649, "step": 550 }, { "epoch": 0.5863874345549738, "grad_norm": 34.65999221801758, "kl": 0.0, "learning_rate": 2.194469574779397e-07, "logits/chosen": -155893536.0, "logits/rejected": -136393088.0, "logps/chosen": -289.53893209408193, "logps/rejected": -284.4916465378422, "loss": 1.4864, "rewards/chosen": -0.09103836383732751, "rewards/margins": 2.7043548873452408, "rewards/rejected": -2.7953932511825683, "step": 560 }, { "epoch": 0.5968586387434555, "grad_norm": 50.38192367553711, "kl": 0.0, "learning_rate": 2.1039478014994441e-07, "logits/chosen": -140968768.0, "logits/rejected": -132994816.0, "logps/chosen": -269.5297433903577, "logps/rejected": -288.17197311616957, "loss": 1.4443, "rewards/chosen": 0.11533069758912082, "rewards/margins": 2.7949352204379236, "rewards/rejected": -2.679604522848803, "step": 570 }, { "epoch": 0.6073298429319371, "grad_norm": 39.10985565185547, "kl": 0.0, "learning_rate": 2.0139557129307149e-07, "logits/chosen": -141384624.0, "logits/rejected": -141585264.0, "logps/chosen": -298.5900179140127, "logps/rejected": -308.7128067484663, "loss": 1.4304, "rewards/chosen": -0.01027871393094397, "rewards/margins": 2.8328863970257276, "rewards/rejected": -2.843165110956672, "step": 580 }, { "epoch": 0.6178010471204188, "grad_norm": 48.67289733886719, "kl": 0.0, "learning_rate": 1.9246136655151808e-07, "logits/chosen": -145905728.0, "logits/rejected": -138221376.0, "logps/chosen": -293.7313226744186, "logps/rejected": -306.55437992125985, "loss": 1.4509, "rewards/chosen": -0.13455553868020229, "rewards/margins": 2.8649036994472583, "rewards/rejected": -2.9994592381274607, "step": 590 }, { "epoch": 0.6282722513089005, "grad_norm": 68.55772399902344, "kl": 0.0, "learning_rate": 1.8360411463223873e-07, "logits/chosen": -136852608.0, "logits/rejected": -143516144.0, "logps/chosen": -284.4403070349762, "logps/rejected": -294.88001632104454, "loss": 1.4253, "rewards/chosen": -0.10330413672806538, "rewards/margins": 2.9733640825379677, "rewards/rejected": -3.076668219266033, "step": 600 }, { "epoch": 0.6282722513089005, "eval_kl": 0.0, "eval_logits/chosen": -145117536.0, "eval_logits/rejected": -143700400.0, "eval_logps/chosen": -291.06696875, "eval_logps/rejected": -298.3589375, "eval_loss": 0.36837950348854065, "eval_rewards/chosen": -0.32111080932617186, "eval_rewards/margins": 2.819581085205078, "eval_rewards/rejected": -3.14069189453125, "eval_runtime": 92.5853, "eval_samples_per_second": 43.203, "eval_steps_per_second": 1.35, "step": 600 }, { "epoch": 0.6387434554973822, "grad_norm": 46.867210388183594, "kl": 0.0, "learning_rate": 1.7483566132460865e-07, "logits/chosen": -136255600.0, "logits/rejected": -144252464.0, "logps/chosen": -299.9070411392405, "logps/rejected": -282.2795138888889, "loss": 1.4918, "rewards/chosen": -0.4866646392435967, "rewards/margins": 2.6991346651063264, "rewards/rejected": -3.185799304349923, "step": 610 }, { "epoch": 0.6492146596858639, "grad_norm": 66.88858032226562, "kl": 0.0, "learning_rate": 1.66167733657731e-07, "logits/chosen": -140277344.0, "logits/rejected": -142182944.0, "logps/chosen": -301.9623953349282, "logps/rejected": -294.2529192189893, "loss": 1.4825, "rewards/chosen": -0.35411040140301037, "rewards/margins": 2.692060965126469, "rewards/rejected": -3.0461713665294794, "step": 620 }, { "epoch": 0.6596858638743456, "grad_norm": 62.48853302001953, "kl": 0.0, "learning_rate": 1.5761192421657456e-07, "logits/chosen": -133893392.0, "logits/rejected": -143692496.0, "logps/chosen": -292.95065395367413, "logps/rejected": -299.167311735474, "loss": 1.4055, "rewards/chosen": 0.11951812159139127, "rewards/margins": 2.9790377767782226, "rewards/rejected": -2.859519655186831, "step": 630 }, { "epoch": 0.6701570680628273, "grad_norm": 38.459293365478516, "kl": 0.0, "learning_rate": 1.491796756379185e-07, "logits/chosen": -148631472.0, "logits/rejected": -137124976.0, "logps/chosen": -307.80620335820896, "logps/rejected": -284.41946721311473, "loss": 1.4798, "rewards/chosen": 0.12505948650303172, "rewards/margins": 2.7576216423047737, "rewards/rejected": -2.632562155801742, "step": 640 }, { "epoch": 0.680628272251309, "grad_norm": 51.62284469604492, "kl": 0.0, "learning_rate": 1.4088226530684071e-07, "logits/chosen": -145016352.0, "logits/rejected": -135913600.0, "logps/chosen": -293.0742607526882, "logps/rejected": -286.65267289348174, "loss": 1.4106, "rewards/chosen": 0.37428661059307794, "rewards/margins": 2.8034505085213373, "rewards/rejected": -2.4291638979282593, "step": 650 }, { "epoch": 0.6910994764397905, "grad_norm": 44.217506408691406, "kl": 0.0, "learning_rate": 1.327307902742142e-07, "logits/chosen": -153775056.0, "logits/rejected": -142987488.0, "logps/chosen": -278.56211538461537, "logps/rejected": -295.2840277777778, "loss": 1.4113, "rewards/chosen": 0.3199942486102764, "rewards/margins": 3.0220053085681133, "rewards/rejected": -2.702011059957837, "step": 660 }, { "epoch": 0.7015706806282722, "grad_norm": 52.56444549560547, "kl": 0.0, "learning_rate": 1.2473615241538523e-07, "logits/chosen": -138428624.0, "logits/rejected": -125599760.0, "logps/chosen": -270.62024962742174, "logps/rejected": -297.9065578817734, "loss": 1.5102, "rewards/chosen": 0.21654559389844916, "rewards/margins": 2.3964325355420244, "rewards/rejected": -2.1798869416435753, "step": 670 }, { "epoch": 0.7120418848167539, "grad_norm": 38.48976516723633, "kl": 0.0, "learning_rate": 1.169090438498816e-07, "logits/chosen": -140096608.0, "logits/rejected": -141314656.0, "logps/chosen": -289.16740023474176, "logps/rejected": -289.05847796411854, "loss": 1.4, "rewards/chosen": 0.4871681464110182, "rewards/margins": 2.8861619137789, "rewards/rejected": -2.398993767367882, "step": 680 }, { "epoch": 0.7225130890052356, "grad_norm": 56.998104095458984, "kl": 0.0, "learning_rate": 1.0925993264165045e-07, "logits/chosen": -136509200.0, "logits/rejected": -140580992.0, "logps/chosen": -284.6041084265176, "logps/rejected": -296.2347333715596, "loss": 1.4483, "rewards/chosen": 0.1843722529304675, "rewards/margins": 2.723567089094936, "rewards/rejected": -2.5391948361644685, "step": 690 }, { "epoch": 0.7329842931937173, "grad_norm": 45.59560012817383, "kl": 0.0, "learning_rate": 1.0179904879894998e-07, "logits/chosen": -139792672.0, "logits/rejected": -133128280.0, "logps/chosen": -282.0086206896552, "logps/rejected": -297.9873685747664, "loss": 1.4197, "rewards/chosen": 0.06277323516558704, "rewards/margins": 3.0981276117475574, "rewards/rejected": -3.0353543765819704, "step": 700 }, { "epoch": 0.743455497382199, "grad_norm": 48.215816497802734, "kl": 0.0, "learning_rate": 9.453637059262117e-08, "logits/chosen": -127794064.0, "logits/rejected": -130284464.0, "logps/chosen": -276.90582061068704, "logps/rejected": -275.3603, "loss": 1.5249, "rewards/chosen": -0.14867283478947996, "rewards/margins": 2.47092736052302, "rewards/rejected": -2.6196001953125, "step": 710 }, { "epoch": 0.7539267015706806, "grad_norm": 61.68962097167969, "kl": 0.0, "learning_rate": 8.748161121103406e-08, "logits/chosen": -140951328.0, "logits/rejected": -141405104.0, "logps/chosen": -288.2717027559055, "logps/rejected": -306.3112403100775, "loss": 1.3695, "rewards/chosen": 0.361008819820374, "rewards/margins": 3.0993951070660133, "rewards/rejected": -2.7383862872456395, "step": 720 }, { "epoch": 0.7643979057591623, "grad_norm": 43.57563400268555, "kl": 0.0, "learning_rate": 8.064420576955965e-08, "logits/chosen": -144350032.0, "logits/rejected": -144956128.0, "logps/chosen": -289.1480224609375, "logps/rejected": -297.3230224609375, "loss": 1.4858, "rewards/chosen": 0.04629603624343872, "rewards/margins": 2.6986050724983217, "rewards/rejected": -2.652309036254883, "step": 730 }, { "epoch": 0.774869109947644, "grad_norm": 47.147090911865234, "kl": 0.0, "learning_rate": 7.403329869193922e-08, "logits/chosen": -135583312.0, "logits/rejected": -131832256.0, "logps/chosen": -277.6656105990783, "logps/rejected": -275.17182531796504, "loss": 1.3656, "rewards/chosen": 0.1653434061967466, "rewards/margins": 3.2219976161281854, "rewards/rejected": -3.056654209931439, "step": 740 }, { "epoch": 0.7853403141361257, "grad_norm": 44.07842254638672, "kl": 0.0, "learning_rate": 6.765773148042858e-08, "logits/chosen": -143625632.0, "logits/rejected": -132837352.0, "logps/chosen": -285.6507056451613, "logps/rejected": -281.92257054848966, "loss": 1.4614, "rewards/chosen": 0.21915514311665946, "rewards/margins": 2.62252863091972, "rewards/rejected": -2.4033734878030604, "step": 750 }, { "epoch": 0.7958115183246073, "grad_norm": 37.42493438720703, "kl": 0.0, "learning_rate": 6.152603089107139e-08, "logits/chosen": -136608224.0, "logits/rejected": -131950376.0, "logps/chosen": -276.34929128614914, "logps/rejected": -277.7825792536116, "loss": 1.4942, "rewards/chosen": 0.24006759666779634, "rewards/margins": 2.4807506731374995, "rewards/rejected": -2.240683076469703, "step": 760 }, { "epoch": 0.806282722513089, "grad_norm": 68.92852020263672, "kl": 0.0, "learning_rate": 5.5646397529920175e-08, "logits/chosen": -132386256.0, "logits/rejected": -138109456.0, "logps/chosen": -303.72342011128774, "logps/rejected": -284.04025057603684, "loss": 1.3831, "rewards/chosen": 0.32797061695772056, "rewards/margins": 2.99956472398057, "rewards/rejected": -2.6715941070228495, "step": 770 }, { "epoch": 0.8167539267015707, "grad_norm": 36.25162124633789, "kl": 0.0, "learning_rate": 5.002669488545111e-08, "logits/chosen": -126804304.0, "logits/rejected": -149925328.0, "logps/chosen": -280.9178725369458, "logps/rejected": -298.3454033159463, "loss": 1.4707, "rewards/chosen": 0.24425755893851345, "rewards/margins": 2.512562284278491, "rewards/rejected": -2.2683047253399775, "step": 780 }, { "epoch": 0.8272251308900523, "grad_norm": 45.3673095703125, "kl": 0.0, "learning_rate": 4.467443881184646e-08, "logits/chosen": -137377824.0, "logits/rejected": -140888416.0, "logps/chosen": -284.73014937106916, "logps/rejected": -271.3363742236025, "loss": 1.4641, "rewards/chosen": 0.15725456093842127, "rewards/margins": 2.4592872211917207, "rewards/rejected": -2.3020326602532997, "step": 790 }, { "epoch": 0.837696335078534, "grad_norm": 44.92776870727539, "kl": 0.0, "learning_rate": 3.959678747720488e-08, "logits/chosen": -147636928.0, "logits/rejected": -129594688.0, "logps/chosen": -278.11655092592594, "logps/rejected": -284.1502840909091, "loss": 1.4432, "rewards/chosen": 0.22765783239293982, "rewards/margins": 2.823709614417733, "rewards/rejected": -2.5960517820247935, "step": 800 }, { "epoch": 0.837696335078534, "eval_kl": 0.0, "eval_logits/chosen": -140467840.0, "eval_logits/rejected": -139209600.0, "eval_logps/chosen": -286.2336875, "eval_logps/rejected": -292.39625, "eval_loss": 0.3657679557800293, "eval_rewards/chosen": 0.16221832275390624, "eval_rewards/margins": 2.7066421508789062, "eval_rewards/rejected": -2.544423828125, "eval_runtime": 92.5899, "eval_samples_per_second": 43.201, "eval_steps_per_second": 1.35, "step": 800 }, { "epoch": 0.8481675392670157, "grad_norm": 64.69525909423828, "kl": 0.0, "learning_rate": 3.480053179012654e-08, "logits/chosen": -129839872.0, "logits/rejected": -140454848.0, "logps/chosen": -266.87487579491255, "logps/rejected": -288.485599078341, "loss": 1.5392, "rewards/chosen": -0.023006766702867273, "rewards/margins": 2.3186599749686, "rewards/rejected": -2.341666741671467, "step": 810 }, { "epoch": 0.8586387434554974, "grad_norm": 63.29912185668945, "kl": 0.0, "learning_rate": 3.029208631747446e-08, "logits/chosen": -138798032.0, "logits/rejected": -130695232.0, "logps/chosen": -273.63010448619633, "logps/rejected": -289.54072949840764, "loss": 1.3905, "rewards/chosen": 0.2373036811688195, "rewards/margins": 3.154649639319695, "rewards/rejected": -2.9173459581508756, "step": 820 }, { "epoch": 0.8691099476439791, "grad_norm": 35.773826599121094, "kl": 0.0, "learning_rate": 2.607748070546037e-08, "logits/chosen": -138341072.0, "logits/rejected": -140245856.0, "logps/chosen": -276.33953568611986, "logps/rejected": -295.26535893962847, "loss": 1.4761, "rewards/chosen": 0.13098442968133872, "rewards/margins": 2.7483675067300037, "rewards/rejected": -2.617383077048665, "step": 830 }, { "epoch": 0.8795811518324608, "grad_norm": 36.50845718383789, "kl": 0.0, "learning_rate": 2.2162351615526544e-08, "logits/chosen": -140731280.0, "logits/rejected": -148560064.0, "logps/chosen": -301.4704117063492, "logps/rejected": -290.1577644230769, "loss": 1.4465, "rewards/chosen": 0.18666185651506698, "rewards/margins": 2.7894090019477593, "rewards/rejected": -2.6027471454326925, "step": 840 }, { "epoch": 0.8900523560209425, "grad_norm": 55.24102020263672, "kl": 0.0, "learning_rate": 1.8551935185811717e-08, "logits/chosen": -132794856.0, "logits/rejected": -138046480.0, "logps/chosen": -282.9409226190476, "logps/rejected": -302.71471153846153, "loss": 1.4063, "rewards/chosen": 0.10835386003766741, "rewards/margins": 3.033290759076129, "rewards/rejected": -2.9249368990384617, "step": 850 }, { "epoch": 0.900523560209424, "grad_norm": 70.16515350341797, "kl": 0.0, "learning_rate": 1.5251060028279612e-08, "logits/chosen": -143098928.0, "logits/rejected": -126897944.0, "logps/chosen": -272.46225367078824, "logps/rejected": -305.9686759478673, "loss": 1.5084, "rewards/chosen": 0.03112101370621317, "rewards/margins": 2.5436736215754867, "rewards/rejected": -2.5125526078692735, "step": 860 }, { "epoch": 0.9109947643979057, "grad_norm": 28.95891761779785, "kl": 0.0, "learning_rate": 1.2264140770878839e-08, "logits/chosen": -137280736.0, "logits/rejected": -143405088.0, "logps/chosen": -299.961469889065, "logps/rejected": -295.115875385208, "loss": 1.4681, "rewards/chosen": -0.01648792260800224, "rewards/margins": 2.59923304083335, "rewards/rejected": -2.615720963441352, "step": 870 }, { "epoch": 0.9214659685863874, "grad_norm": 37.45644760131836, "kl": 0.0, "learning_rate": 9.59517215336922e-09, "logits/chosen": -128477976.0, "logits/rejected": -129677584.0, "logps/chosen": -280.0816461267606, "logps/rejected": -291.42394695787834, "loss": 1.4164, "rewards/chosen": -0.018490225682982444, "rewards/margins": 3.1044779987100557, "rewards/rejected": -3.122968224393038, "step": 880 }, { "epoch": 0.9319371727748691, "grad_norm": 43.50038528442383, "kl": 0.0, "learning_rate": 7.247723684711382e-09, "logits/chosen": -137378768.0, "logits/rejected": -126123072.0, "logps/chosen": -271.5022151295732, "logps/rejected": -294.2158453525641, "loss": 1.4426, "rewards/chosen": 0.19363278877444384, "rewards/margins": 2.807892557529452, "rewards/rejected": -2.614259768755008, "step": 890 }, { "epoch": 0.9424083769633508, "grad_norm": 58.68267059326172, "kl": 0.0, "learning_rate": 5.224934869164976e-09, "logits/chosen": -139006912.0, "logits/rejected": -141954272.0, "logps/chosen": -292.2946211507293, "logps/rejected": -301.43368212669685, "loss": 1.4844, "rewards/chosen": -0.05539148785113515, "rewards/margins": 2.60611269089368, "rewards/rejected": -2.6615041787448153, "step": 900 }, { "epoch": 0.9528795811518325, "grad_norm": 51.242652893066406, "kl": 0.0, "learning_rate": 3.529511007479946e-09, "logits/chosen": -140018880.0, "logits/rejected": -134809904.0, "logps/chosen": -292.4076660906298, "logps/rejected": -281.15220091414943, "loss": 1.4564, "rewards/chosen": 0.1075638450235815, "rewards/margins": 2.6941378960967137, "rewards/rejected": -2.586574051073132, "step": 910 }, { "epoch": 0.9633507853403142, "grad_norm": 60.67096710205078, "kl": 0.0, "learning_rate": 2.1637195787966857e-09, "logits/chosen": -132927744.0, "logits/rejected": -145037952.0, "logps/chosen": -294.478180176565, "logps/rejected": -286.2923801369863, "loss": 1.4298, "rewards/chosen": 0.23743079906481993, "rewards/margins": 2.895192835903671, "rewards/rejected": -2.657762036838851, "step": 920 }, { "epoch": 0.9738219895287958, "grad_norm": 44.87836837768555, "kl": 0.0, "learning_rate": 1.1293872080934963e-09, "logits/chosen": -125942888.0, "logits/rejected": -145652336.0, "logps/chosen": -283.0784801136364, "logps/rejected": -296.9692206325301, "loss": 1.4259, "rewards/chosen": 0.11567323858087714, "rewards/margins": 2.9895860038999658, "rewards/rejected": -2.873912765319089, "step": 930 }, { "epoch": 0.9842931937172775, "grad_norm": 43.4354362487793, "kl": 0.0, "learning_rate": 4.2789722323760546e-10, "logits/chosen": -142621872.0, "logits/rejected": -136877328.0, "logps/chosen": -288.74076066616766, "logps/rejected": -292.02558210784315, "loss": 1.4129, "rewards/chosen": 0.27186810613392354, "rewards/margins": 3.1179713504915623, "rewards/rejected": -2.846103244357639, "step": 940 }, { "epoch": 0.9947643979057592, "grad_norm": 49.643978118896484, "kl": 0.0, "learning_rate": 6.018780490690822e-11, "logits/chosen": -147837520.0, "logits/rejected": -131119704.0, "logps/chosen": -285.26368371212124, "logps/rejected": -282.18916330645163, "loss": 1.4033, "rewards/chosen": 0.14758417534105706, "rewards/margins": 3.1335240379922666, "rewards/rejected": -2.9859398626512097, "step": 950 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, "train_loss": 1.5426912418834826, "train_runtime": 5367.6535, "train_samples_per_second": 22.779, "train_steps_per_second": 0.178 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }