{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004694835680751174, "grad_norm": 312.0256219422113, "learning_rate": 2.1126760563380282e-08, "logits/chosen": -3.2750000953674316, "logits/rejected": -3.1578125953674316, "logps/chosen": -520.0, "logps/rejected": -623.4000244140625, "loss": 2.0975, "rewards/accuracies": 0.171875, "rewards/chosen": 0.06386718899011612, "rewards/margins": 0.14946289360523224, "rewards/rejected": -0.08598633110523224, "step": 10 }, { "epoch": 0.009389671361502348, "grad_norm": 357.83885964439077, "learning_rate": 4.460093896713615e-08, "logits/chosen": -3.176562547683716, "logits/rejected": -3.1796875, "logps/chosen": -553.0, "logps/rejected": -617.2000122070312, "loss": 2.6057, "rewards/accuracies": 0.23828125, "rewards/chosen": 0.19448241591453552, "rewards/margins": 0.205078125, "rewards/rejected": -0.01123046875, "step": 20 }, { "epoch": 0.014084507042253521, "grad_norm": 292.5595630275396, "learning_rate": 6.807511737089202e-08, "logits/chosen": -3.167187452316284, "logits/rejected": -3.090625047683716, "logps/chosen": -585.0, "logps/rejected": -682.7999877929688, "loss": 2.6069, "rewards/accuracies": 0.24921874701976776, "rewards/chosen": 0.27446287870407104, "rewards/margins": 0.12026367336511612, "rewards/rejected": 0.1552734375, "step": 30 }, { "epoch": 0.018779342723004695, "grad_norm": 1477.179885439986, "learning_rate": 9.154929577464789e-08, "logits/chosen": -3.2281250953674316, "logits/rejected": -3.1703124046325684, "logps/chosen": -573.7999877929688, "logps/rejected": -706.0, "loss": 3.0694, "rewards/accuracies": 0.24687500298023224, "rewards/chosen": 0.17978516221046448, "rewards/margins": 0.2806640565395355, "rewards/rejected": -0.10068359225988388, "step": 40 }, { "epoch": 0.023474178403755867, "grad_norm": 361.20032694705145, "learning_rate": 1.1502347417840374e-07, "logits/chosen": -3.1484375, "logits/rejected": -3.1781249046325684, "logps/chosen": -554.2000122070312, "logps/rejected": -629.2000122070312, "loss": 3.243, "rewards/accuracies": 0.2421875, "rewards/chosen": -0.10427246242761612, "rewards/margins": -0.28144532442092896, "rewards/rejected": 0.17734375596046448, "step": 50 }, { "epoch": 0.028169014084507043, "grad_norm": 323.77333516562635, "learning_rate": 1.384976525821596e-07, "logits/chosen": -3.215625047683716, "logits/rejected": -3.176562547683716, "logps/chosen": -569.0, "logps/rejected": -684.0, "loss": 2.4599, "rewards/accuracies": 0.23828125, "rewards/chosen": 0.3821777403354645, "rewards/margins": 0.4833007752895355, "rewards/rejected": -0.10146484524011612, "step": 60 }, { "epoch": 0.03286384976525822, "grad_norm": 276.7746357585784, "learning_rate": 1.619718309859155e-07, "logits/chosen": -3.104687452316284, "logits/rejected": -3.0875000953674316, "logps/chosen": -550.2000122070312, "logps/rejected": -621.2000122070312, "loss": 2.7151, "rewards/accuracies": 0.25703126192092896, "rewards/chosen": 0.08867187798023224, "rewards/margins": 0.12685546278953552, "rewards/rejected": -0.03798828274011612, "step": 70 }, { "epoch": 0.03755868544600939, "grad_norm": 315.8250479371314, "learning_rate": 1.8544600938967138e-07, "logits/chosen": -3.2640624046325684, "logits/rejected": -3.2421875, "logps/chosen": -550.5999755859375, "logps/rejected": -655.2000122070312, "loss": 2.6591, "rewards/accuracies": 0.23749999701976776, "rewards/chosen": 0.11894531548023224, "rewards/margins": 0.276123046875, "rewards/rejected": -0.15708008408546448, "step": 80 }, { "epoch": 0.04225352112676056, "grad_norm": 277.4297891268117, "learning_rate": 2.089201877934272e-07, "logits/chosen": -3.2906250953674316, "logits/rejected": -3.262500047683716, "logps/chosen": -527.7999877929688, "logps/rejected": -603.4000244140625, "loss": 2.9056, "rewards/accuracies": 0.2578125, "rewards/chosen": 0.42170411348342896, "rewards/margins": -0.03627929836511612, "rewards/rejected": 0.4574218690395355, "step": 90 }, { "epoch": 0.046948356807511735, "grad_norm": 662.6082353712405, "learning_rate": 2.323943661971831e-07, "logits/chosen": -3.1734375953674316, "logits/rejected": -3.192187547683716, "logps/chosen": -574.0, "logps/rejected": -603.7999877929688, "loss": 2.8654, "rewards/accuracies": 0.25078123807907104, "rewards/chosen": 0.13359375298023224, "rewards/margins": 0.18437500298023224, "rewards/rejected": -0.05058593675494194, "step": 100 }, { "epoch": 0.051643192488262914, "grad_norm": 300.25041205050195, "learning_rate": 2.5586854460093895e-07, "logits/chosen": -3.237499952316284, "logits/rejected": -3.1859374046325684, "logps/chosen": -550.5999755859375, "logps/rejected": -657.5999755859375, "loss": 2.8981, "rewards/accuracies": 0.23828125, "rewards/chosen": -0.21835938096046448, "rewards/margins": -0.20292969048023224, "rewards/rejected": -0.014208984561264515, "step": 110 }, { "epoch": 0.056338028169014086, "grad_norm": 254.4772537543913, "learning_rate": 2.7934272300469483e-07, "logits/chosen": -3.2359375953674316, "logits/rejected": -3.171875, "logps/chosen": -561.0, "logps/rejected": -649.2000122070312, "loss": 2.9676, "rewards/accuracies": 0.2710937559604645, "rewards/chosen": 0.18388672173023224, "rewards/margins": 0.2694335877895355, "rewards/rejected": -0.08457031100988388, "step": 120 }, { "epoch": 0.06103286384976526, "grad_norm": 359.56965403329684, "learning_rate": 3.0281690140845066e-07, "logits/chosen": -3.153125047683716, "logits/rejected": -3.1484375, "logps/chosen": -576.0, "logps/rejected": -665.5999755859375, "loss": 2.4338, "rewards/accuracies": 0.28203123807907104, "rewards/chosen": 0.49860841035842896, "rewards/margins": 1.219873070716858, "rewards/rejected": -0.722363293170929, "step": 130 }, { "epoch": 0.06572769953051644, "grad_norm": 353.8273960746246, "learning_rate": 3.2629107981220654e-07, "logits/chosen": -3.151562452316284, "logits/rejected": -3.1484375, "logps/chosen": -564.4000244140625, "logps/rejected": -689.2000122070312, "loss": 3.216, "rewards/accuracies": 0.24296875298023224, "rewards/chosen": 0.553417980670929, "rewards/margins": -0.5686279535293579, "rewards/rejected": 1.121679663658142, "step": 140 }, { "epoch": 0.07042253521126761, "grad_norm": 297.09345656551403, "learning_rate": 3.497652582159624e-07, "logits/chosen": -3.2109375, "logits/rejected": -3.229687452316284, "logps/chosen": -560.4000244140625, "logps/rejected": -666.4000244140625, "loss": 2.6835, "rewards/accuracies": 0.2640624940395355, "rewards/chosen": 0.43452149629592896, "rewards/margins": 0.678906261920929, "rewards/rejected": -0.24501952528953552, "step": 150 }, { "epoch": 0.07511737089201878, "grad_norm": 324.5518620175009, "learning_rate": 3.732394366197183e-07, "logits/chosen": -3.2109375, "logits/rejected": -3.1703124046325684, "logps/chosen": -574.5999755859375, "logps/rejected": -665.0, "loss": 2.5045, "rewards/accuracies": 0.27734375, "rewards/chosen": 0.5849609375, "rewards/margins": 0.823046863079071, "rewards/rejected": -0.23779296875, "step": 160 }, { "epoch": 0.07981220657276995, "grad_norm": 370.79323789587045, "learning_rate": 3.967136150234742e-07, "logits/chosen": -3.2203125953674316, "logits/rejected": -3.1421875953674316, "logps/chosen": -528.7999877929688, "logps/rejected": -648.4000244140625, "loss": 2.826, "rewards/accuracies": 0.265625, "rewards/chosen": 0.32744139432907104, "rewards/margins": 0.3781494200229645, "rewards/rejected": -0.04990234225988388, "step": 170 }, { "epoch": 0.08450704225352113, "grad_norm": 493.3277569235484, "learning_rate": 4.2018779342723e-07, "logits/chosen": -3.167187452316284, "logits/rejected": -3.112499952316284, "logps/chosen": -517.5999755859375, "logps/rejected": -644.4000244140625, "loss": 2.4864, "rewards/accuracies": 0.2867187559604645, "rewards/chosen": 0.49970704317092896, "rewards/margins": 0.4146484434604645, "rewards/rejected": 0.085205078125, "step": 180 }, { "epoch": 0.0892018779342723, "grad_norm": 285.4873436657188, "learning_rate": 4.436619718309859e-07, "logits/chosen": -3.171875, "logits/rejected": -3.1578125953674316, "logps/chosen": -553.0, "logps/rejected": -625.2000122070312, "loss": 2.5783, "rewards/accuracies": 0.30156248807907104, "rewards/chosen": 1.051855444908142, "rewards/margins": 1.269140601158142, "rewards/rejected": -0.21464844048023224, "step": 190 }, { "epoch": 0.09389671361502347, "grad_norm": 349.67590705902006, "learning_rate": 4.671361502347418e-07, "logits/chosen": -3.159374952316284, "logits/rejected": -3.1468749046325684, "logps/chosen": -541.7999877929688, "logps/rejected": -632.5999755859375, "loss": 2.3192, "rewards/accuracies": 0.31171876192092896, "rewards/chosen": 1.356347680091858, "rewards/margins": 1.372656226158142, "rewards/rejected": -0.0159912109375, "step": 200 }, { "epoch": 0.09859154929577464, "grad_norm": 302.1865630833012, "learning_rate": 4.906103286384976e-07, "logits/chosen": -3.192187547683716, "logits/rejected": -3.151562452316284, "logps/chosen": -555.2000122070312, "logps/rejected": -669.5999755859375, "loss": 2.5959, "rewards/accuracies": 0.31640625, "rewards/chosen": 1.054296851158142, "rewards/margins": 1.321874976158142, "rewards/rejected": -0.26777344942092896, "step": 210 }, { "epoch": 0.10328638497652583, "grad_norm": 300.72724401497436, "learning_rate": 4.984350547730829e-07, "logits/chosen": -3.268749952316284, "logits/rejected": -3.1937499046325684, "logps/chosen": -569.5999755859375, "logps/rejected": -648.0, "loss": 2.2734, "rewards/accuracies": 0.33671873807907104, "rewards/chosen": 1.7156250476837158, "rewards/margins": 1.9314453601837158, "rewards/rejected": -0.21503905951976776, "step": 220 }, { "epoch": 0.107981220657277, "grad_norm": 1102.3813264661942, "learning_rate": 4.958268127282212e-07, "logits/chosen": -3.200000047683716, "logits/rejected": -3.1703124046325684, "logps/chosen": -556.5999755859375, "logps/rejected": -673.5999755859375, "loss": 2.4886, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": 2.0492186546325684, "rewards/margins": 1.604589819908142, "rewards/rejected": 0.4451660215854645, "step": 230 }, { "epoch": 0.11267605633802817, "grad_norm": 282.37733202209813, "learning_rate": 4.932185706833594e-07, "logits/chosen": -3.1500000953674316, "logits/rejected": -3.1812500953674316, "logps/chosen": -513.4000244140625, "logps/rejected": -563.0, "loss": 2.3018, "rewards/accuracies": 0.36406248807907104, "rewards/chosen": 2.0999999046325684, "rewards/margins": 2.255859375, "rewards/rejected": -0.15898437798023224, "step": 240 }, { "epoch": 0.11737089201877934, "grad_norm": 279.1099151963578, "learning_rate": 4.906103286384976e-07, "logits/chosen": -3.207812547683716, "logits/rejected": -3.1656250953674316, "logps/chosen": -552.4000244140625, "logps/rejected": -684.0, "loss": 2.4892, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": 1.790624976158142, "rewards/margins": 2.741015672683716, "rewards/rejected": -0.951904296875, "step": 250 }, { "epoch": 0.12206572769953052, "grad_norm": 287.6263722929828, "learning_rate": 4.880020865936358e-07, "logits/chosen": -3.167187452316284, "logits/rejected": -3.1812500953674316, "logps/chosen": -516.7999877929688, "logps/rejected": -623.7999877929688, "loss": 2.6498, "rewards/accuracies": 0.3671875, "rewards/chosen": 2.3648438453674316, "rewards/margins": 2.208203077316284, "rewards/rejected": 0.15483398735523224, "step": 260 }, { "epoch": 0.1267605633802817, "grad_norm": 343.03300828691033, "learning_rate": 4.853938445487741e-07, "logits/chosen": -3.096874952316284, "logits/rejected": -3.0718750953674316, "logps/chosen": -554.0, "logps/rejected": -658.7999877929688, "loss": 2.7283, "rewards/accuracies": 0.3851562440395355, "rewards/chosen": 2.793750047683716, "rewards/margins": 2.6507811546325684, "rewards/rejected": 0.14072266221046448, "step": 270 }, { "epoch": 0.13145539906103287, "grad_norm": 260.13644441380166, "learning_rate": 4.827856025039123e-07, "logits/chosen": -3.1781249046325684, "logits/rejected": -3.174999952316284, "logps/chosen": -553.4000244140625, "logps/rejected": -664.0, "loss": 2.5984, "rewards/accuracies": 0.38671875, "rewards/chosen": 3.018749952316284, "rewards/margins": 3.389843702316284, "rewards/rejected": -0.3746093809604645, "step": 280 }, { "epoch": 0.13615023474178403, "grad_norm": 231.6706812957532, "learning_rate": 4.801773604590506e-07, "logits/chosen": -3.1390624046325684, "logits/rejected": -3.1703124046325684, "logps/chosen": -599.7999877929688, "logps/rejected": -660.4000244140625, "loss": 2.7207, "rewards/accuracies": 0.4039062559604645, "rewards/chosen": 2.717968702316284, "rewards/margins": 3.4097657203674316, "rewards/rejected": -0.693408191204071, "step": 290 }, { "epoch": 0.14084507042253522, "grad_norm": 220.97715297066875, "learning_rate": 4.775691184141888e-07, "logits/chosen": -3.1578125953674316, "logits/rejected": -3.182812452316284, "logps/chosen": -572.5999755859375, "logps/rejected": -656.5999755859375, "loss": 2.8978, "rewards/accuracies": 0.39765626192092896, "rewards/chosen": 2.4898438453674316, "rewards/margins": 4.083276271820068, "rewards/rejected": -1.5941894054412842, "step": 300 }, { "epoch": 0.14553990610328638, "grad_norm": 302.8311801574877, "learning_rate": 4.749608763693271e-07, "logits/chosen": -3.245312452316284, "logits/rejected": -3.2015624046325684, "logps/chosen": -537.7999877929688, "logps/rejected": -635.2000122070312, "loss": 2.9098, "rewards/accuracies": 0.4195312559604645, "rewards/chosen": 3.0171875953674316, "rewards/margins": 2.634570360183716, "rewards/rejected": 0.38300782442092896, "step": 310 }, { "epoch": 0.15023474178403756, "grad_norm": 405.1955863726396, "learning_rate": 4.7235263432446533e-07, "logits/chosen": -3.2203125953674316, "logits/rejected": -3.137500047683716, "logps/chosen": -545.2000122070312, "logps/rejected": -634.2000122070312, "loss": 2.7902, "rewards/accuracies": 0.41718751192092896, "rewards/chosen": 3.2046875953674316, "rewards/margins": 3.9234375953674316, "rewards/rejected": -0.721484363079071, "step": 320 }, { "epoch": 0.15492957746478872, "grad_norm": 244.65396594212294, "learning_rate": 4.6974439227960353e-07, "logits/chosen": -3.1875, "logits/rejected": -3.1265625953674316, "logps/chosen": -564.7999877929688, "logps/rejected": -665.2000122070312, "loss": 3.1965, "rewards/accuracies": 0.4242187440395355, "rewards/chosen": 3.067578077316284, "rewards/margins": 4.0234375, "rewards/rejected": -0.956298828125, "step": 330 }, { "epoch": 0.1596244131455399, "grad_norm": 372.5758889280196, "learning_rate": 4.671361502347418e-07, "logits/chosen": -3.2281250953674316, "logits/rejected": -3.2171874046325684, "logps/chosen": -540.4000244140625, "logps/rejected": -591.0, "loss": 3.0711, "rewards/accuracies": 0.41718751192092896, "rewards/chosen": 3.635937452316284, "rewards/margins": 2.765625, "rewards/rejected": 0.8724609613418579, "step": 340 }, { "epoch": 0.1643192488262911, "grad_norm": 268.8524534862668, "learning_rate": 4.6452790818988004e-07, "logits/chosen": -3.1890625953674316, "logits/rejected": -3.176562547683716, "logps/chosen": -575.2000122070312, "logps/rejected": -662.4000244140625, "loss": 3.209, "rewards/accuracies": 0.4242187440395355, "rewards/chosen": 3.598437547683716, "rewards/margins": 3.905468702316284, "rewards/rejected": -0.3086914122104645, "step": 350 }, { "epoch": 0.16901408450704225, "grad_norm": 331.4831794798673, "learning_rate": 4.6191966614501824e-07, "logits/chosen": -3.1781249046325684, "logits/rejected": -3.1546874046325684, "logps/chosen": -544.0, "logps/rejected": -637.4000244140625, "loss": 2.7959, "rewards/accuracies": 0.40625, "rewards/chosen": 3.909374952316284, "rewards/margins": 3.6156249046325684, "rewards/rejected": 0.29326170682907104, "step": 360 }, { "epoch": 0.17370892018779344, "grad_norm": 300.48013396150066, "learning_rate": 4.593114241001565e-07, "logits/chosen": -3.1265625953674316, "logits/rejected": -3.1500000953674316, "logps/chosen": -586.5999755859375, "logps/rejected": -667.5999755859375, "loss": 2.9779, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 4.057812690734863, "rewards/margins": 4.220703125, "rewards/rejected": -0.16093750298023224, "step": 370 }, { "epoch": 0.1784037558685446, "grad_norm": 259.22801416212894, "learning_rate": 4.5670318205529474e-07, "logits/chosen": -3.1796875, "logits/rejected": -3.1234374046325684, "logps/chosen": -552.4000244140625, "logps/rejected": -658.7999877929688, "loss": 2.6823, "rewards/accuracies": 0.4078125059604645, "rewards/chosen": 3.3335938453674316, "rewards/margins": 4.030468940734863, "rewards/rejected": -0.696972668170929, "step": 380 }, { "epoch": 0.18309859154929578, "grad_norm": 314.2218817447742, "learning_rate": 4.54094940010433e-07, "logits/chosen": -3.2015624046325684, "logits/rejected": -3.160937547683716, "logps/chosen": -565.5999755859375, "logps/rejected": -674.4000244140625, "loss": 3.0923, "rewards/accuracies": 0.4398437440395355, "rewards/chosen": 3.741015672683716, "rewards/margins": 4.584374904632568, "rewards/rejected": -0.849902331829071, "step": 390 }, { "epoch": 0.18779342723004694, "grad_norm": 303.68185985451623, "learning_rate": 4.514866979655712e-07, "logits/chosen": -3.221874952316284, "logits/rejected": -3.1546874046325684, "logps/chosen": -553.5999755859375, "logps/rejected": -639.2000122070312, "loss": 2.8172, "rewards/accuracies": 0.4593749940395355, "rewards/chosen": 4.425000190734863, "rewards/margins": 5.13671875, "rewards/rejected": -0.7081298828125, "step": 400 }, { "epoch": 0.19248826291079812, "grad_norm": 316.63227968980334, "learning_rate": 4.4887845592070945e-07, "logits/chosen": -3.1546874046325684, "logits/rejected": -3.1234374046325684, "logps/chosen": -542.4000244140625, "logps/rejected": -677.5999755859375, "loss": 2.8016, "rewards/accuracies": 0.4554687440395355, "rewards/chosen": 4.473437309265137, "rewards/margins": 5.852343559265137, "rewards/rejected": -1.3835937976837158, "step": 410 }, { "epoch": 0.19718309859154928, "grad_norm": 294.08918909696143, "learning_rate": 4.462702138758477e-07, "logits/chosen": -3.1812500953674316, "logits/rejected": -3.135937452316284, "logps/chosen": -575.2000122070312, "logps/rejected": -644.4000244140625, "loss": 2.9949, "rewards/accuracies": 0.43046873807907104, "rewards/chosen": 4.242968559265137, "rewards/margins": 4.417187690734863, "rewards/rejected": -0.17177733778953552, "step": 420 }, { "epoch": 0.20187793427230047, "grad_norm": 265.6457311420601, "learning_rate": 4.436619718309859e-07, "logits/chosen": -3.153125047683716, "logits/rejected": -3.0703125, "logps/chosen": -557.7999877929688, "logps/rejected": -684.4000244140625, "loss": 2.853, "rewards/accuracies": 0.4281249940395355, "rewards/chosen": 3.6234374046325684, "rewards/margins": 4.569531440734863, "rewards/rejected": -0.94482421875, "step": 430 }, { "epoch": 0.20657276995305165, "grad_norm": 278.50429679548523, "learning_rate": 4.4105372978612415e-07, "logits/chosen": -3.2359375953674316, "logits/rejected": -3.2171874046325684, "logps/chosen": -572.0, "logps/rejected": -673.2000122070312, "loss": 3.1858, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 4.300000190734863, "rewards/margins": 5.065625190734863, "rewards/rejected": -0.7640625238418579, "step": 440 }, { "epoch": 0.2112676056338028, "grad_norm": 230.10305977022676, "learning_rate": 4.384454877412624e-07, "logits/chosen": -3.260937452316284, "logits/rejected": -3.153125047683716, "logps/chosen": -556.4000244140625, "logps/rejected": -748.7999877929688, "loss": 3.0931, "rewards/accuracies": 0.4359374940395355, "rewards/chosen": 4.378125190734863, "rewards/margins": 6.3046875, "rewards/rejected": -1.936132788658142, "step": 450 }, { "epoch": 0.215962441314554, "grad_norm": 263.6305024511116, "learning_rate": 4.358372456964006e-07, "logits/chosen": -3.160937547683716, "logits/rejected": -3.15625, "logps/chosen": -552.5999755859375, "logps/rejected": -659.2000122070312, "loss": 2.8102, "rewards/accuracies": 0.46015626192092896, "rewards/chosen": 4.801562309265137, "rewards/margins": 4.841406345367432, "rewards/rejected": -0.0338134765625, "step": 460 }, { "epoch": 0.22065727699530516, "grad_norm": 324.489924430666, "learning_rate": 4.3322900365153886e-07, "logits/chosen": -3.2640624046325684, "logits/rejected": -3.192187547683716, "logps/chosen": -593.5999755859375, "logps/rejected": -655.5999755859375, "loss": 3.3795, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": 4.314062595367432, "rewards/margins": 4.260937690734863, "rewards/rejected": 0.05312500149011612, "step": 470 }, { "epoch": 0.22535211267605634, "grad_norm": 255.4091492673624, "learning_rate": 4.306207616066771e-07, "logits/chosen": -3.2015624046325684, "logits/rejected": -3.174999952316284, "logps/chosen": -542.5999755859375, "logps/rejected": -636.2000122070312, "loss": 2.8316, "rewards/accuracies": 0.4281249940395355, "rewards/chosen": 4.528124809265137, "rewards/margins": 4.978125095367432, "rewards/rejected": -0.44755858182907104, "step": 480 }, { "epoch": 0.2300469483568075, "grad_norm": 292.3375710045175, "learning_rate": 4.280125195618153e-07, "logits/chosen": -3.1953125, "logits/rejected": -3.1312499046325684, "logps/chosen": -548.2000122070312, "logps/rejected": -663.5999755859375, "loss": 3.1282, "rewards/accuracies": 0.4476562440395355, "rewards/chosen": 4.767187595367432, "rewards/margins": 4.903124809265137, "rewards/rejected": -0.13076171278953552, "step": 490 }, { "epoch": 0.2347417840375587, "grad_norm": 276.8383687595893, "learning_rate": 4.2540427751695357e-07, "logits/chosen": -3.1234374046325684, "logits/rejected": -3.135937452316284, "logps/chosen": -563.7999877929688, "logps/rejected": -639.2000122070312, "loss": 2.8798, "rewards/accuracies": 0.45390623807907104, "rewards/chosen": 4.951562404632568, "rewards/margins": 5.360937595367432, "rewards/rejected": -0.40961915254592896, "step": 500 }, { "epoch": 0.23943661971830985, "grad_norm": 240.08669970605214, "learning_rate": 4.227960354720918e-07, "logits/chosen": -3.262500047683716, "logits/rejected": -3.200000047683716, "logps/chosen": -542.2000122070312, "logps/rejected": -677.5999755859375, "loss": 2.9689, "rewards/accuracies": 0.4429687559604645, "rewards/chosen": 4.881249904632568, "rewards/margins": 4.892480373382568, "rewards/rejected": -0.00820312462747097, "step": 510 }, { "epoch": 0.24413145539906103, "grad_norm": 316.99746852995713, "learning_rate": 4.2018779342723e-07, "logits/chosen": -3.160937547683716, "logits/rejected": -3.1703124046325684, "logps/chosen": -562.4000244140625, "logps/rejected": -626.0, "loss": 3.6521, "rewards/accuracies": 0.4359374940395355, "rewards/chosen": 4.800000190734863, "rewards/margins": 3.233593702316284, "rewards/rejected": 1.5671875476837158, "step": 520 }, { "epoch": 0.24882629107981222, "grad_norm": 270.0542704810816, "learning_rate": 4.1757955138236827e-07, "logits/chosen": -3.253124952316284, "logits/rejected": -3.182812452316284, "logps/chosen": -541.5999755859375, "logps/rejected": -660.0, "loss": 2.8176, "rewards/accuracies": 0.4921875, "rewards/chosen": 5.292187690734863, "rewards/margins": 6.079687595367432, "rewards/rejected": -0.780078113079071, "step": 530 }, { "epoch": 0.2535211267605634, "grad_norm": 295.63002336313383, "learning_rate": 4.149713093375065e-07, "logits/chosen": -3.2093749046325684, "logits/rejected": -3.176562547683716, "logps/chosen": -543.5999755859375, "logps/rejected": -606.0, "loss": 3.0713, "rewards/accuracies": 0.453125, "rewards/chosen": 5.207812309265137, "rewards/margins": 5.46875, "rewards/rejected": -0.25507813692092896, "step": 540 }, { "epoch": 0.25821596244131456, "grad_norm": 310.35958187748247, "learning_rate": 4.123630672926447e-07, "logits/chosen": -3.231250047683716, "logits/rejected": -3.1968750953674316, "logps/chosen": -532.0, "logps/rejected": -653.0, "loss": 3.0968, "rewards/accuracies": 0.44218748807907104, "rewards/chosen": 4.865624904632568, "rewards/margins": 5.584374904632568, "rewards/rejected": -0.7201172113418579, "step": 550 }, { "epoch": 0.26291079812206575, "grad_norm": 312.5796454732532, "learning_rate": 4.09754825247783e-07, "logits/chosen": -3.215625047683716, "logits/rejected": -3.1578125953674316, "logps/chosen": -549.2000122070312, "logps/rejected": -630.0, "loss": 2.7618, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": 5.068749904632568, "rewards/margins": 6.120312690734863, "rewards/rejected": -1.05029296875, "step": 560 }, { "epoch": 0.2676056338028169, "grad_norm": 250.4466960516743, "learning_rate": 4.0714658320292123e-07, "logits/chosen": -3.28125, "logits/rejected": -3.2093749046325684, "logps/chosen": -544.0, "logps/rejected": -643.7999877929688, "loss": 2.8375, "rewards/accuracies": 0.4585937559604645, "rewards/chosen": 5.240624904632568, "rewards/margins": 5.368750095367432, "rewards/rejected": -0.12744140625, "step": 570 }, { "epoch": 0.27230046948356806, "grad_norm": 263.6284568812628, "learning_rate": 4.045383411580595e-07, "logits/chosen": -3.2249999046325684, "logits/rejected": -3.190624952316284, "logps/chosen": -529.5999755859375, "logps/rejected": -646.4000244140625, "loss": 3.1771, "rewards/accuracies": 0.46171873807907104, "rewards/chosen": 5.1875, "rewards/margins": 5.52734375, "rewards/rejected": -0.3402343690395355, "step": 580 }, { "epoch": 0.27699530516431925, "grad_norm": 247.26597055635676, "learning_rate": 4.019300991131977e-07, "logits/chosen": -3.106250047683716, "logits/rejected": -3.0843749046325684, "logps/chosen": -543.5999755859375, "logps/rejected": -643.2000122070312, "loss": 3.1357, "rewards/accuracies": 0.4710937440395355, "rewards/chosen": 5.296093940734863, "rewards/margins": 5.807812690734863, "rewards/rejected": -0.5101562738418579, "step": 590 }, { "epoch": 0.28169014084507044, "grad_norm": 343.6934307609959, "learning_rate": 3.9932185706833594e-07, "logits/chosen": -3.2593750953674316, "logits/rejected": -3.1859374046325684, "logps/chosen": -557.7999877929688, "logps/rejected": -634.4000244140625, "loss": 3.2729, "rewards/accuracies": 0.4609375, "rewards/chosen": 5.770312309265137, "rewards/margins": 5.9296875, "rewards/rejected": -0.15639647841453552, "step": 600 }, { "epoch": 0.2863849765258216, "grad_norm": 261.3531599965703, "learning_rate": 3.967136150234742e-07, "logits/chosen": -3.207812547683716, "logits/rejected": -3.215625047683716, "logps/chosen": -559.5999755859375, "logps/rejected": -643.0, "loss": 3.269, "rewards/accuracies": 0.47343748807907104, "rewards/chosen": 5.671875, "rewards/margins": 6.106249809265137, "rewards/rejected": -0.42988282442092896, "step": 610 }, { "epoch": 0.29107981220657275, "grad_norm": 236.1085139278817, "learning_rate": 3.941053729786124e-07, "logits/chosen": -3.151562452316284, "logits/rejected": -3.1078124046325684, "logps/chosen": -569.7999877929688, "logps/rejected": -654.7999877929688, "loss": 2.7709, "rewards/accuracies": 0.484375, "rewards/chosen": 5.900000095367432, "rewards/margins": 7.834374904632568, "rewards/rejected": -1.9337890148162842, "step": 620 }, { "epoch": 0.29577464788732394, "grad_norm": 1668.1858891625257, "learning_rate": 3.9149713093375064e-07, "logits/chosen": -3.1640625, "logits/rejected": -3.125, "logps/chosen": -554.0, "logps/rejected": -673.5999755859375, "loss": 2.8208, "rewards/accuracies": 0.48359376192092896, "rewards/chosen": 6.259375095367432, "rewards/margins": 7.706250190734863, "rewards/rejected": -1.448828101158142, "step": 630 }, { "epoch": 0.3004694835680751, "grad_norm": 274.4990815783928, "learning_rate": 3.888888888888889e-07, "logits/chosen": -3.2015624046325684, "logits/rejected": -3.1546874046325684, "logps/chosen": -545.4000244140625, "logps/rejected": -686.4000244140625, "loss": 2.9131, "rewards/accuracies": 0.49140626192092896, "rewards/chosen": 5.653124809265137, "rewards/margins": 7.78125, "rewards/rejected": -2.1275634765625, "step": 640 }, { "epoch": 0.3051643192488263, "grad_norm": 264.72591172971954, "learning_rate": 3.862806468440271e-07, "logits/chosen": -3.2874999046325684, "logits/rejected": -3.2593750953674316, "logps/chosen": -505.0, "logps/rejected": -605.2000122070312, "loss": 2.9616, "rewards/accuracies": 0.4671874940395355, "rewards/chosen": 5.503125190734863, "rewards/margins": 5.125, "rewards/rejected": 0.37983399629592896, "step": 650 }, { "epoch": 0.30985915492957744, "grad_norm": 302.6815782764531, "learning_rate": 3.8367240479916535e-07, "logits/chosen": -3.2171874046325684, "logits/rejected": -3.254687547683716, "logps/chosen": -538.7999877929688, "logps/rejected": -611.0, "loss": 2.8749, "rewards/accuracies": 0.50390625, "rewards/chosen": 6.021874904632568, "rewards/margins": 6.935937404632568, "rewards/rejected": -0.921313464641571, "step": 660 }, { "epoch": 0.3145539906103286, "grad_norm": 285.79486724420923, "learning_rate": 3.810641627543036e-07, "logits/chosen": -3.2203125953674316, "logits/rejected": -3.1859374046325684, "logps/chosen": -554.0, "logps/rejected": -647.0, "loss": 3.0776, "rewards/accuracies": 0.48515623807907104, "rewards/chosen": 5.768750190734863, "rewards/margins": 7.626562595367432, "rewards/rejected": -1.8566405773162842, "step": 670 }, { "epoch": 0.3192488262910798, "grad_norm": 267.03878626735144, "learning_rate": 3.784559207094418e-07, "logits/chosen": -3.2171874046325684, "logits/rejected": -3.1859374046325684, "logps/chosen": -564.0, "logps/rejected": -634.4000244140625, "loss": 3.018, "rewards/accuracies": 0.46015626192092896, "rewards/chosen": 5.534375190734863, "rewards/margins": 5.573437690734863, "rewards/rejected": -0.03432617336511612, "step": 680 }, { "epoch": 0.323943661971831, "grad_norm": 241.42149101925932, "learning_rate": 3.7584767866458005e-07, "logits/chosen": -3.167187452316284, "logits/rejected": -3.1468749046325684, "logps/chosen": -558.4000244140625, "logps/rejected": -612.2000122070312, "loss": 3.2028, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 5.759375095367432, "rewards/margins": 6.537499904632568, "rewards/rejected": -0.775390625, "step": 690 }, { "epoch": 0.3286384976525822, "grad_norm": 347.3003027177567, "learning_rate": 3.732394366197183e-07, "logits/chosen": -3.253124952316284, "logits/rejected": -3.206249952316284, "logps/chosen": -581.0, "logps/rejected": -622.7999877929688, "loss": 3.4616, "rewards/accuracies": 0.4789062440395355, "rewards/chosen": 5.829687595367432, "rewards/margins": 6.435937404632568, "rewards/rejected": -0.612841784954071, "step": 700 }, { "epoch": 0.3333333333333333, "grad_norm": 370.0683750172563, "learning_rate": 3.706311945748565e-07, "logits/chosen": -3.2328124046325684, "logits/rejected": -3.1781249046325684, "logps/chosen": -594.4000244140625, "logps/rejected": -631.5999755859375, "loss": 3.6096, "rewards/accuracies": 0.4984374940395355, "rewards/chosen": 5.745312690734863, "rewards/margins": 6.145312309265137, "rewards/rejected": -0.40507811307907104, "step": 710 }, { "epoch": 0.3380281690140845, "grad_norm": 297.31120098621625, "learning_rate": 3.6802295252999476e-07, "logits/chosen": -3.2015624046325684, "logits/rejected": -3.0640625953674316, "logps/chosen": -571.7999877929688, "logps/rejected": -682.2000122070312, "loss": 3.2536, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 6.015625, "rewards/margins": 7.296875, "rewards/rejected": -1.282324194908142, "step": 720 }, { "epoch": 0.3427230046948357, "grad_norm": 909.3109580482733, "learning_rate": 3.65414710485133e-07, "logits/chosen": -3.184375047683716, "logits/rejected": -3.2125000953674316, "logps/chosen": -566.2000122070312, "logps/rejected": -619.2000122070312, "loss": 3.2271, "rewards/accuracies": 0.48828125, "rewards/chosen": 5.009375095367432, "rewards/margins": 7.012499809265137, "rewards/rejected": -2.0113282203674316, "step": 730 }, { "epoch": 0.3474178403755869, "grad_norm": 251.82987219431644, "learning_rate": 3.6280646844027127e-07, "logits/chosen": -3.160937547683716, "logits/rejected": -3.1812500953674316, "logps/chosen": -548.4000244140625, "logps/rejected": -692.7999877929688, "loss": 2.4756, "rewards/accuracies": 0.5023437738418579, "rewards/chosen": 6.371874809265137, "rewards/margins": 8.1171875, "rewards/rejected": -1.7410156726837158, "step": 740 }, { "epoch": 0.352112676056338, "grad_norm": 325.56963232354997, "learning_rate": 3.6019822639540947e-07, "logits/chosen": -3.192187547683716, "logits/rejected": -3.128124952316284, "logps/chosen": -577.0, "logps/rejected": -614.4000244140625, "loss": 3.3683, "rewards/accuracies": 0.48046875, "rewards/chosen": 5.904687404632568, "rewards/margins": 6.515625, "rewards/rejected": -0.6114746332168579, "step": 750 }, { "epoch": 0.3568075117370892, "grad_norm": 262.68653804411235, "learning_rate": 3.575899843505477e-07, "logits/chosen": -3.171875, "logits/rejected": -3.121875047683716, "logps/chosen": -553.7999877929688, "logps/rejected": -606.4000244140625, "loss": 3.2532, "rewards/accuracies": 0.50390625, "rewards/chosen": 5.689062595367432, "rewards/margins": 6.7109375, "rewards/rejected": -1.021875023841858, "step": 760 }, { "epoch": 0.3615023474178404, "grad_norm": 308.5027585504221, "learning_rate": 3.5498174230568597e-07, "logits/chosen": -3.174999952316284, "logits/rejected": -3.143749952316284, "logps/chosen": -549.5999755859375, "logps/rejected": -639.5999755859375, "loss": 2.7304, "rewards/accuracies": 0.47734373807907104, "rewards/chosen": 5.609375, "rewards/margins": 7.015625, "rewards/rejected": -1.402978539466858, "step": 770 }, { "epoch": 0.36619718309859156, "grad_norm": 304.36534286887735, "learning_rate": 3.5237350026082417e-07, "logits/chosen": -3.128124952316284, "logits/rejected": -3.125, "logps/chosen": -559.5999755859375, "logps/rejected": -600.0, "loss": 3.1165, "rewards/accuracies": 0.4859375059604645, "rewards/chosen": 5.96875, "rewards/margins": 5.890625, "rewards/rejected": 0.07460937649011612, "step": 780 }, { "epoch": 0.37089201877934275, "grad_norm": 191.4390918096237, "learning_rate": 3.497652582159624e-07, "logits/chosen": -3.1859374046325684, "logits/rejected": -3.151562452316284, "logps/chosen": -555.7999877929688, "logps/rejected": -659.2000122070312, "loss": 3.0015, "rewards/accuracies": 0.4859375059604645, "rewards/chosen": 6.221875190734863, "rewards/margins": 7.193749904632568, "rewards/rejected": -0.9771484136581421, "step": 790 }, { "epoch": 0.3755868544600939, "grad_norm": 425.7166621373832, "learning_rate": 3.471570161711007e-07, "logits/chosen": -3.214062452316284, "logits/rejected": -3.1484375, "logps/chosen": -586.2000122070312, "logps/rejected": -674.5999755859375, "loss": 3.3681, "rewards/accuracies": 0.49921876192092896, "rewards/chosen": 6.193749904632568, "rewards/margins": 7.546875, "rewards/rejected": -1.3479492664337158, "step": 800 }, { "epoch": 0.38028169014084506, "grad_norm": 300.49720808118195, "learning_rate": 3.445487741262389e-07, "logits/chosen": -3.2015624046325684, "logits/rejected": -3.1968750953674316, "logps/chosen": -538.2000122070312, "logps/rejected": -643.5999755859375, "loss": 2.6164, "rewards/accuracies": 0.4976562559604645, "rewards/chosen": 5.670312404632568, "rewards/margins": 7.168749809265137, "rewards/rejected": -1.4990234375, "step": 810 }, { "epoch": 0.38497652582159625, "grad_norm": 273.38949554271784, "learning_rate": 3.4194053208137713e-07, "logits/chosen": -3.2171874046325684, "logits/rejected": -3.1734375953674316, "logps/chosen": -549.2000122070312, "logps/rejected": -615.7999877929688, "loss": 2.9592, "rewards/accuracies": 0.49140626192092896, "rewards/chosen": 5.9140625, "rewards/margins": 7.162499904632568, "rewards/rejected": -1.2492187023162842, "step": 820 }, { "epoch": 0.38967136150234744, "grad_norm": 271.8475642928366, "learning_rate": 3.393322900365154e-07, "logits/chosen": -3.1890625953674316, "logits/rejected": -3.096874952316284, "logps/chosen": -521.5999755859375, "logps/rejected": -621.2000122070312, "loss": 3.5942, "rewards/accuracies": 0.48515623807907104, "rewards/chosen": 5.356249809265137, "rewards/margins": 6.192187309265137, "rewards/rejected": -0.8359375, "step": 830 }, { "epoch": 0.39436619718309857, "grad_norm": 258.8715832438056, "learning_rate": 3.367240479916536e-07, "logits/chosen": -3.1890625953674316, "logits/rejected": -3.1312499046325684, "logps/chosen": -586.7999877929688, "logps/rejected": -662.7999877929688, "loss": 3.1732, "rewards/accuracies": 0.4945312440395355, "rewards/chosen": 5.978125095367432, "rewards/margins": 6.753125190734863, "rewards/rejected": -0.7646484375, "step": 840 }, { "epoch": 0.39906103286384975, "grad_norm": 277.01025031111124, "learning_rate": 3.3411580594679184e-07, "logits/chosen": -3.262500047683716, "logits/rejected": -3.1937499046325684, "logps/chosen": -537.4000244140625, "logps/rejected": -634.0, "loss": 3.1979, "rewards/accuracies": 0.49921876192092896, "rewards/chosen": 6.203125, "rewards/margins": 6.993750095367432, "rewards/rejected": -0.7895263433456421, "step": 850 }, { "epoch": 0.40375586854460094, "grad_norm": 262.8271384549558, "learning_rate": 3.315075639019301e-07, "logits/chosen": -3.1812500953674316, "logits/rejected": -3.184375047683716, "logps/chosen": -541.2000122070312, "logps/rejected": -632.4000244140625, "loss": 2.9426, "rewards/accuracies": 0.51171875, "rewards/chosen": 6.125, "rewards/margins": 7.006249904632568, "rewards/rejected": -0.8836669921875, "step": 860 }, { "epoch": 0.4084507042253521, "grad_norm": 257.76714659870345, "learning_rate": 3.288993218570683e-07, "logits/chosen": -3.207812547683716, "logits/rejected": -3.140625, "logps/chosen": -544.4000244140625, "logps/rejected": -617.2000122070312, "loss": 3.0632, "rewards/accuracies": 0.4867187440395355, "rewards/chosen": 6.678124904632568, "rewards/margins": 6.890625, "rewards/rejected": -0.21953125298023224, "step": 870 }, { "epoch": 0.4131455399061033, "grad_norm": 270.3906090837782, "learning_rate": 3.2629107981220654e-07, "logits/chosen": -3.2249999046325684, "logits/rejected": -3.1343750953674316, "logps/chosen": -561.5999755859375, "logps/rejected": -642.5999755859375, "loss": 3.1866, "rewards/accuracies": 0.4976562559604645, "rewards/chosen": 6.128125190734863, "rewards/margins": 7.431250095367432, "rewards/rejected": -1.2996094226837158, "step": 880 }, { "epoch": 0.41784037558685444, "grad_norm": 361.90483580549267, "learning_rate": 3.236828377673448e-07, "logits/chosen": -3.1640625, "logits/rejected": -3.2015624046325684, "logps/chosen": -588.2000122070312, "logps/rejected": -631.0, "loss": 3.6112, "rewards/accuracies": 0.5132812261581421, "rewards/chosen": 6.135937690734863, "rewards/margins": 6.685937404632568, "rewards/rejected": -0.5540527105331421, "step": 890 }, { "epoch": 0.4225352112676056, "grad_norm": 212.9363956810051, "learning_rate": 3.2107459572248305e-07, "logits/chosen": -3.168750047683716, "logits/rejected": -3.168750047683716, "logps/chosen": -563.0, "logps/rejected": -641.2000122070312, "loss": 3.2769, "rewards/accuracies": 0.4984374940395355, "rewards/chosen": 5.487500190734863, "rewards/margins": 7.017187595367432, "rewards/rejected": -1.535375952720642, "step": 900 }, { "epoch": 0.4272300469483568, "grad_norm": 362.81405145397866, "learning_rate": 3.1846635367762125e-07, "logits/chosen": -3.159374952316284, "logits/rejected": -3.151562452316284, "logps/chosen": -567.4000244140625, "logps/rejected": -667.7999877929688, "loss": 3.3387, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 6.262499809265137, "rewards/margins": 6.796875, "rewards/rejected": -0.536425769329071, "step": 910 }, { "epoch": 0.431924882629108, "grad_norm": 301.9066438976272, "learning_rate": 3.158581116327595e-07, "logits/chosen": -3.2015624046325684, "logits/rejected": -3.1703124046325684, "logps/chosen": -559.5999755859375, "logps/rejected": -698.0, "loss": 3.2574, "rewards/accuracies": 0.5054687261581421, "rewards/chosen": 6.571875095367432, "rewards/margins": 7.837500095367432, "rewards/rejected": -1.2675292491912842, "step": 920 }, { "epoch": 0.43661971830985913, "grad_norm": 261.95731135022044, "learning_rate": 3.1324986958789775e-07, "logits/chosen": -3.1640625, "logits/rejected": -3.1468749046325684, "logps/chosen": -526.7999877929688, "logps/rejected": -615.5999755859375, "loss": 3.0444, "rewards/accuracies": 0.48359376192092896, "rewards/chosen": 6.434374809265137, "rewards/margins": 6.09375, "rewards/rejected": 0.3431640565395355, "step": 930 }, { "epoch": 0.4413145539906103, "grad_norm": 255.23362051238044, "learning_rate": 3.1064162754303595e-07, "logits/chosen": -3.1390624046325684, "logits/rejected": -3.09375, "logps/chosen": -571.7999877929688, "logps/rejected": -674.7999877929688, "loss": 2.7951, "rewards/accuracies": 0.4945312440395355, "rewards/chosen": 6.862500190734863, "rewards/margins": 9.096875190734863, "rewards/rejected": -2.2445311546325684, "step": 940 }, { "epoch": 0.4460093896713615, "grad_norm": 246.61090183265875, "learning_rate": 3.080333854981742e-07, "logits/chosen": -3.2093749046325684, "logits/rejected": -3.143749952316284, "logps/chosen": -547.0, "logps/rejected": -617.5999755859375, "loss": 3.1938, "rewards/accuracies": 0.46015626192092896, "rewards/chosen": 6.581250190734863, "rewards/margins": 5.469531059265137, "rewards/rejected": 1.105712890625, "step": 950 }, { "epoch": 0.4507042253521127, "grad_norm": 263.40064087446916, "learning_rate": 3.0542514345331246e-07, "logits/chosen": -3.160937547683716, "logits/rejected": -3.1078124046325684, "logps/chosen": -559.7999877929688, "logps/rejected": -700.7999877929688, "loss": 3.1048, "rewards/accuracies": 0.4710937440395355, "rewards/chosen": 6.578125, "rewards/margins": 7.384375095367432, "rewards/rejected": -0.7998046875, "step": 960 }, { "epoch": 0.45539906103286387, "grad_norm": 285.6016326943243, "learning_rate": 3.0281690140845066e-07, "logits/chosen": -3.109375, "logits/rejected": -3.1656250953674316, "logps/chosen": -594.5999755859375, "logps/rejected": -667.5999755859375, "loss": 3.7553, "rewards/accuracies": 0.508593738079071, "rewards/chosen": 6.043749809265137, "rewards/margins": 6.780468940734863, "rewards/rejected": -0.7427734136581421, "step": 970 }, { "epoch": 0.460093896713615, "grad_norm": 363.14693418021574, "learning_rate": 3.002086593635889e-07, "logits/chosen": -3.2750000953674316, "logits/rejected": -3.1859374046325684, "logps/chosen": -550.0, "logps/rejected": -666.4000244140625, "loss": 3.1473, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 6.168749809265137, "rewards/margins": 7.689062595367432, "rewards/rejected": -1.517187476158142, "step": 980 }, { "epoch": 0.4647887323943662, "grad_norm": 253.9270033153478, "learning_rate": 2.9760041731872716e-07, "logits/chosen": -3.2578125, "logits/rejected": -3.2093749046325684, "logps/chosen": -514.2000122070312, "logps/rejected": -627.0, "loss": 2.8604, "rewards/accuracies": 0.522656261920929, "rewards/chosen": 7.346875190734863, "rewards/margins": 8.475000381469727, "rewards/rejected": -1.1066405773162842, "step": 990 }, { "epoch": 0.4694835680751174, "grad_norm": 335.1062534816379, "learning_rate": 2.9499217527386536e-07, "logits/chosen": -3.214062452316284, "logits/rejected": -3.192187547683716, "logps/chosen": -544.2000122070312, "logps/rejected": -641.2000122070312, "loss": 2.9158, "rewards/accuracies": 0.48515623807907104, "rewards/chosen": 6.574999809265137, "rewards/margins": 7.496874809265137, "rewards/rejected": -0.925537109375, "step": 1000 }, { "epoch": 0.47417840375586856, "grad_norm": 282.9372854168335, "learning_rate": 2.923839332290036e-07, "logits/chosen": -3.1265625953674316, "logits/rejected": -3.0999999046325684, "logps/chosen": -545.4000244140625, "logps/rejected": -657.5999755859375, "loss": 3.5473, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 5.871874809265137, "rewards/margins": 7.546875, "rewards/rejected": -1.6741211414337158, "step": 1010 }, { "epoch": 0.4788732394366197, "grad_norm": 239.71190395939402, "learning_rate": 2.8977569118414187e-07, "logits/chosen": -3.2437500953674316, "logits/rejected": -3.1890625953674316, "logps/chosen": -531.7999877929688, "logps/rejected": -632.4000244140625, "loss": 2.9164, "rewards/accuracies": 0.5, "rewards/chosen": 6.550000190734863, "rewards/margins": 8.649999618530273, "rewards/rejected": -2.099804639816284, "step": 1020 }, { "epoch": 0.4835680751173709, "grad_norm": 338.5277676426185, "learning_rate": 2.8716744913928007e-07, "logits/chosen": -3.2046875953674316, "logits/rejected": -3.1953125, "logps/chosen": -552.7999877929688, "logps/rejected": -620.4000244140625, "loss": 2.6548, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 6.712500095367432, "rewards/margins": 7.595312595367432, "rewards/rejected": -0.874804675579071, "step": 1030 }, { "epoch": 0.48826291079812206, "grad_norm": 233.06442469550254, "learning_rate": 2.845592070944183e-07, "logits/chosen": -3.2421875, "logits/rejected": -3.192187547683716, "logps/chosen": -549.0, "logps/rejected": -671.5999755859375, "loss": 2.8817, "rewards/accuracies": 0.51953125, "rewards/chosen": 7.153124809265137, "rewards/margins": 8.631250381469727, "rewards/rejected": -1.477929711341858, "step": 1040 }, { "epoch": 0.49295774647887325, "grad_norm": 220.02081206868604, "learning_rate": 2.819509650495566e-07, "logits/chosen": -3.1890625953674316, "logits/rejected": -3.114062547683716, "logps/chosen": -539.4000244140625, "logps/rejected": -620.4000244140625, "loss": 2.8089, "rewards/accuracies": 0.5289062261581421, "rewards/chosen": 7.287499904632568, "rewards/margins": 8.315625190734863, "rewards/rejected": -1.0333983898162842, "step": 1050 }, { "epoch": 0.49765258215962443, "grad_norm": 217.0188288235901, "learning_rate": 2.7934272300469483e-07, "logits/chosen": -3.1312499046325684, "logits/rejected": -3.0875000953674316, "logps/chosen": -581.5999755859375, "logps/rejected": -634.4000244140625, "loss": 2.9255, "rewards/accuracies": 0.50390625, "rewards/chosen": 7.106249809265137, "rewards/margins": 8.217187881469727, "rewards/rejected": -1.108007788658142, "step": 1060 }, { "epoch": 0.5023474178403756, "grad_norm": 391.863680049624, "learning_rate": 2.7673448095983303e-07, "logits/chosen": -3.2093749046325684, "logits/rejected": -3.192187547683716, "logps/chosen": -550.5999755859375, "logps/rejected": -623.2000122070312, "loss": 2.692, "rewards/accuracies": 0.4945312440395355, "rewards/chosen": 6.865624904632568, "rewards/margins": 6.509375095367432, "rewards/rejected": 0.34440916776657104, "step": 1070 }, { "epoch": 0.5070422535211268, "grad_norm": 242.99585174045663, "learning_rate": 2.741262389149713e-07, "logits/chosen": -3.159374952316284, "logits/rejected": -3.078125, "logps/chosen": -564.0, "logps/rejected": -651.5999755859375, "loss": 2.9599, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": 6.462500095367432, "rewards/margins": 7.978125095367432, "rewards/rejected": -1.518164038658142, "step": 1080 }, { "epoch": 0.5117370892018779, "grad_norm": 333.24219218814267, "learning_rate": 2.7151799687010953e-07, "logits/chosen": -3.203125, "logits/rejected": -3.1312499046325684, "logps/chosen": -564.2000122070312, "logps/rejected": -659.5999755859375, "loss": 2.8861, "rewards/accuracies": 0.5367187261581421, "rewards/chosen": 6.493750095367432, "rewards/margins": 8.649999618530273, "rewards/rejected": -2.144726514816284, "step": 1090 }, { "epoch": 0.5164319248826291, "grad_norm": 334.21800919068465, "learning_rate": 2.6890975482524773e-07, "logits/chosen": -3.114062547683716, "logits/rejected": -3.104687452316284, "logps/chosen": -588.4000244140625, "logps/rejected": -649.5999755859375, "loss": 3.4208, "rewards/accuracies": 0.4859375059604645, "rewards/chosen": 6.010156154632568, "rewards/margins": 8.053125381469727, "rewards/rejected": -2.0474610328674316, "step": 1100 }, { "epoch": 0.5211267605633803, "grad_norm": 242.1193531722806, "learning_rate": 2.66301512780386e-07, "logits/chosen": -3.184375047683716, "logits/rejected": -3.082812547683716, "logps/chosen": -526.0, "logps/rejected": -645.5999755859375, "loss": 2.8418, "rewards/accuracies": 0.49296873807907104, "rewards/chosen": 6.581250190734863, "rewards/margins": 9.096875190734863, "rewards/rejected": -2.513476610183716, "step": 1110 }, { "epoch": 0.5258215962441315, "grad_norm": 202.26450736391664, "learning_rate": 2.6369327073552424e-07, "logits/chosen": -3.262500047683716, "logits/rejected": -3.206249952316284, "logps/chosen": -535.5999755859375, "logps/rejected": -607.0, "loss": 3.1496, "rewards/accuracies": 0.514843761920929, "rewards/chosen": 6.915625095367432, "rewards/margins": 7.053124904632568, "rewards/rejected": -0.13876953721046448, "step": 1120 }, { "epoch": 0.5305164319248826, "grad_norm": 233.54850849241447, "learning_rate": 2.6108502869066244e-07, "logits/chosen": -3.1796875, "logits/rejected": -3.109375, "logps/chosen": -576.4000244140625, "logps/rejected": -680.0, "loss": 3.7393, "rewards/accuracies": 0.5, "rewards/chosen": 6.479687690734863, "rewards/margins": 7.728125095367432, "rewards/rejected": -1.2414062023162842, "step": 1130 }, { "epoch": 0.5352112676056338, "grad_norm": 715.8986582014574, "learning_rate": 2.584767866458007e-07, "logits/chosen": -3.253124952316284, "logits/rejected": -3.168750047683716, "logps/chosen": -544.7999877929688, "logps/rejected": -636.2000122070312, "loss": 2.5833, "rewards/accuracies": 0.4945312440395355, "rewards/chosen": 6.440625190734863, "rewards/margins": 7.7578125, "rewards/rejected": -1.3078124523162842, "step": 1140 }, { "epoch": 0.539906103286385, "grad_norm": 266.9534201019325, "learning_rate": 2.5586854460093895e-07, "logits/chosen": -3.1312499046325684, "logits/rejected": -3.137500047683716, "logps/chosen": -562.0, "logps/rejected": -662.0, "loss": 3.3017, "rewards/accuracies": 0.522656261920929, "rewards/chosen": 6.409375190734863, "rewards/margins": 8.106249809265137, "rewards/rejected": -1.7031738758087158, "step": 1150 }, { "epoch": 0.5446009389671361, "grad_norm": 224.08328944189464, "learning_rate": 2.5326030255607715e-07, "logits/chosen": -3.2125000953674316, "logits/rejected": -3.1781249046325684, "logps/chosen": -558.2000122070312, "logps/rejected": -636.4000244140625, "loss": 2.9908, "rewards/accuracies": 0.510937511920929, "rewards/chosen": 6.909375190734863, "rewards/margins": 8.153124809265137, "rewards/rejected": -1.2466309070587158, "step": 1160 }, { "epoch": 0.5492957746478874, "grad_norm": 279.2533762941732, "learning_rate": 2.506520605112154e-07, "logits/chosen": -3.2265625, "logits/rejected": -3.2093749046325684, "logps/chosen": -549.5999755859375, "logps/rejected": -637.5999755859375, "loss": 3.8627, "rewards/accuracies": 0.46171873807907104, "rewards/chosen": 6.125, "rewards/margins": 6.453906059265137, "rewards/rejected": -0.3257812559604645, "step": 1170 }, { "epoch": 0.5539906103286385, "grad_norm": 240.7984295206075, "learning_rate": 2.4804381846635365e-07, "logits/chosen": -3.2593750953674316, "logits/rejected": -3.2249999046325684, "logps/chosen": -569.5999755859375, "logps/rejected": -641.0, "loss": 3.4113, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 5.7890625, "rewards/margins": 7.547656059265137, "rewards/rejected": -1.764062523841858, "step": 1180 }, { "epoch": 0.5586854460093896, "grad_norm": 230.7573724070305, "learning_rate": 2.454355764214919e-07, "logits/chosen": -3.184375047683716, "logits/rejected": -3.174999952316284, "logps/chosen": -535.7999877929688, "logps/rejected": -666.2000122070312, "loss": 2.8176, "rewards/accuracies": 0.4859375059604645, "rewards/chosen": 6.625, "rewards/margins": 8.231249809265137, "rewards/rejected": -1.6062500476837158, "step": 1190 }, { "epoch": 0.5633802816901409, "grad_norm": 258.81010276738493, "learning_rate": 2.4282733437663016e-07, "logits/chosen": -3.229687452316284, "logits/rejected": -3.2515625953674316, "logps/chosen": -554.4000244140625, "logps/rejected": -615.2000122070312, "loss": 3.2114, "rewards/accuracies": 0.48046875, "rewards/chosen": 7.240624904632568, "rewards/margins": 7.359375, "rewards/rejected": -0.12626953423023224, "step": 1200 }, { "epoch": 0.568075117370892, "grad_norm": 270.47734606922626, "learning_rate": 2.4021909233176836e-07, "logits/chosen": -3.2046875953674316, "logits/rejected": -3.2046875953674316, "logps/chosen": -524.7999877929688, "logps/rejected": -637.0, "loss": 2.8873, "rewards/accuracies": 0.5054687261581421, "rewards/chosen": 6.981249809265137, "rewards/margins": 8.546875, "rewards/rejected": -1.56201171875, "step": 1210 }, { "epoch": 0.5727699530516432, "grad_norm": 233.33661543891296, "learning_rate": 2.376108502869066e-07, "logits/chosen": -3.231250047683716, "logits/rejected": -3.2328124046325684, "logps/chosen": -563.2000122070312, "logps/rejected": -664.5999755859375, "loss": 3.0451, "rewards/accuracies": 0.5015624761581421, "rewards/chosen": 6.662499904632568, "rewards/margins": 7.839062690734863, "rewards/rejected": -1.1779296398162842, "step": 1220 }, { "epoch": 0.5774647887323944, "grad_norm": 266.1090049394165, "learning_rate": 2.3500260824204484e-07, "logits/chosen": -3.1656250953674316, "logits/rejected": -3.0859375, "logps/chosen": -594.5999755859375, "logps/rejected": -684.0, "loss": 3.1565, "rewards/accuracies": 0.48906248807907104, "rewards/chosen": 6.103125095367432, "rewards/margins": 8.472070693969727, "rewards/rejected": -2.381542921066284, "step": 1230 }, { "epoch": 0.5821596244131455, "grad_norm": 305.9487457845243, "learning_rate": 2.323943661971831e-07, "logits/chosen": -3.239062547683716, "logits/rejected": -3.1812500953674316, "logps/chosen": -554.2000122070312, "logps/rejected": -642.4000244140625, "loss": 3.7207, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 6.474999904632568, "rewards/margins": 7.390625, "rewards/rejected": -0.9126952886581421, "step": 1240 }, { "epoch": 0.5868544600938967, "grad_norm": 257.48418816374345, "learning_rate": 2.2978612415232132e-07, "logits/chosen": -3.1890625953674316, "logits/rejected": -3.1656250953674316, "logps/chosen": -588.0, "logps/rejected": -669.5999755859375, "loss": 2.5614, "rewards/accuracies": 0.47968751192092896, "rewards/chosen": 7.290625095367432, "rewards/margins": 9.240625381469727, "rewards/rejected": -1.9429442882537842, "step": 1250 }, { "epoch": 0.5915492957746479, "grad_norm": 270.4721169154024, "learning_rate": 2.2717788210745957e-07, "logits/chosen": -3.1656250953674316, "logits/rejected": -3.125, "logps/chosen": -550.2000122070312, "logps/rejected": -659.5999755859375, "loss": 3.9904, "rewards/accuracies": 0.504687488079071, "rewards/chosen": 6.263281345367432, "rewards/margins": 8.123437881469727, "rewards/rejected": -1.851953148841858, "step": 1260 }, { "epoch": 0.596244131455399, "grad_norm": 284.5477615847768, "learning_rate": 2.245696400625978e-07, "logits/chosen": -3.221874952316284, "logits/rejected": -3.198437452316284, "logps/chosen": -567.7999877929688, "logps/rejected": -655.5999755859375, "loss": 3.2818, "rewards/accuracies": 0.48046875, "rewards/chosen": 6.384375095367432, "rewards/margins": 7.204687595367432, "rewards/rejected": -0.8228515386581421, "step": 1270 }, { "epoch": 0.6009389671361502, "grad_norm": 220.06269466811972, "learning_rate": 2.2196139801773602e-07, "logits/chosen": -3.1859374046325684, "logits/rejected": -3.1187500953674316, "logps/chosen": -541.2000122070312, "logps/rejected": -616.5999755859375, "loss": 2.9687, "rewards/accuracies": 0.49140626192092896, "rewards/chosen": 6.400000095367432, "rewards/margins": 7.564062595367432, "rewards/rejected": -1.1668212413787842, "step": 1280 }, { "epoch": 0.6056338028169014, "grad_norm": 336.72704282740347, "learning_rate": 2.1935315597287428e-07, "logits/chosen": -3.1578125953674316, "logits/rejected": -3.171875, "logps/chosen": -564.7999877929688, "logps/rejected": -620.4000244140625, "loss": 3.6345, "rewards/accuracies": 0.526562511920929, "rewards/chosen": 6.578125, "rewards/margins": 6.42578125, "rewards/rejected": 0.14431151747703552, "step": 1290 }, { "epoch": 0.6103286384976526, "grad_norm": 261.5702252277783, "learning_rate": 2.167449139280125e-07, "logits/chosen": -3.2015624046325684, "logits/rejected": -3.1656250953674316, "logps/chosen": -564.0, "logps/rejected": -655.5999755859375, "loss": 3.1705, "rewards/accuracies": 0.510937511920929, "rewards/chosen": 7.034375190734863, "rewards/margins": 7.515625, "rewards/rejected": -0.4818359315395355, "step": 1300 }, { "epoch": 0.6150234741784038, "grad_norm": 237.58920682165183, "learning_rate": 2.1413667188315073e-07, "logits/chosen": -3.2593750953674316, "logits/rejected": -3.315624952316284, "logps/chosen": -533.0, "logps/rejected": -596.2000122070312, "loss": 3.2553, "rewards/accuracies": 0.48828125, "rewards/chosen": 6.599999904632568, "rewards/margins": 7.442187309265137, "rewards/rejected": -0.837646484375, "step": 1310 }, { "epoch": 0.6197183098591549, "grad_norm": 284.0284893595022, "learning_rate": 2.1152842983828898e-07, "logits/chosen": -3.129687547683716, "logits/rejected": -3.140625, "logps/chosen": -574.5999755859375, "logps/rejected": -635.4000244140625, "loss": 3.9295, "rewards/accuracies": 0.5132812261581421, "rewards/chosen": 5.603125095367432, "rewards/margins": 7.658593654632568, "rewards/rejected": -2.0562500953674316, "step": 1320 }, { "epoch": 0.6244131455399061, "grad_norm": 231.7736405189656, "learning_rate": 2.089201877934272e-07, "logits/chosen": -3.176562547683716, "logits/rejected": -3.151562452316284, "logps/chosen": -581.2000122070312, "logps/rejected": -687.4000244140625, "loss": 3.4721, "rewards/accuracies": 0.508593738079071, "rewards/chosen": 6.467187404632568, "rewards/margins": 8.949999809265137, "rewards/rejected": -2.483105421066284, "step": 1330 }, { "epoch": 0.6291079812206573, "grad_norm": 223.96093756368634, "learning_rate": 2.0631194574856543e-07, "logits/chosen": -3.3046875, "logits/rejected": -3.2593750953674316, "logps/chosen": -526.7999877929688, "logps/rejected": -586.4000244140625, "loss": 2.8135, "rewards/accuracies": 0.515625, "rewards/chosen": 7.831250190734863, "rewards/margins": 8.024999618530273, "rewards/rejected": -0.20263671875, "step": 1340 }, { "epoch": 0.6338028169014085, "grad_norm": 262.76467763294795, "learning_rate": 2.0370370370370369e-07, "logits/chosen": -3.1484375, "logits/rejected": -3.129687547683716, "logps/chosen": -562.0, "logps/rejected": -690.7999877929688, "loss": 3.8592, "rewards/accuracies": 0.507031261920929, "rewards/chosen": 7.050000190734863, "rewards/margins": 8.587499618530273, "rewards/rejected": -1.530175805091858, "step": 1350 }, { "epoch": 0.6384976525821596, "grad_norm": 342.0540956930175, "learning_rate": 2.010954616588419e-07, "logits/chosen": -3.2046875953674316, "logits/rejected": -3.153125047683716, "logps/chosen": -547.7999877929688, "logps/rejected": -628.0, "loss": 2.9989, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 7.368750095367432, "rewards/margins": 8.465624809265137, "rewards/rejected": -1.0910155773162842, "step": 1360 }, { "epoch": 0.6431924882629108, "grad_norm": 251.3399864703215, "learning_rate": 1.9848721961398017e-07, "logits/chosen": -3.260937452316284, "logits/rejected": -3.246875047683716, "logps/chosen": -551.4000244140625, "logps/rejected": -613.2000122070312, "loss": 2.5591, "rewards/accuracies": 0.48046875, "rewards/chosen": 7.159375190734863, "rewards/margins": 8.4375, "rewards/rejected": -1.281640648841858, "step": 1370 }, { "epoch": 0.647887323943662, "grad_norm": 250.2966758097825, "learning_rate": 1.958789775691184e-07, "logits/chosen": -3.2046875953674316, "logits/rejected": -3.140625, "logps/chosen": -537.7999877929688, "logps/rejected": -640.0, "loss": 2.6926, "rewards/accuracies": 0.4859375059604645, "rewards/chosen": 7.137499809265137, "rewards/margins": 9.2578125, "rewards/rejected": -2.123828172683716, "step": 1380 }, { "epoch": 0.6525821596244131, "grad_norm": 253.91294873585701, "learning_rate": 1.9327073552425662e-07, "logits/chosen": -3.1968750953674316, "logits/rejected": -3.143749952316284, "logps/chosen": -533.2000122070312, "logps/rejected": -613.7999877929688, "loss": 3.4994, "rewards/accuracies": 0.49140626192092896, "rewards/chosen": 6.696875095367432, "rewards/margins": 7.971875190734863, "rewards/rejected": -1.2712891101837158, "step": 1390 }, { "epoch": 0.6572769953051644, "grad_norm": 223.04763809135872, "learning_rate": 1.906624934793949e-07, "logits/chosen": -3.1546874046325684, "logits/rejected": -3.190624952316284, "logps/chosen": -552.0, "logps/rejected": -663.5999755859375, "loss": 2.8825, "rewards/accuracies": 0.532031238079071, "rewards/chosen": 7.21875, "rewards/margins": 9.815625190734863, "rewards/rejected": -2.5973877906799316, "step": 1400 }, { "epoch": 0.6619718309859155, "grad_norm": 216.17712546384078, "learning_rate": 1.8805425143453312e-07, "logits/chosen": -3.221874952316284, "logits/rejected": -3.0859375, "logps/chosen": -533.5999755859375, "logps/rejected": -697.2000122070312, "loss": 3.6138, "rewards/accuracies": 0.5, "rewards/chosen": 6.456250190734863, "rewards/margins": 8.978124618530273, "rewards/rejected": -2.5322265625, "step": 1410 }, { "epoch": 0.6666666666666666, "grad_norm": 282.7401512724836, "learning_rate": 1.8544600938967138e-07, "logits/chosen": -3.200000047683716, "logits/rejected": -3.1937499046325684, "logps/chosen": -546.0, "logps/rejected": -619.4000244140625, "loss": 3.106, "rewards/accuracies": 0.5210937261581421, "rewards/chosen": 7.259375095367432, "rewards/margins": 8.28125, "rewards/rejected": -1.0185546875, "step": 1420 }, { "epoch": 0.6713615023474179, "grad_norm": 214.2973371198878, "learning_rate": 1.828377673448096e-07, "logits/chosen": -3.1703124046325684, "logits/rejected": -3.1937499046325684, "logps/chosen": -586.2000122070312, "logps/rejected": -639.5999755859375, "loss": 3.944, "rewards/accuracies": 0.5210937261581421, "rewards/chosen": 6.467187404632568, "rewards/margins": 8.125, "rewards/rejected": -1.652929663658142, "step": 1430 }, { "epoch": 0.676056338028169, "grad_norm": 219.0548111069885, "learning_rate": 1.8022952529994783e-07, "logits/chosen": -3.231250047683716, "logits/rejected": -3.1640625, "logps/chosen": -556.4000244140625, "logps/rejected": -638.2000122070312, "loss": 2.6025, "rewards/accuracies": 0.532031238079071, "rewards/chosen": 7.456250190734863, "rewards/margins": 10.785937309265137, "rewards/rejected": -3.3238282203674316, "step": 1440 }, { "epoch": 0.6807511737089202, "grad_norm": 285.2721726652972, "learning_rate": 1.7762128325508608e-07, "logits/chosen": -3.21875, "logits/rejected": -3.2171874046325684, "logps/chosen": -576.0, "logps/rejected": -679.5999755859375, "loss": 3.5055, "rewards/accuracies": 0.5101562738418579, "rewards/chosen": 6.837500095367432, "rewards/margins": 8.471875190734863, "rewards/rejected": -1.632421851158142, "step": 1450 }, { "epoch": 0.6854460093896714, "grad_norm": 235.54935518418984, "learning_rate": 1.750130412102243e-07, "logits/chosen": -3.1109375953674316, "logits/rejected": -3.135937452316284, "logps/chosen": -604.0, "logps/rejected": -658.4000244140625, "loss": 3.2522, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 6.840624809265137, "rewards/margins": 9.0625, "rewards/rejected": -2.216357469558716, "step": 1460 }, { "epoch": 0.6901408450704225, "grad_norm": 230.7248053656623, "learning_rate": 1.7240479916536254e-07, "logits/chosen": -3.104687452316284, "logits/rejected": -3.1031250953674316, "logps/chosen": -543.0, "logps/rejected": -674.4000244140625, "loss": 2.7177, "rewards/accuracies": 0.5210937261581421, "rewards/chosen": 7.393750190734863, "rewards/margins": 9.737500190734863, "rewards/rejected": -2.353222608566284, "step": 1470 }, { "epoch": 0.6948356807511737, "grad_norm": 190.86847796342923, "learning_rate": 1.697965571205008e-07, "logits/chosen": -3.265625, "logits/rejected": -3.1953125, "logps/chosen": -545.0, "logps/rejected": -615.2000122070312, "loss": 2.9373, "rewards/accuracies": 0.52734375, "rewards/chosen": 6.996874809265137, "rewards/margins": 8.471875190734863, "rewards/rejected": -1.469628930091858, "step": 1480 }, { "epoch": 0.6995305164319249, "grad_norm": 292.41862144071786, "learning_rate": 1.6718831507563902e-07, "logits/chosen": -3.1968750953674316, "logits/rejected": -3.200000047683716, "logps/chosen": -555.0, "logps/rejected": -603.7999877929688, "loss": 3.2203, "rewards/accuracies": 0.516406238079071, "rewards/chosen": 6.784375190734863, "rewards/margins": 7.028124809265137, "rewards/rejected": -0.242431640625, "step": 1490 }, { "epoch": 0.704225352112676, "grad_norm": 248.77580575495298, "learning_rate": 1.6458007303077727e-07, "logits/chosen": -3.2015624046325684, "logits/rejected": -3.215625047683716, "logps/chosen": -535.5999755859375, "logps/rejected": -602.7999877929688, "loss": 3.1053, "rewards/accuracies": 0.49531251192092896, "rewards/chosen": 7.059374809265137, "rewards/margins": 7.209374904632568, "rewards/rejected": -0.15102538466453552, "step": 1500 }, { "epoch": 0.7089201877934272, "grad_norm": 254.34351835854568, "learning_rate": 1.619718309859155e-07, "logits/chosen": -3.2015624046325684, "logits/rejected": -3.1156249046325684, "logps/chosen": -560.0, "logps/rejected": -657.7999877929688, "loss": 3.6322, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 6.771874904632568, "rewards/margins": 8.5625, "rewards/rejected": -1.796289086341858, "step": 1510 }, { "epoch": 0.7136150234741784, "grad_norm": 234.67184609027018, "learning_rate": 1.5936358894105372e-07, "logits/chosen": -3.221874952316284, "logits/rejected": -3.1859374046325684, "logps/chosen": -558.2000122070312, "logps/rejected": -647.2000122070312, "loss": 2.9855, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 7.949999809265137, "rewards/margins": 8.662500381469727, "rewards/rejected": -0.7110351324081421, "step": 1520 }, { "epoch": 0.7183098591549296, "grad_norm": 238.33477876523006, "learning_rate": 1.5675534689619197e-07, "logits/chosen": -3.2249999046325684, "logits/rejected": -3.1953125, "logps/chosen": -560.4000244140625, "logps/rejected": -598.4000244140625, "loss": 3.1396, "rewards/accuracies": 0.51953125, "rewards/chosen": 6.5625, "rewards/margins": 7.431250095367432, "rewards/rejected": -0.86181640625, "step": 1530 }, { "epoch": 0.7230046948356808, "grad_norm": 281.3784430411394, "learning_rate": 1.541471048513302e-07, "logits/chosen": -3.1578125953674316, "logits/rejected": -3.1390624046325684, "logps/chosen": -564.0, "logps/rejected": -642.7999877929688, "loss": 2.8664, "rewards/accuracies": 0.524218738079071, "rewards/chosen": 7.71875, "rewards/margins": 8.193750381469727, "rewards/rejected": -0.4874023497104645, "step": 1540 }, { "epoch": 0.7276995305164319, "grad_norm": 206.24345933603442, "learning_rate": 1.5153886280646843e-07, "logits/chosen": -3.176562547683716, "logits/rejected": -3.1859374046325684, "logps/chosen": -525.2000122070312, "logps/rejected": -594.5999755859375, "loss": 3.2805, "rewards/accuracies": 0.5054687261581421, "rewards/chosen": 6.890625, "rewards/margins": 7.2734375, "rewards/rejected": -0.3929687440395355, "step": 1550 }, { "epoch": 0.7323943661971831, "grad_norm": 223.79553591154388, "learning_rate": 1.4893062076160668e-07, "logits/chosen": -3.2437500953674316, "logits/rejected": -3.2578125, "logps/chosen": -544.4000244140625, "logps/rejected": -647.4000244140625, "loss": 3.0315, "rewards/accuracies": 0.5171874761581421, "rewards/chosen": 7.109375, "rewards/margins": 9.737500190734863, "rewards/rejected": -2.630078077316284, "step": 1560 }, { "epoch": 0.7370892018779343, "grad_norm": 180.42701263391848, "learning_rate": 1.463223787167449e-07, "logits/chosen": -3.2109375, "logits/rejected": -3.1781249046325684, "logps/chosen": -560.0, "logps/rejected": -628.0, "loss": 3.1893, "rewards/accuracies": 0.5132812261581421, "rewards/chosen": 7.106249809265137, "rewards/margins": 7.204687595367432, "rewards/rejected": -0.11171875149011612, "step": 1570 }, { "epoch": 0.7417840375586855, "grad_norm": 270.2632681106118, "learning_rate": 1.4371413667188313e-07, "logits/chosen": -3.2421875, "logits/rejected": -3.200000047683716, "logps/chosen": -544.2000122070312, "logps/rejected": -668.4000244140625, "loss": 3.052, "rewards/accuracies": 0.542187511920929, "rewards/chosen": 7.303124904632568, "rewards/margins": 8.615625381469727, "rewards/rejected": -1.320703148841858, "step": 1580 }, { "epoch": 0.7464788732394366, "grad_norm": 222.47759597309053, "learning_rate": 1.4110589462702139e-07, "logits/chosen": -3.2718749046325684, "logits/rejected": -3.2562499046325684, "logps/chosen": -540.7999877929688, "logps/rejected": -607.4000244140625, "loss": 3.094, "rewards/accuracies": 0.5140625238418579, "rewards/chosen": 7.271874904632568, "rewards/margins": 7.453125, "rewards/rejected": -0.18339844048023224, "step": 1590 }, { "epoch": 0.7511737089201878, "grad_norm": 229.4125834407109, "learning_rate": 1.384976525821596e-07, "logits/chosen": -3.3046875, "logits/rejected": -3.2046875953674316, "logps/chosen": -559.0, "logps/rejected": -656.0, "loss": 3.2896, "rewards/accuracies": 0.49921876192092896, "rewards/chosen": 6.934374809265137, "rewards/margins": 8.587499618530273, "rewards/rejected": -1.6577637195587158, "step": 1600 }, { "epoch": 0.755868544600939, "grad_norm": 288.27015054065197, "learning_rate": 1.3588941053729787e-07, "logits/chosen": -3.176562547683716, "logits/rejected": -3.2015624046325684, "logps/chosen": -565.5999755859375, "logps/rejected": -662.7999877929688, "loss": 2.8581, "rewards/accuracies": 0.5234375, "rewards/chosen": 7.037499904632568, "rewards/margins": 8.634374618530273, "rewards/rejected": -1.6044433116912842, "step": 1610 }, { "epoch": 0.7605633802816901, "grad_norm": 249.0861315317849, "learning_rate": 1.332811684924361e-07, "logits/chosen": -3.2203125953674316, "logits/rejected": -3.176562547683716, "logps/chosen": -571.0, "logps/rejected": -634.7999877929688, "loss": 3.0383, "rewards/accuracies": 0.500781238079071, "rewards/chosen": 7.037499904632568, "rewards/margins": 8.121874809265137, "rewards/rejected": -1.0910155773162842, "step": 1620 }, { "epoch": 0.7652582159624414, "grad_norm": 336.27211809292953, "learning_rate": 1.3067292644757432e-07, "logits/chosen": -3.0859375, "logits/rejected": -3.096874952316284, "logps/chosen": -583.5999755859375, "logps/rejected": -684.0, "loss": 3.377, "rewards/accuracies": 0.4921875, "rewards/chosen": 6.793749809265137, "rewards/margins": 8.328125, "rewards/rejected": -1.5402343273162842, "step": 1630 }, { "epoch": 0.7699530516431925, "grad_norm": 296.7983824668959, "learning_rate": 1.2806468440271257e-07, "logits/chosen": -3.1875, "logits/rejected": -3.1390624046325684, "logps/chosen": -540.2000122070312, "logps/rejected": -625.0, "loss": 2.6146, "rewards/accuracies": 0.49921876192092896, "rewards/chosen": 7.290625095367432, "rewards/margins": 7.873437404632568, "rewards/rejected": -0.5884033441543579, "step": 1640 }, { "epoch": 0.7746478873239436, "grad_norm": 251.3318467521815, "learning_rate": 1.254564423578508e-07, "logits/chosen": -3.198437452316284, "logits/rejected": -3.1546874046325684, "logps/chosen": -547.7999877929688, "logps/rejected": -640.0, "loss": 2.6398, "rewards/accuracies": 0.53125, "rewards/chosen": 7.775000095367432, "rewards/margins": 9.34375, "rewards/rejected": -1.5627930164337158, "step": 1650 }, { "epoch": 0.7793427230046949, "grad_norm": 220.5249442173988, "learning_rate": 1.2284820031298902e-07, "logits/chosen": -3.207812547683716, "logits/rejected": -3.137500047683716, "logps/chosen": -583.5999755859375, "logps/rejected": -655.5999755859375, "loss": 2.8037, "rewards/accuracies": 0.5367187261581421, "rewards/chosen": 7.190625190734863, "rewards/margins": 10.106249809265137, "rewards/rejected": -2.914843797683716, "step": 1660 }, { "epoch": 0.784037558685446, "grad_norm": 275.17795087812124, "learning_rate": 1.2023995826812728e-07, "logits/chosen": -3.192187547683716, "logits/rejected": -3.1421875953674316, "logps/chosen": -550.4000244140625, "logps/rejected": -651.2000122070312, "loss": 3.0396, "rewards/accuracies": 0.514843761920929, "rewards/chosen": 7.331250190734863, "rewards/margins": 8.3671875, "rewards/rejected": -1.035742163658142, "step": 1670 }, { "epoch": 0.7887323943661971, "grad_norm": 227.48462965497947, "learning_rate": 1.176317162232655e-07, "logits/chosen": -3.2265625, "logits/rejected": -3.221874952316284, "logps/chosen": -539.0, "logps/rejected": -660.2000122070312, "loss": 3.0071, "rewards/accuracies": 0.5335937738418579, "rewards/chosen": 7.021874904632568, "rewards/margins": 8.871874809265137, "rewards/rejected": -1.8449218273162842, "step": 1680 }, { "epoch": 0.7934272300469484, "grad_norm": 244.60056821194243, "learning_rate": 1.1502347417840374e-07, "logits/chosen": -3.1734375953674316, "logits/rejected": -3.120312452316284, "logps/chosen": -551.4000244140625, "logps/rejected": -642.4000244140625, "loss": 3.3663, "rewards/accuracies": 0.51171875, "rewards/chosen": 6.449999809265137, "rewards/margins": 7.964062690734863, "rewards/rejected": -1.507226586341858, "step": 1690 }, { "epoch": 0.7981220657276995, "grad_norm": 260.2800758003632, "learning_rate": 1.1241523213354198e-07, "logits/chosen": -3.239062547683716, "logits/rejected": -3.1796875, "logps/chosen": -547.4000244140625, "logps/rejected": -656.7999877929688, "loss": 2.7516, "rewards/accuracies": 0.5015624761581421, "rewards/chosen": 7.525000095367432, "rewards/margins": 8.653124809265137, "rewards/rejected": -1.129980444908142, "step": 1700 }, { "epoch": 0.8028169014084507, "grad_norm": 537.1358038182277, "learning_rate": 1.0980699008868022e-07, "logits/chosen": -3.2093749046325684, "logits/rejected": -3.1640625, "logps/chosen": -566.0, "logps/rejected": -623.2000122070312, "loss": 3.5979, "rewards/accuracies": 0.516406238079071, "rewards/chosen": 7.125, "rewards/margins": 7.8046875, "rewards/rejected": -0.683398425579071, "step": 1710 }, { "epoch": 0.8075117370892019, "grad_norm": 226.49875791421053, "learning_rate": 1.0719874804381846e-07, "logits/chosen": -3.1546874046325684, "logits/rejected": -3.15625, "logps/chosen": -553.4000244140625, "logps/rejected": -593.0, "loss": 2.4992, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 7.696875095367432, "rewards/margins": 8.106249809265137, "rewards/rejected": -0.42167967557907104, "step": 1720 }, { "epoch": 0.812206572769953, "grad_norm": 208.7589975935314, "learning_rate": 1.045905059989567e-07, "logits/chosen": -3.1953125, "logits/rejected": -3.2125000953674316, "logps/chosen": -523.2000122070312, "logps/rejected": -582.4000244140625, "loss": 3.0689, "rewards/accuracies": 0.507031261920929, "rewards/chosen": 7.025000095367432, "rewards/margins": 7.3125, "rewards/rejected": -0.28300780057907104, "step": 1730 }, { "epoch": 0.8169014084507042, "grad_norm": 283.0112631119428, "learning_rate": 1.0198226395409494e-07, "logits/chosen": -3.1421875953674316, "logits/rejected": -3.082812547683716, "logps/chosen": -532.5999755859375, "logps/rejected": -614.0, "loss": 3.0318, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 7.181250095367432, "rewards/margins": 7.328125, "rewards/rejected": -0.13925781846046448, "step": 1740 }, { "epoch": 0.8215962441314554, "grad_norm": 214.30492446037974, "learning_rate": 9.937402190923318e-08, "logits/chosen": -3.129687547683716, "logits/rejected": -3.1156249046325684, "logps/chosen": -527.2000122070312, "logps/rejected": -649.5999755859375, "loss": 2.7622, "rewards/accuracies": 0.507031261920929, "rewards/chosen": 7.393750190734863, "rewards/margins": 9.978124618530273, "rewards/rejected": -2.5888671875, "step": 1750 }, { "epoch": 0.8262910798122066, "grad_norm": 239.2913674845102, "learning_rate": 9.676577986437141e-08, "logits/chosen": -3.1546874046325684, "logits/rejected": -3.0843749046325684, "logps/chosen": -581.2000122070312, "logps/rejected": -643.5999755859375, "loss": 3.0143, "rewards/accuracies": 0.5179687738418579, "rewards/chosen": 7.084374904632568, "rewards/margins": 9.653124809265137, "rewards/rejected": -2.5667967796325684, "step": 1760 }, { "epoch": 0.8309859154929577, "grad_norm": 261.7289874476712, "learning_rate": 9.415753781950965e-08, "logits/chosen": -3.206249952316284, "logits/rejected": -3.1953125, "logps/chosen": -535.7999877929688, "logps/rejected": -627.0, "loss": 3.1962, "rewards/accuracies": 0.52734375, "rewards/chosen": 7.068749904632568, "rewards/margins": 8.881250381469727, "rewards/rejected": -1.8058593273162842, "step": 1770 }, { "epoch": 0.8356807511737089, "grad_norm": 278.27984402382043, "learning_rate": 9.154929577464789e-08, "logits/chosen": -3.2328124046325684, "logits/rejected": -3.1875, "logps/chosen": -548.0, "logps/rejected": -632.0, "loss": 2.6488, "rewards/accuracies": 0.5367187261581421, "rewards/chosen": 7.543749809265137, "rewards/margins": 10.512499809265137, "rewards/rejected": -2.9665770530700684, "step": 1780 }, { "epoch": 0.8403755868544601, "grad_norm": 869.0934065585606, "learning_rate": 8.894105372978613e-08, "logits/chosen": -3.1484375, "logits/rejected": -3.15625, "logps/chosen": -572.7999877929688, "logps/rejected": -643.0, "loss": 3.476, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 7.753125190734863, "rewards/margins": 8.867968559265137, "rewards/rejected": -1.105078101158142, "step": 1790 }, { "epoch": 0.8450704225352113, "grad_norm": 177.18669328761814, "learning_rate": 8.633281168492435e-08, "logits/chosen": -3.2421875, "logits/rejected": -3.237499952316284, "logps/chosen": -546.2000122070312, "logps/rejected": -620.4000244140625, "loss": 2.5794, "rewards/accuracies": 0.542187511920929, "rewards/chosen": 7.628125190734863, "rewards/margins": 9.643750190734863, "rewards/rejected": -1.998046875, "step": 1800 }, { "epoch": 0.8497652582159625, "grad_norm": 320.75249929826015, "learning_rate": 8.372456964006259e-08, "logits/chosen": -3.145312547683716, "logits/rejected": -3.0953125953674316, "logps/chosen": -550.5999755859375, "logps/rejected": -637.2000122070312, "loss": 3.1061, "rewards/accuracies": 0.510937511920929, "rewards/chosen": 6.485937595367432, "rewards/margins": 8.915624618530273, "rewards/rejected": -2.4320311546325684, "step": 1810 }, { "epoch": 0.8544600938967136, "grad_norm": 298.35819523439216, "learning_rate": 8.111632759520083e-08, "logits/chosen": -3.1859374046325684, "logits/rejected": -3.1624999046325684, "logps/chosen": -586.5999755859375, "logps/rejected": -648.7999877929688, "loss": 2.7858, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 7.328125, "rewards/margins": 10.743749618530273, "rewards/rejected": -3.421337842941284, "step": 1820 }, { "epoch": 0.8591549295774648, "grad_norm": 260.70360175198056, "learning_rate": 7.850808555033907e-08, "logits/chosen": -3.198437452316284, "logits/rejected": -3.120312452316284, "logps/chosen": -554.0, "logps/rejected": -660.7999877929688, "loss": 2.9332, "rewards/accuracies": 0.4945312440395355, "rewards/chosen": 7.34375, "rewards/margins": 10.267187118530273, "rewards/rejected": -2.930468797683716, "step": 1830 }, { "epoch": 0.863849765258216, "grad_norm": 336.5324358151134, "learning_rate": 7.58998435054773e-08, "logits/chosen": -3.2203125953674316, "logits/rejected": -3.1484375, "logps/chosen": -565.4000244140625, "logps/rejected": -649.2000122070312, "loss": 3.133, "rewards/accuracies": 0.52734375, "rewards/chosen": 7.131249904632568, "rewards/margins": 9.384374618530273, "rewards/rejected": -2.2672362327575684, "step": 1840 }, { "epoch": 0.8685446009389671, "grad_norm": 311.72767948233286, "learning_rate": 7.329160146061554e-08, "logits/chosen": -3.239062547683716, "logits/rejected": -3.285937547683716, "logps/chosen": -541.4000244140625, "logps/rejected": -631.5999755859375, "loss": 3.441, "rewards/accuracies": 0.520312488079071, "rewards/chosen": 7.084374904632568, "rewards/margins": 7.295312404632568, "rewards/rejected": -0.20380859076976776, "step": 1850 }, { "epoch": 0.8732394366197183, "grad_norm": 295.9764703306369, "learning_rate": 7.068335941575378e-08, "logits/chosen": -3.143749952316284, "logits/rejected": -3.174999952316284, "logps/chosen": -544.5999755859375, "logps/rejected": -616.7999877929688, "loss": 3.3042, "rewards/accuracies": 0.47968751192092896, "rewards/chosen": 5.900000095367432, "rewards/margins": 7.462500095367432, "rewards/rejected": -1.560156226158142, "step": 1860 }, { "epoch": 0.8779342723004695, "grad_norm": 338.46224187494545, "learning_rate": 6.807511737089202e-08, "logits/chosen": -3.2359375953674316, "logits/rejected": -3.1937499046325684, "logps/chosen": -540.7999877929688, "logps/rejected": -641.2000122070312, "loss": 2.9456, "rewards/accuracies": 0.51171875, "rewards/chosen": 6.884375095367432, "rewards/margins": 8.481249809265137, "rewards/rejected": -1.6007812023162842, "step": 1870 }, { "epoch": 0.8826291079812206, "grad_norm": 221.38172987684965, "learning_rate": 6.546687532603024e-08, "logits/chosen": -3.2093749046325684, "logits/rejected": -3.176562547683716, "logps/chosen": -545.7999877929688, "logps/rejected": -642.0, "loss": 2.7403, "rewards/accuracies": 0.516406238079071, "rewards/chosen": 7.334374904632568, "rewards/margins": 8.307812690734863, "rewards/rejected": -0.9735351800918579, "step": 1880 }, { "epoch": 0.8873239436619719, "grad_norm": 240.72999782160588, "learning_rate": 6.285863328116848e-08, "logits/chosen": -3.190624952316284, "logits/rejected": -3.1578125953674316, "logps/chosen": -499.20001220703125, "logps/rejected": -654.4000244140625, "loss": 2.6771, "rewards/accuracies": 0.522656261920929, "rewards/chosen": 7.665625095367432, "rewards/margins": 10.246874809265137, "rewards/rejected": -2.581835985183716, "step": 1890 }, { "epoch": 0.892018779342723, "grad_norm": 232.53227092972915, "learning_rate": 6.025039123630672e-08, "logits/chosen": -3.200000047683716, "logits/rejected": -3.1578125953674316, "logps/chosen": -558.4000244140625, "logps/rejected": -610.0, "loss": 2.9898, "rewards/accuracies": 0.48359376192092896, "rewards/chosen": 7.087500095367432, "rewards/margins": 8.743749618530273, "rewards/rejected": -1.6506836414337158, "step": 1900 }, { "epoch": 0.8967136150234741, "grad_norm": 259.5993100857981, "learning_rate": 5.764214919144496e-08, "logits/chosen": -3.2109375, "logits/rejected": -3.2249999046325684, "logps/chosen": -567.2000122070312, "logps/rejected": -604.0, "loss": 3.2302, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 7.474999904632568, "rewards/margins": 7.65625, "rewards/rejected": -0.17148438096046448, "step": 1910 }, { "epoch": 0.9014084507042254, "grad_norm": 272.03270384560665, "learning_rate": 5.50339071465832e-08, "logits/chosen": -3.1953125, "logits/rejected": -3.21875, "logps/chosen": -561.0, "logps/rejected": -589.4000244140625, "loss": 3.044, "rewards/accuracies": 0.5101562738418579, "rewards/chosen": 7.71875, "rewards/margins": 7.587500095367432, "rewards/rejected": 0.13071289658546448, "step": 1920 }, { "epoch": 0.9061032863849765, "grad_norm": 347.3443264035594, "learning_rate": 5.2425665101721436e-08, "logits/chosen": -3.2046875953674316, "logits/rejected": -3.089062452316284, "logps/chosen": -564.2000122070312, "logps/rejected": -682.7999877929688, "loss": 3.1516, "rewards/accuracies": 0.500781238079071, "rewards/chosen": 6.857812404632568, "rewards/margins": 8.010937690734863, "rewards/rejected": -1.1521484851837158, "step": 1930 }, { "epoch": 0.9107981220657277, "grad_norm": 262.9951136929276, "learning_rate": 4.9817423056859675e-08, "logits/chosen": -3.2171874046325684, "logits/rejected": -3.1656250953674316, "logps/chosen": -559.0, "logps/rejected": -652.4000244140625, "loss": 2.9059, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": 7.0625, "rewards/margins": 9.043749809265137, "rewards/rejected": -1.981054663658142, "step": 1940 }, { "epoch": 0.9154929577464789, "grad_norm": 253.21404195030644, "learning_rate": 4.720918101199791e-08, "logits/chosen": -3.1390624046325684, "logits/rejected": -3.075000047683716, "logps/chosen": -573.2000122070312, "logps/rejected": -708.0, "loss": 3.2897, "rewards/accuracies": 0.514843761920929, "rewards/chosen": 6.673437595367432, "rewards/margins": 9.121874809265137, "rewards/rejected": -2.451367139816284, "step": 1950 }, { "epoch": 0.92018779342723, "grad_norm": 415.35862908474485, "learning_rate": 4.460093896713615e-08, "logits/chosen": -3.2265625, "logits/rejected": -3.1421875953674316, "logps/chosen": -549.2000122070312, "logps/rejected": -660.7999877929688, "loss": 3.0769, "rewards/accuracies": 0.5210937261581421, "rewards/chosen": 7.340624809265137, "rewards/margins": 8.334375381469727, "rewards/rejected": -0.9888671636581421, "step": 1960 }, { "epoch": 0.9248826291079812, "grad_norm": 211.19149426581953, "learning_rate": 4.199269692227438e-08, "logits/chosen": -3.1937499046325684, "logits/rejected": -3.2109375, "logps/chosen": -544.7999877929688, "logps/rejected": -597.2000122070312, "loss": 3.3169, "rewards/accuracies": 0.514843761920929, "rewards/chosen": 7.521874904632568, "rewards/margins": 7.584374904632568, "rewards/rejected": -0.05854492262005806, "step": 1970 }, { "epoch": 0.9295774647887324, "grad_norm": 246.97599476259882, "learning_rate": 3.938445487741262e-08, "logits/chosen": -3.1031250953674316, "logits/rejected": -3.059375047683716, "logps/chosen": -574.5999755859375, "logps/rejected": -638.0, "loss": 2.8105, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 7.303124904632568, "rewards/margins": 9.528124809265137, "rewards/rejected": -2.2249999046325684, "step": 1980 }, { "epoch": 0.9342723004694836, "grad_norm": 279.61160915300826, "learning_rate": 3.677621283255086e-08, "logits/chosen": -3.246875047683716, "logits/rejected": -3.2359375953674316, "logps/chosen": -567.7999877929688, "logps/rejected": -628.4000244140625, "loss": 3.3836, "rewards/accuracies": 0.5289062261581421, "rewards/chosen": 7.5, "rewards/margins": 8.337499618530273, "rewards/rejected": -0.842089831829071, "step": 1990 }, { "epoch": 0.9389671361502347, "grad_norm": 205.70295434604552, "learning_rate": 3.41679707876891e-08, "logits/chosen": -3.2203125953674316, "logits/rejected": -3.125, "logps/chosen": -554.4000244140625, "logps/rejected": -696.4000244140625, "loss": 3.1675, "rewards/accuracies": 0.5289062261581421, "rewards/chosen": 7.099999904632568, "rewards/margins": 10.25, "rewards/rejected": -3.1507811546325684, "step": 2000 }, { "epoch": 0.9436619718309859, "grad_norm": 254.40619070914613, "learning_rate": 3.155972874282733e-08, "logits/chosen": -3.237499952316284, "logits/rejected": -3.184375047683716, "logps/chosen": -533.2000122070312, "logps/rejected": -630.4000244140625, "loss": 3.343, "rewards/accuracies": 0.507031261920929, "rewards/chosen": 6.965624809265137, "rewards/margins": 7.915625095367432, "rewards/rejected": -0.9505859613418579, "step": 2010 }, { "epoch": 0.9483568075117371, "grad_norm": 220.33132929742834, "learning_rate": 2.8951486697965573e-08, "logits/chosen": -3.1937499046325684, "logits/rejected": -3.1656250953674316, "logps/chosen": -565.7999877929688, "logps/rejected": -644.7999877929688, "loss": 3.3804, "rewards/accuracies": 0.508593738079071, "rewards/chosen": 7.599999904632568, "rewards/margins": 7.759375095367432, "rewards/rejected": -0.16523437201976776, "step": 2020 }, { "epoch": 0.9530516431924883, "grad_norm": 272.37246310322604, "learning_rate": 2.634324465310381e-08, "logits/chosen": -3.203125, "logits/rejected": -3.1484375, "logps/chosen": -542.0, "logps/rejected": -635.5999755859375, "loss": 3.0197, "rewards/accuracies": 0.5179687738418579, "rewards/chosen": 6.753125190734863, "rewards/margins": 8.303125381469727, "rewards/rejected": -1.5498046875, "step": 2030 }, { "epoch": 0.9577464788732394, "grad_norm": 245.31063589482577, "learning_rate": 2.3735002608242045e-08, "logits/chosen": -3.21875, "logits/rejected": -3.184375047683716, "logps/chosen": -564.0, "logps/rejected": -606.7999877929688, "loss": 3.171, "rewards/accuracies": 0.5179687738418579, "rewards/chosen": 6.828125, "rewards/margins": 8.393750190734863, "rewards/rejected": -1.563085913658142, "step": 2040 }, { "epoch": 0.9624413145539906, "grad_norm": 245.63046415376243, "learning_rate": 2.1126760563380282e-08, "logits/chosen": -3.231250047683716, "logits/rejected": -3.265625, "logps/chosen": -545.7999877929688, "logps/rejected": -580.0, "loss": 3.0817, "rewards/accuracies": 0.5078125, "rewards/chosen": 7.0625, "rewards/margins": 7.109375, "rewards/rejected": -0.04501952975988388, "step": 2050 }, { "epoch": 0.9671361502347418, "grad_norm": 1711.8411407572087, "learning_rate": 1.8518518518518518e-08, "logits/chosen": -3.2171874046325684, "logits/rejected": -3.1234374046325684, "logps/chosen": -570.4000244140625, "logps/rejected": -638.0, "loss": 3.162, "rewards/accuracies": 0.514843761920929, "rewards/chosen": 7.546875, "rewards/margins": 8.324999809265137, "rewards/rejected": -0.786914050579071, "step": 2060 }, { "epoch": 0.971830985915493, "grad_norm": 267.19856736301995, "learning_rate": 1.5910276473656755e-08, "logits/chosen": -3.168750047683716, "logits/rejected": -3.0703125, "logps/chosen": -549.2000122070312, "logps/rejected": -635.5999755859375, "loss": 3.026, "rewards/accuracies": 0.508593738079071, "rewards/chosen": 7.5078125, "rewards/margins": 10.239843368530273, "rewards/rejected": -2.7281250953674316, "step": 2070 }, { "epoch": 0.9765258215962441, "grad_norm": 216.4347507416067, "learning_rate": 1.3302034428794991e-08, "logits/chosen": -3.253124952316284, "logits/rejected": -3.2671875953674316, "logps/chosen": -523.7999877929688, "logps/rejected": -592.5999755859375, "loss": 3.0064, "rewards/accuracies": 0.5140625238418579, "rewards/chosen": 7.137499809265137, "rewards/margins": 7.170312404632568, "rewards/rejected": -0.02773437462747097, "step": 2080 }, { "epoch": 0.9812206572769953, "grad_norm": 260.8350621859381, "learning_rate": 1.0693792383933229e-08, "logits/chosen": -3.2515625953674316, "logits/rejected": -3.2796874046325684, "logps/chosen": -555.5999755859375, "logps/rejected": -642.4000244140625, "loss": 2.6919, "rewards/accuracies": 0.510937511920929, "rewards/chosen": 7.987500190734863, "rewards/margins": 8.418749809265137, "rewards/rejected": -0.4306640625, "step": 2090 }, { "epoch": 0.9859154929577465, "grad_norm": 472.6432154571831, "learning_rate": 8.085550339071465e-09, "logits/chosen": -3.167187452316284, "logits/rejected": -3.1343750953674316, "logps/chosen": -573.4000244140625, "logps/rejected": -673.7999877929688, "loss": 2.7546, "rewards/accuracies": 0.530468761920929, "rewards/chosen": 7.675000190734863, "rewards/margins": 8.987500190734863, "rewards/rejected": -1.3237793445587158, "step": 2100 }, { "epoch": 0.9906103286384976, "grad_norm": 572.7812509906279, "learning_rate": 5.4773082942097025e-09, "logits/chosen": -3.214062452316284, "logits/rejected": -3.2125000953674316, "logps/chosen": -541.4000244140625, "logps/rejected": -610.7999877929688, "loss": 3.079, "rewards/accuracies": 0.522656261920929, "rewards/chosen": 7.303124904632568, "rewards/margins": 8.268750190734863, "rewards/rejected": -0.9698241949081421, "step": 2110 }, { "epoch": 0.9953051643192489, "grad_norm": 253.50766618560678, "learning_rate": 2.8690662493479393e-09, "logits/chosen": -3.206249952316284, "logits/rejected": -3.120312452316284, "logps/chosen": -548.7999877929688, "logps/rejected": -624.7999877929688, "loss": 3.3381, "rewards/accuracies": 0.520312488079071, "rewards/chosen": 7.053124904632568, "rewards/margins": 8.865625381469727, "rewards/rejected": -1.8049805164337158, "step": 2120 }, { "epoch": 1.0, "grad_norm": 349.3079567367046, "learning_rate": 2.608242044861763e-10, "logits/chosen": -3.160937547683716, "logits/rejected": -3.1968750953674316, "logps/chosen": -544.7999877929688, "logps/rejected": -640.7999877929688, "loss": 3.374, "rewards/accuracies": 0.5139768719673157, "rewards/chosen": 7.634375095367432, "rewards/margins": 11.021875381469727, "rewards/rejected": -3.389453172683716, "step": 2130 }, { "epoch": 1.0, "step": 2130, "total_flos": 0.0, "train_loss": 3.0317540988116196, "train_runtime": 18550.6661, "train_samples_per_second": 14.694, "train_steps_per_second": 0.115 } ], "logging_steps": 10, "max_steps": 2130, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }