Files
open-sci-ref-v0.02-1.7b-nem…/trainer_state.json
ModelHub XC 98b29f3224 初始化项目,由ModelHub XC社区提供模型
Model: ali-elganzory/open-sci-ref-v0.02-1.7b-nemotron-hq-300B-16k-DPO-Tulu3-decontaminated
Source: Original Platform
2026-05-27 18:48:19 +08:00

3239 lines
108 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2130,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004694835680751174,
"grad_norm": 312.0256219422113,
"learning_rate": 2.1126760563380282e-08,
"logits/chosen": -3.2750000953674316,
"logits/rejected": -3.1578125953674316,
"logps/chosen": -520.0,
"logps/rejected": -623.4000244140625,
"loss": 2.0975,
"rewards/accuracies": 0.171875,
"rewards/chosen": 0.06386718899011612,
"rewards/margins": 0.14946289360523224,
"rewards/rejected": -0.08598633110523224,
"step": 10
},
{
"epoch": 0.009389671361502348,
"grad_norm": 357.83885964439077,
"learning_rate": 4.460093896713615e-08,
"logits/chosen": -3.176562547683716,
"logits/rejected": -3.1796875,
"logps/chosen": -553.0,
"logps/rejected": -617.2000122070312,
"loss": 2.6057,
"rewards/accuracies": 0.23828125,
"rewards/chosen": 0.19448241591453552,
"rewards/margins": 0.205078125,
"rewards/rejected": -0.01123046875,
"step": 20
},
{
"epoch": 0.014084507042253521,
"grad_norm": 292.5595630275396,
"learning_rate": 6.807511737089202e-08,
"logits/chosen": -3.167187452316284,
"logits/rejected": -3.090625047683716,
"logps/chosen": -585.0,
"logps/rejected": -682.7999877929688,
"loss": 2.6069,
"rewards/accuracies": 0.24921874701976776,
"rewards/chosen": 0.27446287870407104,
"rewards/margins": 0.12026367336511612,
"rewards/rejected": 0.1552734375,
"step": 30
},
{
"epoch": 0.018779342723004695,
"grad_norm": 1477.179885439986,
"learning_rate": 9.154929577464789e-08,
"logits/chosen": -3.2281250953674316,
"logits/rejected": -3.1703124046325684,
"logps/chosen": -573.7999877929688,
"logps/rejected": -706.0,
"loss": 3.0694,
"rewards/accuracies": 0.24687500298023224,
"rewards/chosen": 0.17978516221046448,
"rewards/margins": 0.2806640565395355,
"rewards/rejected": -0.10068359225988388,
"step": 40
},
{
"epoch": 0.023474178403755867,
"grad_norm": 361.20032694705145,
"learning_rate": 1.1502347417840374e-07,
"logits/chosen": -3.1484375,
"logits/rejected": -3.1781249046325684,
"logps/chosen": -554.2000122070312,
"logps/rejected": -629.2000122070312,
"loss": 3.243,
"rewards/accuracies": 0.2421875,
"rewards/chosen": -0.10427246242761612,
"rewards/margins": -0.28144532442092896,
"rewards/rejected": 0.17734375596046448,
"step": 50
},
{
"epoch": 0.028169014084507043,
"grad_norm": 323.77333516562635,
"learning_rate": 1.384976525821596e-07,
"logits/chosen": -3.215625047683716,
"logits/rejected": -3.176562547683716,
"logps/chosen": -569.0,
"logps/rejected": -684.0,
"loss": 2.4599,
"rewards/accuracies": 0.23828125,
"rewards/chosen": 0.3821777403354645,
"rewards/margins": 0.4833007752895355,
"rewards/rejected": -0.10146484524011612,
"step": 60
},
{
"epoch": 0.03286384976525822,
"grad_norm": 276.7746357585784,
"learning_rate": 1.619718309859155e-07,
"logits/chosen": -3.104687452316284,
"logits/rejected": -3.0875000953674316,
"logps/chosen": -550.2000122070312,
"logps/rejected": -621.2000122070312,
"loss": 2.7151,
"rewards/accuracies": 0.25703126192092896,
"rewards/chosen": 0.08867187798023224,
"rewards/margins": 0.12685546278953552,
"rewards/rejected": -0.03798828274011612,
"step": 70
},
{
"epoch": 0.03755868544600939,
"grad_norm": 315.8250479371314,
"learning_rate": 1.8544600938967138e-07,
"logits/chosen": -3.2640624046325684,
"logits/rejected": -3.2421875,
"logps/chosen": -550.5999755859375,
"logps/rejected": -655.2000122070312,
"loss": 2.6591,
"rewards/accuracies": 0.23749999701976776,
"rewards/chosen": 0.11894531548023224,
"rewards/margins": 0.276123046875,
"rewards/rejected": -0.15708008408546448,
"step": 80
},
{
"epoch": 0.04225352112676056,
"grad_norm": 277.4297891268117,
"learning_rate": 2.089201877934272e-07,
"logits/chosen": -3.2906250953674316,
"logits/rejected": -3.262500047683716,
"logps/chosen": -527.7999877929688,
"logps/rejected": -603.4000244140625,
"loss": 2.9056,
"rewards/accuracies": 0.2578125,
"rewards/chosen": 0.42170411348342896,
"rewards/margins": -0.03627929836511612,
"rewards/rejected": 0.4574218690395355,
"step": 90
},
{
"epoch": 0.046948356807511735,
"grad_norm": 662.6082353712405,
"learning_rate": 2.323943661971831e-07,
"logits/chosen": -3.1734375953674316,
"logits/rejected": -3.192187547683716,
"logps/chosen": -574.0,
"logps/rejected": -603.7999877929688,
"loss": 2.8654,
"rewards/accuracies": 0.25078123807907104,
"rewards/chosen": 0.13359375298023224,
"rewards/margins": 0.18437500298023224,
"rewards/rejected": -0.05058593675494194,
"step": 100
},
{
"epoch": 0.051643192488262914,
"grad_norm": 300.25041205050195,
"learning_rate": 2.5586854460093895e-07,
"logits/chosen": -3.237499952316284,
"logits/rejected": -3.1859374046325684,
"logps/chosen": -550.5999755859375,
"logps/rejected": -657.5999755859375,
"loss": 2.8981,
"rewards/accuracies": 0.23828125,
"rewards/chosen": -0.21835938096046448,
"rewards/margins": -0.20292969048023224,
"rewards/rejected": -0.014208984561264515,
"step": 110
},
{
"epoch": 0.056338028169014086,
"grad_norm": 254.4772537543913,
"learning_rate": 2.7934272300469483e-07,
"logits/chosen": -3.2359375953674316,
"logits/rejected": -3.171875,
"logps/chosen": -561.0,
"logps/rejected": -649.2000122070312,
"loss": 2.9676,
"rewards/accuracies": 0.2710937559604645,
"rewards/chosen": 0.18388672173023224,
"rewards/margins": 0.2694335877895355,
"rewards/rejected": -0.08457031100988388,
"step": 120
},
{
"epoch": 0.06103286384976526,
"grad_norm": 359.56965403329684,
"learning_rate": 3.0281690140845066e-07,
"logits/chosen": -3.153125047683716,
"logits/rejected": -3.1484375,
"logps/chosen": -576.0,
"logps/rejected": -665.5999755859375,
"loss": 2.4338,
"rewards/accuracies": 0.28203123807907104,
"rewards/chosen": 0.49860841035842896,
"rewards/margins": 1.219873070716858,
"rewards/rejected": -0.722363293170929,
"step": 130
},
{
"epoch": 0.06572769953051644,
"grad_norm": 353.8273960746246,
"learning_rate": 3.2629107981220654e-07,
"logits/chosen": -3.151562452316284,
"logits/rejected": -3.1484375,
"logps/chosen": -564.4000244140625,
"logps/rejected": -689.2000122070312,
"loss": 3.216,
"rewards/accuracies": 0.24296875298023224,
"rewards/chosen": 0.553417980670929,
"rewards/margins": -0.5686279535293579,
"rewards/rejected": 1.121679663658142,
"step": 140
},
{
"epoch": 0.07042253521126761,
"grad_norm": 297.09345656551403,
"learning_rate": 3.497652582159624e-07,
"logits/chosen": -3.2109375,
"logits/rejected": -3.229687452316284,
"logps/chosen": -560.4000244140625,
"logps/rejected": -666.4000244140625,
"loss": 2.6835,
"rewards/accuracies": 0.2640624940395355,
"rewards/chosen": 0.43452149629592896,
"rewards/margins": 0.678906261920929,
"rewards/rejected": -0.24501952528953552,
"step": 150
},
{
"epoch": 0.07511737089201878,
"grad_norm": 324.5518620175009,
"learning_rate": 3.732394366197183e-07,
"logits/chosen": -3.2109375,
"logits/rejected": -3.1703124046325684,
"logps/chosen": -574.5999755859375,
"logps/rejected": -665.0,
"loss": 2.5045,
"rewards/accuracies": 0.27734375,
"rewards/chosen": 0.5849609375,
"rewards/margins": 0.823046863079071,
"rewards/rejected": -0.23779296875,
"step": 160
},
{
"epoch": 0.07981220657276995,
"grad_norm": 370.79323789587045,
"learning_rate": 3.967136150234742e-07,
"logits/chosen": -3.2203125953674316,
"logits/rejected": -3.1421875953674316,
"logps/chosen": -528.7999877929688,
"logps/rejected": -648.4000244140625,
"loss": 2.826,
"rewards/accuracies": 0.265625,
"rewards/chosen": 0.32744139432907104,
"rewards/margins": 0.3781494200229645,
"rewards/rejected": -0.04990234225988388,
"step": 170
},
{
"epoch": 0.08450704225352113,
"grad_norm": 493.3277569235484,
"learning_rate": 4.2018779342723e-07,
"logits/chosen": -3.167187452316284,
"logits/rejected": -3.112499952316284,
"logps/chosen": -517.5999755859375,
"logps/rejected": -644.4000244140625,
"loss": 2.4864,
"rewards/accuracies": 0.2867187559604645,
"rewards/chosen": 0.49970704317092896,
"rewards/margins": 0.4146484434604645,
"rewards/rejected": 0.085205078125,
"step": 180
},
{
"epoch": 0.0892018779342723,
"grad_norm": 285.4873436657188,
"learning_rate": 4.436619718309859e-07,
"logits/chosen": -3.171875,
"logits/rejected": -3.1578125953674316,
"logps/chosen": -553.0,
"logps/rejected": -625.2000122070312,
"loss": 2.5783,
"rewards/accuracies": 0.30156248807907104,
"rewards/chosen": 1.051855444908142,
"rewards/margins": 1.269140601158142,
"rewards/rejected": -0.21464844048023224,
"step": 190
},
{
"epoch": 0.09389671361502347,
"grad_norm": 349.67590705902006,
"learning_rate": 4.671361502347418e-07,
"logits/chosen": -3.159374952316284,
"logits/rejected": -3.1468749046325684,
"logps/chosen": -541.7999877929688,
"logps/rejected": -632.5999755859375,
"loss": 2.3192,
"rewards/accuracies": 0.31171876192092896,
"rewards/chosen": 1.356347680091858,
"rewards/margins": 1.372656226158142,
"rewards/rejected": -0.0159912109375,
"step": 200
},
{
"epoch": 0.09859154929577464,
"grad_norm": 302.1865630833012,
"learning_rate": 4.906103286384976e-07,
"logits/chosen": -3.192187547683716,
"logits/rejected": -3.151562452316284,
"logps/chosen": -555.2000122070312,
"logps/rejected": -669.5999755859375,
"loss": 2.5959,
"rewards/accuracies": 0.31640625,
"rewards/chosen": 1.054296851158142,
"rewards/margins": 1.321874976158142,
"rewards/rejected": -0.26777344942092896,
"step": 210
},
{
"epoch": 0.10328638497652583,
"grad_norm": 300.72724401497436,
"learning_rate": 4.984350547730829e-07,
"logits/chosen": -3.268749952316284,
"logits/rejected": -3.1937499046325684,
"logps/chosen": -569.5999755859375,
"logps/rejected": -648.0,
"loss": 2.2734,
"rewards/accuracies": 0.33671873807907104,
"rewards/chosen": 1.7156250476837158,
"rewards/margins": 1.9314453601837158,
"rewards/rejected": -0.21503905951976776,
"step": 220
},
{
"epoch": 0.107981220657277,
"grad_norm": 1102.3813264661942,
"learning_rate": 4.958268127282212e-07,
"logits/chosen": -3.200000047683716,
"logits/rejected": -3.1703124046325684,
"logps/chosen": -556.5999755859375,
"logps/rejected": -673.5999755859375,
"loss": 2.4886,
"rewards/accuracies": 0.33125001192092896,
"rewards/chosen": 2.0492186546325684,
"rewards/margins": 1.604589819908142,
"rewards/rejected": 0.4451660215854645,
"step": 230
},
{
"epoch": 0.11267605633802817,
"grad_norm": 282.37733202209813,
"learning_rate": 4.932185706833594e-07,
"logits/chosen": -3.1500000953674316,
"logits/rejected": -3.1812500953674316,
"logps/chosen": -513.4000244140625,
"logps/rejected": -563.0,
"loss": 2.3018,
"rewards/accuracies": 0.36406248807907104,
"rewards/chosen": 2.0999999046325684,
"rewards/margins": 2.255859375,
"rewards/rejected": -0.15898437798023224,
"step": 240
},
{
"epoch": 0.11737089201877934,
"grad_norm": 279.1099151963578,
"learning_rate": 4.906103286384976e-07,
"logits/chosen": -3.207812547683716,
"logits/rejected": -3.1656250953674316,
"logps/chosen": -552.4000244140625,
"logps/rejected": -684.0,
"loss": 2.4892,
"rewards/accuracies": 0.3812499940395355,
"rewards/chosen": 1.790624976158142,
"rewards/margins": 2.741015672683716,
"rewards/rejected": -0.951904296875,
"step": 250
},
{
"epoch": 0.12206572769953052,
"grad_norm": 287.6263722929828,
"learning_rate": 4.880020865936358e-07,
"logits/chosen": -3.167187452316284,
"logits/rejected": -3.1812500953674316,
"logps/chosen": -516.7999877929688,
"logps/rejected": -623.7999877929688,
"loss": 2.6498,
"rewards/accuracies": 0.3671875,
"rewards/chosen": 2.3648438453674316,
"rewards/margins": 2.208203077316284,
"rewards/rejected": 0.15483398735523224,
"step": 260
},
{
"epoch": 0.1267605633802817,
"grad_norm": 343.03300828691033,
"learning_rate": 4.853938445487741e-07,
"logits/chosen": -3.096874952316284,
"logits/rejected": -3.0718750953674316,
"logps/chosen": -554.0,
"logps/rejected": -658.7999877929688,
"loss": 2.7283,
"rewards/accuracies": 0.3851562440395355,
"rewards/chosen": 2.793750047683716,
"rewards/margins": 2.6507811546325684,
"rewards/rejected": 0.14072266221046448,
"step": 270
},
{
"epoch": 0.13145539906103287,
"grad_norm": 260.13644441380166,
"learning_rate": 4.827856025039123e-07,
"logits/chosen": -3.1781249046325684,
"logits/rejected": -3.174999952316284,
"logps/chosen": -553.4000244140625,
"logps/rejected": -664.0,
"loss": 2.5984,
"rewards/accuracies": 0.38671875,
"rewards/chosen": 3.018749952316284,
"rewards/margins": 3.389843702316284,
"rewards/rejected": -0.3746093809604645,
"step": 280
},
{
"epoch": 0.13615023474178403,
"grad_norm": 231.6706812957532,
"learning_rate": 4.801773604590506e-07,
"logits/chosen": -3.1390624046325684,
"logits/rejected": -3.1703124046325684,
"logps/chosen": -599.7999877929688,
"logps/rejected": -660.4000244140625,
"loss": 2.7207,
"rewards/accuracies": 0.4039062559604645,
"rewards/chosen": 2.717968702316284,
"rewards/margins": 3.4097657203674316,
"rewards/rejected": -0.693408191204071,
"step": 290
},
{
"epoch": 0.14084507042253522,
"grad_norm": 220.97715297066875,
"learning_rate": 4.775691184141888e-07,
"logits/chosen": -3.1578125953674316,
"logits/rejected": -3.182812452316284,
"logps/chosen": -572.5999755859375,
"logps/rejected": -656.5999755859375,
"loss": 2.8978,
"rewards/accuracies": 0.39765626192092896,
"rewards/chosen": 2.4898438453674316,
"rewards/margins": 4.083276271820068,
"rewards/rejected": -1.5941894054412842,
"step": 300
},
{
"epoch": 0.14553990610328638,
"grad_norm": 302.8311801574877,
"learning_rate": 4.749608763693271e-07,
"logits/chosen": -3.245312452316284,
"logits/rejected": -3.2015624046325684,
"logps/chosen": -537.7999877929688,
"logps/rejected": -635.2000122070312,
"loss": 2.9098,
"rewards/accuracies": 0.4195312559604645,
"rewards/chosen": 3.0171875953674316,
"rewards/margins": 2.634570360183716,
"rewards/rejected": 0.38300782442092896,
"step": 310
},
{
"epoch": 0.15023474178403756,
"grad_norm": 405.1955863726396,
"learning_rate": 4.7235263432446533e-07,
"logits/chosen": -3.2203125953674316,
"logits/rejected": -3.137500047683716,
"logps/chosen": -545.2000122070312,
"logps/rejected": -634.2000122070312,
"loss": 2.7902,
"rewards/accuracies": 0.41718751192092896,
"rewards/chosen": 3.2046875953674316,
"rewards/margins": 3.9234375953674316,
"rewards/rejected": -0.721484363079071,
"step": 320
},
{
"epoch": 0.15492957746478872,
"grad_norm": 244.65396594212294,
"learning_rate": 4.6974439227960353e-07,
"logits/chosen": -3.1875,
"logits/rejected": -3.1265625953674316,
"logps/chosen": -564.7999877929688,
"logps/rejected": -665.2000122070312,
"loss": 3.1965,
"rewards/accuracies": 0.4242187440395355,
"rewards/chosen": 3.067578077316284,
"rewards/margins": 4.0234375,
"rewards/rejected": -0.956298828125,
"step": 330
},
{
"epoch": 0.1596244131455399,
"grad_norm": 372.5758889280196,
"learning_rate": 4.671361502347418e-07,
"logits/chosen": -3.2281250953674316,
"logits/rejected": -3.2171874046325684,
"logps/chosen": -540.4000244140625,
"logps/rejected": -591.0,
"loss": 3.0711,
"rewards/accuracies": 0.41718751192092896,
"rewards/chosen": 3.635937452316284,
"rewards/margins": 2.765625,
"rewards/rejected": 0.8724609613418579,
"step": 340
},
{
"epoch": 0.1643192488262911,
"grad_norm": 268.8524534862668,
"learning_rate": 4.6452790818988004e-07,
"logits/chosen": -3.1890625953674316,
"logits/rejected": -3.176562547683716,
"logps/chosen": -575.2000122070312,
"logps/rejected": -662.4000244140625,
"loss": 3.209,
"rewards/accuracies": 0.4242187440395355,
"rewards/chosen": 3.598437547683716,
"rewards/margins": 3.905468702316284,
"rewards/rejected": -0.3086914122104645,
"step": 350
},
{
"epoch": 0.16901408450704225,
"grad_norm": 331.4831794798673,
"learning_rate": 4.6191966614501824e-07,
"logits/chosen": -3.1781249046325684,
"logits/rejected": -3.1546874046325684,
"logps/chosen": -544.0,
"logps/rejected": -637.4000244140625,
"loss": 2.7959,
"rewards/accuracies": 0.40625,
"rewards/chosen": 3.909374952316284,
"rewards/margins": 3.6156249046325684,
"rewards/rejected": 0.29326170682907104,
"step": 360
},
{
"epoch": 0.17370892018779344,
"grad_norm": 300.48013396150066,
"learning_rate": 4.593114241001565e-07,
"logits/chosen": -3.1265625953674316,
"logits/rejected": -3.1500000953674316,
"logps/chosen": -586.5999755859375,
"logps/rejected": -667.5999755859375,
"loss": 2.9779,
"rewards/accuracies": 0.41874998807907104,
"rewards/chosen": 4.057812690734863,
"rewards/margins": 4.220703125,
"rewards/rejected": -0.16093750298023224,
"step": 370
},
{
"epoch": 0.1784037558685446,
"grad_norm": 259.22801416212894,
"learning_rate": 4.5670318205529474e-07,
"logits/chosen": -3.1796875,
"logits/rejected": -3.1234374046325684,
"logps/chosen": -552.4000244140625,
"logps/rejected": -658.7999877929688,
"loss": 2.6823,
"rewards/accuracies": 0.4078125059604645,
"rewards/chosen": 3.3335938453674316,
"rewards/margins": 4.030468940734863,
"rewards/rejected": -0.696972668170929,
"step": 380
},
{
"epoch": 0.18309859154929578,
"grad_norm": 314.2218817447742,
"learning_rate": 4.54094940010433e-07,
"logits/chosen": -3.2015624046325684,
"logits/rejected": -3.160937547683716,
"logps/chosen": -565.5999755859375,
"logps/rejected": -674.4000244140625,
"loss": 3.0923,
"rewards/accuracies": 0.4398437440395355,
"rewards/chosen": 3.741015672683716,
"rewards/margins": 4.584374904632568,
"rewards/rejected": -0.849902331829071,
"step": 390
},
{
"epoch": 0.18779342723004694,
"grad_norm": 303.68185985451623,
"learning_rate": 4.514866979655712e-07,
"logits/chosen": -3.221874952316284,
"logits/rejected": -3.1546874046325684,
"logps/chosen": -553.5999755859375,
"logps/rejected": -639.2000122070312,
"loss": 2.8172,
"rewards/accuracies": 0.4593749940395355,
"rewards/chosen": 4.425000190734863,
"rewards/margins": 5.13671875,
"rewards/rejected": -0.7081298828125,
"step": 400
},
{
"epoch": 0.19248826291079812,
"grad_norm": 316.63227968980334,
"learning_rate": 4.4887845592070945e-07,
"logits/chosen": -3.1546874046325684,
"logits/rejected": -3.1234374046325684,
"logps/chosen": -542.4000244140625,
"logps/rejected": -677.5999755859375,
"loss": 2.8016,
"rewards/accuracies": 0.4554687440395355,
"rewards/chosen": 4.473437309265137,
"rewards/margins": 5.852343559265137,
"rewards/rejected": -1.3835937976837158,
"step": 410
},
{
"epoch": 0.19718309859154928,
"grad_norm": 294.08918909696143,
"learning_rate": 4.462702138758477e-07,
"logits/chosen": -3.1812500953674316,
"logits/rejected": -3.135937452316284,
"logps/chosen": -575.2000122070312,
"logps/rejected": -644.4000244140625,
"loss": 2.9949,
"rewards/accuracies": 0.43046873807907104,
"rewards/chosen": 4.242968559265137,
"rewards/margins": 4.417187690734863,
"rewards/rejected": -0.17177733778953552,
"step": 420
},
{
"epoch": 0.20187793427230047,
"grad_norm": 265.6457311420601,
"learning_rate": 4.436619718309859e-07,
"logits/chosen": -3.153125047683716,
"logits/rejected": -3.0703125,
"logps/chosen": -557.7999877929688,
"logps/rejected": -684.4000244140625,
"loss": 2.853,
"rewards/accuracies": 0.4281249940395355,
"rewards/chosen": 3.6234374046325684,
"rewards/margins": 4.569531440734863,
"rewards/rejected": -0.94482421875,
"step": 430
},
{
"epoch": 0.20657276995305165,
"grad_norm": 278.50429679548523,
"learning_rate": 4.4105372978612415e-07,
"logits/chosen": -3.2359375953674316,
"logits/rejected": -3.2171874046325684,
"logps/chosen": -572.0,
"logps/rejected": -673.2000122070312,
"loss": 3.1858,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 4.300000190734863,
"rewards/margins": 5.065625190734863,
"rewards/rejected": -0.7640625238418579,
"step": 440
},
{
"epoch": 0.2112676056338028,
"grad_norm": 230.10305977022676,
"learning_rate": 4.384454877412624e-07,
"logits/chosen": -3.260937452316284,
"logits/rejected": -3.153125047683716,
"logps/chosen": -556.4000244140625,
"logps/rejected": -748.7999877929688,
"loss": 3.0931,
"rewards/accuracies": 0.4359374940395355,
"rewards/chosen": 4.378125190734863,
"rewards/margins": 6.3046875,
"rewards/rejected": -1.936132788658142,
"step": 450
},
{
"epoch": 0.215962441314554,
"grad_norm": 263.6305024511116,
"learning_rate": 4.358372456964006e-07,
"logits/chosen": -3.160937547683716,
"logits/rejected": -3.15625,
"logps/chosen": -552.5999755859375,
"logps/rejected": -659.2000122070312,
"loss": 2.8102,
"rewards/accuracies": 0.46015626192092896,
"rewards/chosen": 4.801562309265137,
"rewards/margins": 4.841406345367432,
"rewards/rejected": -0.0338134765625,
"step": 460
},
{
"epoch": 0.22065727699530516,
"grad_norm": 324.489924430666,
"learning_rate": 4.3322900365153886e-07,
"logits/chosen": -3.2640624046325684,
"logits/rejected": -3.192187547683716,
"logps/chosen": -593.5999755859375,
"logps/rejected": -655.5999755859375,
"loss": 3.3795,
"rewards/accuracies": 0.44062501192092896,
"rewards/chosen": 4.314062595367432,
"rewards/margins": 4.260937690734863,
"rewards/rejected": 0.05312500149011612,
"step": 470
},
{
"epoch": 0.22535211267605634,
"grad_norm": 255.4091492673624,
"learning_rate": 4.306207616066771e-07,
"logits/chosen": -3.2015624046325684,
"logits/rejected": -3.174999952316284,
"logps/chosen": -542.5999755859375,
"logps/rejected": -636.2000122070312,
"loss": 2.8316,
"rewards/accuracies": 0.4281249940395355,
"rewards/chosen": 4.528124809265137,
"rewards/margins": 4.978125095367432,
"rewards/rejected": -0.44755858182907104,
"step": 480
},
{
"epoch": 0.2300469483568075,
"grad_norm": 292.3375710045175,
"learning_rate": 4.280125195618153e-07,
"logits/chosen": -3.1953125,
"logits/rejected": -3.1312499046325684,
"logps/chosen": -548.2000122070312,
"logps/rejected": -663.5999755859375,
"loss": 3.1282,
"rewards/accuracies": 0.4476562440395355,
"rewards/chosen": 4.767187595367432,
"rewards/margins": 4.903124809265137,
"rewards/rejected": -0.13076171278953552,
"step": 490
},
{
"epoch": 0.2347417840375587,
"grad_norm": 276.8383687595893,
"learning_rate": 4.2540427751695357e-07,
"logits/chosen": -3.1234374046325684,
"logits/rejected": -3.135937452316284,
"logps/chosen": -563.7999877929688,
"logps/rejected": -639.2000122070312,
"loss": 2.8798,
"rewards/accuracies": 0.45390623807907104,
"rewards/chosen": 4.951562404632568,
"rewards/margins": 5.360937595367432,
"rewards/rejected": -0.40961915254592896,
"step": 500
},
{
"epoch": 0.23943661971830985,
"grad_norm": 240.08669970605214,
"learning_rate": 4.227960354720918e-07,
"logits/chosen": -3.262500047683716,
"logits/rejected": -3.200000047683716,
"logps/chosen": -542.2000122070312,
"logps/rejected": -677.5999755859375,
"loss": 2.9689,
"rewards/accuracies": 0.4429687559604645,
"rewards/chosen": 4.881249904632568,
"rewards/margins": 4.892480373382568,
"rewards/rejected": -0.00820312462747097,
"step": 510
},
{
"epoch": 0.24413145539906103,
"grad_norm": 316.99746852995713,
"learning_rate": 4.2018779342723e-07,
"logits/chosen": -3.160937547683716,
"logits/rejected": -3.1703124046325684,
"logps/chosen": -562.4000244140625,
"logps/rejected": -626.0,
"loss": 3.6521,
"rewards/accuracies": 0.4359374940395355,
"rewards/chosen": 4.800000190734863,
"rewards/margins": 3.233593702316284,
"rewards/rejected": 1.5671875476837158,
"step": 520
},
{
"epoch": 0.24882629107981222,
"grad_norm": 270.0542704810816,
"learning_rate": 4.1757955138236827e-07,
"logits/chosen": -3.253124952316284,
"logits/rejected": -3.182812452316284,
"logps/chosen": -541.5999755859375,
"logps/rejected": -660.0,
"loss": 2.8176,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 5.292187690734863,
"rewards/margins": 6.079687595367432,
"rewards/rejected": -0.780078113079071,
"step": 530
},
{
"epoch": 0.2535211267605634,
"grad_norm": 295.63002336313383,
"learning_rate": 4.149713093375065e-07,
"logits/chosen": -3.2093749046325684,
"logits/rejected": -3.176562547683716,
"logps/chosen": -543.5999755859375,
"logps/rejected": -606.0,
"loss": 3.0713,
"rewards/accuracies": 0.453125,
"rewards/chosen": 5.207812309265137,
"rewards/margins": 5.46875,
"rewards/rejected": -0.25507813692092896,
"step": 540
},
{
"epoch": 0.25821596244131456,
"grad_norm": 310.35958187748247,
"learning_rate": 4.123630672926447e-07,
"logits/chosen": -3.231250047683716,
"logits/rejected": -3.1968750953674316,
"logps/chosen": -532.0,
"logps/rejected": -653.0,
"loss": 3.0968,
"rewards/accuracies": 0.44218748807907104,
"rewards/chosen": 4.865624904632568,
"rewards/margins": 5.584374904632568,
"rewards/rejected": -0.7201172113418579,
"step": 550
},
{
"epoch": 0.26291079812206575,
"grad_norm": 312.5796454732532,
"learning_rate": 4.09754825247783e-07,
"logits/chosen": -3.215625047683716,
"logits/rejected": -3.1578125953674316,
"logps/chosen": -549.2000122070312,
"logps/rejected": -630.0,
"loss": 2.7618,
"rewards/accuracies": 0.46562498807907104,
"rewards/chosen": 5.068749904632568,
"rewards/margins": 6.120312690734863,
"rewards/rejected": -1.05029296875,
"step": 560
},
{
"epoch": 0.2676056338028169,
"grad_norm": 250.4466960516743,
"learning_rate": 4.0714658320292123e-07,
"logits/chosen": -3.28125,
"logits/rejected": -3.2093749046325684,
"logps/chosen": -544.0,
"logps/rejected": -643.7999877929688,
"loss": 2.8375,
"rewards/accuracies": 0.4585937559604645,
"rewards/chosen": 5.240624904632568,
"rewards/margins": 5.368750095367432,
"rewards/rejected": -0.12744140625,
"step": 570
},
{
"epoch": 0.27230046948356806,
"grad_norm": 263.6284568812628,
"learning_rate": 4.045383411580595e-07,
"logits/chosen": -3.2249999046325684,
"logits/rejected": -3.190624952316284,
"logps/chosen": -529.5999755859375,
"logps/rejected": -646.4000244140625,
"loss": 3.1771,
"rewards/accuracies": 0.46171873807907104,
"rewards/chosen": 5.1875,
"rewards/margins": 5.52734375,
"rewards/rejected": -0.3402343690395355,
"step": 580
},
{
"epoch": 0.27699530516431925,
"grad_norm": 247.26597055635676,
"learning_rate": 4.019300991131977e-07,
"logits/chosen": -3.106250047683716,
"logits/rejected": -3.0843749046325684,
"logps/chosen": -543.5999755859375,
"logps/rejected": -643.2000122070312,
"loss": 3.1357,
"rewards/accuracies": 0.4710937440395355,
"rewards/chosen": 5.296093940734863,
"rewards/margins": 5.807812690734863,
"rewards/rejected": -0.5101562738418579,
"step": 590
},
{
"epoch": 0.28169014084507044,
"grad_norm": 343.6934307609959,
"learning_rate": 3.9932185706833594e-07,
"logits/chosen": -3.2593750953674316,
"logits/rejected": -3.1859374046325684,
"logps/chosen": -557.7999877929688,
"logps/rejected": -634.4000244140625,
"loss": 3.2729,
"rewards/accuracies": 0.4609375,
"rewards/chosen": 5.770312309265137,
"rewards/margins": 5.9296875,
"rewards/rejected": -0.15639647841453552,
"step": 600
},
{
"epoch": 0.2863849765258216,
"grad_norm": 261.3531599965703,
"learning_rate": 3.967136150234742e-07,
"logits/chosen": -3.207812547683716,
"logits/rejected": -3.215625047683716,
"logps/chosen": -559.5999755859375,
"logps/rejected": -643.0,
"loss": 3.269,
"rewards/accuracies": 0.47343748807907104,
"rewards/chosen": 5.671875,
"rewards/margins": 6.106249809265137,
"rewards/rejected": -0.42988282442092896,
"step": 610
},
{
"epoch": 0.29107981220657275,
"grad_norm": 236.1085139278817,
"learning_rate": 3.941053729786124e-07,
"logits/chosen": -3.151562452316284,
"logits/rejected": -3.1078124046325684,
"logps/chosen": -569.7999877929688,
"logps/rejected": -654.7999877929688,
"loss": 2.7709,
"rewards/accuracies": 0.484375,
"rewards/chosen": 5.900000095367432,
"rewards/margins": 7.834374904632568,
"rewards/rejected": -1.9337890148162842,
"step": 620
},
{
"epoch": 0.29577464788732394,
"grad_norm": 1668.1858891625257,
"learning_rate": 3.9149713093375064e-07,
"logits/chosen": -3.1640625,
"logits/rejected": -3.125,
"logps/chosen": -554.0,
"logps/rejected": -673.5999755859375,
"loss": 2.8208,
"rewards/accuracies": 0.48359376192092896,
"rewards/chosen": 6.259375095367432,
"rewards/margins": 7.706250190734863,
"rewards/rejected": -1.448828101158142,
"step": 630
},
{
"epoch": 0.3004694835680751,
"grad_norm": 274.4990815783928,
"learning_rate": 3.888888888888889e-07,
"logits/chosen": -3.2015624046325684,
"logits/rejected": -3.1546874046325684,
"logps/chosen": -545.4000244140625,
"logps/rejected": -686.4000244140625,
"loss": 2.9131,
"rewards/accuracies": 0.49140626192092896,
"rewards/chosen": 5.653124809265137,
"rewards/margins": 7.78125,
"rewards/rejected": -2.1275634765625,
"step": 640
},
{
"epoch": 0.3051643192488263,
"grad_norm": 264.72591172971954,
"learning_rate": 3.862806468440271e-07,
"logits/chosen": -3.2874999046325684,
"logits/rejected": -3.2593750953674316,
"logps/chosen": -505.0,
"logps/rejected": -605.2000122070312,
"loss": 2.9616,
"rewards/accuracies": 0.4671874940395355,
"rewards/chosen": 5.503125190734863,
"rewards/margins": 5.125,
"rewards/rejected": 0.37983399629592896,
"step": 650
},
{
"epoch": 0.30985915492957744,
"grad_norm": 302.6815782764531,
"learning_rate": 3.8367240479916535e-07,
"logits/chosen": -3.2171874046325684,
"logits/rejected": -3.254687547683716,
"logps/chosen": -538.7999877929688,
"logps/rejected": -611.0,
"loss": 2.8749,
"rewards/accuracies": 0.50390625,
"rewards/chosen": 6.021874904632568,
"rewards/margins": 6.935937404632568,
"rewards/rejected": -0.921313464641571,
"step": 660
},
{
"epoch": 0.3145539906103286,
"grad_norm": 285.79486724420923,
"learning_rate": 3.810641627543036e-07,
"logits/chosen": -3.2203125953674316,
"logits/rejected": -3.1859374046325684,
"logps/chosen": -554.0,
"logps/rejected": -647.0,
"loss": 3.0776,
"rewards/accuracies": 0.48515623807907104,
"rewards/chosen": 5.768750190734863,
"rewards/margins": 7.626562595367432,
"rewards/rejected": -1.8566405773162842,
"step": 670
},
{
"epoch": 0.3192488262910798,
"grad_norm": 267.03878626735144,
"learning_rate": 3.784559207094418e-07,
"logits/chosen": -3.2171874046325684,
"logits/rejected": -3.1859374046325684,
"logps/chosen": -564.0,
"logps/rejected": -634.4000244140625,
"loss": 3.018,
"rewards/accuracies": 0.46015626192092896,
"rewards/chosen": 5.534375190734863,
"rewards/margins": 5.573437690734863,
"rewards/rejected": -0.03432617336511612,
"step": 680
},
{
"epoch": 0.323943661971831,
"grad_norm": 241.42149101925932,
"learning_rate": 3.7584767866458005e-07,
"logits/chosen": -3.167187452316284,
"logits/rejected": -3.1468749046325684,
"logps/chosen": -558.4000244140625,
"logps/rejected": -612.2000122070312,
"loss": 3.2028,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 5.759375095367432,
"rewards/margins": 6.537499904632568,
"rewards/rejected": -0.775390625,
"step": 690
},
{
"epoch": 0.3286384976525822,
"grad_norm": 347.3003027177567,
"learning_rate": 3.732394366197183e-07,
"logits/chosen": -3.253124952316284,
"logits/rejected": -3.206249952316284,
"logps/chosen": -581.0,
"logps/rejected": -622.7999877929688,
"loss": 3.4616,
"rewards/accuracies": 0.4789062440395355,
"rewards/chosen": 5.829687595367432,
"rewards/margins": 6.435937404632568,
"rewards/rejected": -0.612841784954071,
"step": 700
},
{
"epoch": 0.3333333333333333,
"grad_norm": 370.0683750172563,
"learning_rate": 3.706311945748565e-07,
"logits/chosen": -3.2328124046325684,
"logits/rejected": -3.1781249046325684,
"logps/chosen": -594.4000244140625,
"logps/rejected": -631.5999755859375,
"loss": 3.6096,
"rewards/accuracies": 0.4984374940395355,
"rewards/chosen": 5.745312690734863,
"rewards/margins": 6.145312309265137,
"rewards/rejected": -0.40507811307907104,
"step": 710
},
{
"epoch": 0.3380281690140845,
"grad_norm": 297.31120098621625,
"learning_rate": 3.6802295252999476e-07,
"logits/chosen": -3.2015624046325684,
"logits/rejected": -3.0640625953674316,
"logps/chosen": -571.7999877929688,
"logps/rejected": -682.2000122070312,
"loss": 3.2536,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": 6.015625,
"rewards/margins": 7.296875,
"rewards/rejected": -1.282324194908142,
"step": 720
},
{
"epoch": 0.3427230046948357,
"grad_norm": 909.3109580482733,
"learning_rate": 3.65414710485133e-07,
"logits/chosen": -3.184375047683716,
"logits/rejected": -3.2125000953674316,
"logps/chosen": -566.2000122070312,
"logps/rejected": -619.2000122070312,
"loss": 3.2271,
"rewards/accuracies": 0.48828125,
"rewards/chosen": 5.009375095367432,
"rewards/margins": 7.012499809265137,
"rewards/rejected": -2.0113282203674316,
"step": 730
},
{
"epoch": 0.3474178403755869,
"grad_norm": 251.82987219431644,
"learning_rate": 3.6280646844027127e-07,
"logits/chosen": -3.160937547683716,
"logits/rejected": -3.1812500953674316,
"logps/chosen": -548.4000244140625,
"logps/rejected": -692.7999877929688,
"loss": 2.4756,
"rewards/accuracies": 0.5023437738418579,
"rewards/chosen": 6.371874809265137,
"rewards/margins": 8.1171875,
"rewards/rejected": -1.7410156726837158,
"step": 740
},
{
"epoch": 0.352112676056338,
"grad_norm": 325.56963232354997,
"learning_rate": 3.6019822639540947e-07,
"logits/chosen": -3.192187547683716,
"logits/rejected": -3.128124952316284,
"logps/chosen": -577.0,
"logps/rejected": -614.4000244140625,
"loss": 3.3683,
"rewards/accuracies": 0.48046875,
"rewards/chosen": 5.904687404632568,
"rewards/margins": 6.515625,
"rewards/rejected": -0.6114746332168579,
"step": 750
},
{
"epoch": 0.3568075117370892,
"grad_norm": 262.68653804411235,
"learning_rate": 3.575899843505477e-07,
"logits/chosen": -3.171875,
"logits/rejected": -3.121875047683716,
"logps/chosen": -553.7999877929688,
"logps/rejected": -606.4000244140625,
"loss": 3.2532,
"rewards/accuracies": 0.50390625,
"rewards/chosen": 5.689062595367432,
"rewards/margins": 6.7109375,
"rewards/rejected": -1.021875023841858,
"step": 760
},
{
"epoch": 0.3615023474178404,
"grad_norm": 308.5027585504221,
"learning_rate": 3.5498174230568597e-07,
"logits/chosen": -3.174999952316284,
"logits/rejected": -3.143749952316284,
"logps/chosen": -549.5999755859375,
"logps/rejected": -639.5999755859375,
"loss": 2.7304,
"rewards/accuracies": 0.47734373807907104,
"rewards/chosen": 5.609375,
"rewards/margins": 7.015625,
"rewards/rejected": -1.402978539466858,
"step": 770
},
{
"epoch": 0.36619718309859156,
"grad_norm": 304.36534286887735,
"learning_rate": 3.5237350026082417e-07,
"logits/chosen": -3.128124952316284,
"logits/rejected": -3.125,
"logps/chosen": -559.5999755859375,
"logps/rejected": -600.0,
"loss": 3.1165,
"rewards/accuracies": 0.4859375059604645,
"rewards/chosen": 5.96875,
"rewards/margins": 5.890625,
"rewards/rejected": 0.07460937649011612,
"step": 780
},
{
"epoch": 0.37089201877934275,
"grad_norm": 191.4390918096237,
"learning_rate": 3.497652582159624e-07,
"logits/chosen": -3.1859374046325684,
"logits/rejected": -3.151562452316284,
"logps/chosen": -555.7999877929688,
"logps/rejected": -659.2000122070312,
"loss": 3.0015,
"rewards/accuracies": 0.4859375059604645,
"rewards/chosen": 6.221875190734863,
"rewards/margins": 7.193749904632568,
"rewards/rejected": -0.9771484136581421,
"step": 790
},
{
"epoch": 0.3755868544600939,
"grad_norm": 425.7166621373832,
"learning_rate": 3.471570161711007e-07,
"logits/chosen": -3.214062452316284,
"logits/rejected": -3.1484375,
"logps/chosen": -586.2000122070312,
"logps/rejected": -674.5999755859375,
"loss": 3.3681,
"rewards/accuracies": 0.49921876192092896,
"rewards/chosen": 6.193749904632568,
"rewards/margins": 7.546875,
"rewards/rejected": -1.3479492664337158,
"step": 800
},
{
"epoch": 0.38028169014084506,
"grad_norm": 300.49720808118195,
"learning_rate": 3.445487741262389e-07,
"logits/chosen": -3.2015624046325684,
"logits/rejected": -3.1968750953674316,
"logps/chosen": -538.2000122070312,
"logps/rejected": -643.5999755859375,
"loss": 2.6164,
"rewards/accuracies": 0.4976562559604645,
"rewards/chosen": 5.670312404632568,
"rewards/margins": 7.168749809265137,
"rewards/rejected": -1.4990234375,
"step": 810
},
{
"epoch": 0.38497652582159625,
"grad_norm": 273.38949554271784,
"learning_rate": 3.4194053208137713e-07,
"logits/chosen": -3.2171874046325684,
"logits/rejected": -3.1734375953674316,
"logps/chosen": -549.2000122070312,
"logps/rejected": -615.7999877929688,
"loss": 2.9592,
"rewards/accuracies": 0.49140626192092896,
"rewards/chosen": 5.9140625,
"rewards/margins": 7.162499904632568,
"rewards/rejected": -1.2492187023162842,
"step": 820
},
{
"epoch": 0.38967136150234744,
"grad_norm": 271.8475642928366,
"learning_rate": 3.393322900365154e-07,
"logits/chosen": -3.1890625953674316,
"logits/rejected": -3.096874952316284,
"logps/chosen": -521.5999755859375,
"logps/rejected": -621.2000122070312,
"loss": 3.5942,
"rewards/accuracies": 0.48515623807907104,
"rewards/chosen": 5.356249809265137,
"rewards/margins": 6.192187309265137,
"rewards/rejected": -0.8359375,
"step": 830
},
{
"epoch": 0.39436619718309857,
"grad_norm": 258.8715832438056,
"learning_rate": 3.367240479916536e-07,
"logits/chosen": -3.1890625953674316,
"logits/rejected": -3.1312499046325684,
"logps/chosen": -586.7999877929688,
"logps/rejected": -662.7999877929688,
"loss": 3.1732,
"rewards/accuracies": 0.4945312440395355,
"rewards/chosen": 5.978125095367432,
"rewards/margins": 6.753125190734863,
"rewards/rejected": -0.7646484375,
"step": 840
},
{
"epoch": 0.39906103286384975,
"grad_norm": 277.01025031111124,
"learning_rate": 3.3411580594679184e-07,
"logits/chosen": -3.262500047683716,
"logits/rejected": -3.1937499046325684,
"logps/chosen": -537.4000244140625,
"logps/rejected": -634.0,
"loss": 3.1979,
"rewards/accuracies": 0.49921876192092896,
"rewards/chosen": 6.203125,
"rewards/margins": 6.993750095367432,
"rewards/rejected": -0.7895263433456421,
"step": 850
},
{
"epoch": 0.40375586854460094,
"grad_norm": 262.8271384549558,
"learning_rate": 3.315075639019301e-07,
"logits/chosen": -3.1812500953674316,
"logits/rejected": -3.184375047683716,
"logps/chosen": -541.2000122070312,
"logps/rejected": -632.4000244140625,
"loss": 2.9426,
"rewards/accuracies": 0.51171875,
"rewards/chosen": 6.125,
"rewards/margins": 7.006249904632568,
"rewards/rejected": -0.8836669921875,
"step": 860
},
{
"epoch": 0.4084507042253521,
"grad_norm": 257.76714659870345,
"learning_rate": 3.288993218570683e-07,
"logits/chosen": -3.207812547683716,
"logits/rejected": -3.140625,
"logps/chosen": -544.4000244140625,
"logps/rejected": -617.2000122070312,
"loss": 3.0632,
"rewards/accuracies": 0.4867187440395355,
"rewards/chosen": 6.678124904632568,
"rewards/margins": 6.890625,
"rewards/rejected": -0.21953125298023224,
"step": 870
},
{
"epoch": 0.4131455399061033,
"grad_norm": 270.3906090837782,
"learning_rate": 3.2629107981220654e-07,
"logits/chosen": -3.2249999046325684,
"logits/rejected": -3.1343750953674316,
"logps/chosen": -561.5999755859375,
"logps/rejected": -642.5999755859375,
"loss": 3.1866,
"rewards/accuracies": 0.4976562559604645,
"rewards/chosen": 6.128125190734863,
"rewards/margins": 7.431250095367432,
"rewards/rejected": -1.2996094226837158,
"step": 880
},
{
"epoch": 0.41784037558685444,
"grad_norm": 361.90483580549267,
"learning_rate": 3.236828377673448e-07,
"logits/chosen": -3.1640625,
"logits/rejected": -3.2015624046325684,
"logps/chosen": -588.2000122070312,
"logps/rejected": -631.0,
"loss": 3.6112,
"rewards/accuracies": 0.5132812261581421,
"rewards/chosen": 6.135937690734863,
"rewards/margins": 6.685937404632568,
"rewards/rejected": -0.5540527105331421,
"step": 890
},
{
"epoch": 0.4225352112676056,
"grad_norm": 212.9363956810051,
"learning_rate": 3.2107459572248305e-07,
"logits/chosen": -3.168750047683716,
"logits/rejected": -3.168750047683716,
"logps/chosen": -563.0,
"logps/rejected": -641.2000122070312,
"loss": 3.2769,
"rewards/accuracies": 0.4984374940395355,
"rewards/chosen": 5.487500190734863,
"rewards/margins": 7.017187595367432,
"rewards/rejected": -1.535375952720642,
"step": 900
},
{
"epoch": 0.4272300469483568,
"grad_norm": 362.81405145397866,
"learning_rate": 3.1846635367762125e-07,
"logits/chosen": -3.159374952316284,
"logits/rejected": -3.151562452316284,
"logps/chosen": -567.4000244140625,
"logps/rejected": -667.7999877929688,
"loss": 3.3387,
"rewards/accuracies": 0.4906249940395355,
"rewards/chosen": 6.262499809265137,
"rewards/margins": 6.796875,
"rewards/rejected": -0.536425769329071,
"step": 910
},
{
"epoch": 0.431924882629108,
"grad_norm": 301.9066438976272,
"learning_rate": 3.158581116327595e-07,
"logits/chosen": -3.2015624046325684,
"logits/rejected": -3.1703124046325684,
"logps/chosen": -559.5999755859375,
"logps/rejected": -698.0,
"loss": 3.2574,
"rewards/accuracies": 0.5054687261581421,
"rewards/chosen": 6.571875095367432,
"rewards/margins": 7.837500095367432,
"rewards/rejected": -1.2675292491912842,
"step": 920
},
{
"epoch": 0.43661971830985913,
"grad_norm": 261.95731135022044,
"learning_rate": 3.1324986958789775e-07,
"logits/chosen": -3.1640625,
"logits/rejected": -3.1468749046325684,
"logps/chosen": -526.7999877929688,
"logps/rejected": -615.5999755859375,
"loss": 3.0444,
"rewards/accuracies": 0.48359376192092896,
"rewards/chosen": 6.434374809265137,
"rewards/margins": 6.09375,
"rewards/rejected": 0.3431640565395355,
"step": 930
},
{
"epoch": 0.4413145539906103,
"grad_norm": 255.23362051238044,
"learning_rate": 3.1064162754303595e-07,
"logits/chosen": -3.1390624046325684,
"logits/rejected": -3.09375,
"logps/chosen": -571.7999877929688,
"logps/rejected": -674.7999877929688,
"loss": 2.7951,
"rewards/accuracies": 0.4945312440395355,
"rewards/chosen": 6.862500190734863,
"rewards/margins": 9.096875190734863,
"rewards/rejected": -2.2445311546325684,
"step": 940
},
{
"epoch": 0.4460093896713615,
"grad_norm": 246.61090183265875,
"learning_rate": 3.080333854981742e-07,
"logits/chosen": -3.2093749046325684,
"logits/rejected": -3.143749952316284,
"logps/chosen": -547.0,
"logps/rejected": -617.5999755859375,
"loss": 3.1938,
"rewards/accuracies": 0.46015626192092896,
"rewards/chosen": 6.581250190734863,
"rewards/margins": 5.469531059265137,
"rewards/rejected": 1.105712890625,
"step": 950
},
{
"epoch": 0.4507042253521127,
"grad_norm": 263.40064087446916,
"learning_rate": 3.0542514345331246e-07,
"logits/chosen": -3.160937547683716,
"logits/rejected": -3.1078124046325684,
"logps/chosen": -559.7999877929688,
"logps/rejected": -700.7999877929688,
"loss": 3.1048,
"rewards/accuracies": 0.4710937440395355,
"rewards/chosen": 6.578125,
"rewards/margins": 7.384375095367432,
"rewards/rejected": -0.7998046875,
"step": 960
},
{
"epoch": 0.45539906103286387,
"grad_norm": 285.6016326943243,
"learning_rate": 3.0281690140845066e-07,
"logits/chosen": -3.109375,
"logits/rejected": -3.1656250953674316,
"logps/chosen": -594.5999755859375,
"logps/rejected": -667.5999755859375,
"loss": 3.7553,
"rewards/accuracies": 0.508593738079071,
"rewards/chosen": 6.043749809265137,
"rewards/margins": 6.780468940734863,
"rewards/rejected": -0.7427734136581421,
"step": 970
},
{
"epoch": 0.460093896713615,
"grad_norm": 363.14693418021574,
"learning_rate": 3.002086593635889e-07,
"logits/chosen": -3.2750000953674316,
"logits/rejected": -3.1859374046325684,
"logps/chosen": -550.0,
"logps/rejected": -666.4000244140625,
"loss": 3.1473,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 6.168749809265137,
"rewards/margins": 7.689062595367432,
"rewards/rejected": -1.517187476158142,
"step": 980
},
{
"epoch": 0.4647887323943662,
"grad_norm": 253.9270033153478,
"learning_rate": 2.9760041731872716e-07,
"logits/chosen": -3.2578125,
"logits/rejected": -3.2093749046325684,
"logps/chosen": -514.2000122070312,
"logps/rejected": -627.0,
"loss": 2.8604,
"rewards/accuracies": 0.522656261920929,
"rewards/chosen": 7.346875190734863,
"rewards/margins": 8.475000381469727,
"rewards/rejected": -1.1066405773162842,
"step": 990
},
{
"epoch": 0.4694835680751174,
"grad_norm": 335.1062534816379,
"learning_rate": 2.9499217527386536e-07,
"logits/chosen": -3.214062452316284,
"logits/rejected": -3.192187547683716,
"logps/chosen": -544.2000122070312,
"logps/rejected": -641.2000122070312,
"loss": 2.9158,
"rewards/accuracies": 0.48515623807907104,
"rewards/chosen": 6.574999809265137,
"rewards/margins": 7.496874809265137,
"rewards/rejected": -0.925537109375,
"step": 1000
},
{
"epoch": 0.47417840375586856,
"grad_norm": 282.9372854168335,
"learning_rate": 2.923839332290036e-07,
"logits/chosen": -3.1265625953674316,
"logits/rejected": -3.0999999046325684,
"logps/chosen": -545.4000244140625,
"logps/rejected": -657.5999755859375,
"loss": 3.5473,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 5.871874809265137,
"rewards/margins": 7.546875,
"rewards/rejected": -1.6741211414337158,
"step": 1010
},
{
"epoch": 0.4788732394366197,
"grad_norm": 239.71190395939402,
"learning_rate": 2.8977569118414187e-07,
"logits/chosen": -3.2437500953674316,
"logits/rejected": -3.1890625953674316,
"logps/chosen": -531.7999877929688,
"logps/rejected": -632.4000244140625,
"loss": 2.9164,
"rewards/accuracies": 0.5,
"rewards/chosen": 6.550000190734863,
"rewards/margins": 8.649999618530273,
"rewards/rejected": -2.099804639816284,
"step": 1020
},
{
"epoch": 0.4835680751173709,
"grad_norm": 338.5277676426185,
"learning_rate": 2.8716744913928007e-07,
"logits/chosen": -3.2046875953674316,
"logits/rejected": -3.1953125,
"logps/chosen": -552.7999877929688,
"logps/rejected": -620.4000244140625,
"loss": 2.6548,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 6.712500095367432,
"rewards/margins": 7.595312595367432,
"rewards/rejected": -0.874804675579071,
"step": 1030
},
{
"epoch": 0.48826291079812206,
"grad_norm": 233.06442469550254,
"learning_rate": 2.845592070944183e-07,
"logits/chosen": -3.2421875,
"logits/rejected": -3.192187547683716,
"logps/chosen": -549.0,
"logps/rejected": -671.5999755859375,
"loss": 2.8817,
"rewards/accuracies": 0.51953125,
"rewards/chosen": 7.153124809265137,
"rewards/margins": 8.631250381469727,
"rewards/rejected": -1.477929711341858,
"step": 1040
},
{
"epoch": 0.49295774647887325,
"grad_norm": 220.02081206868604,
"learning_rate": 2.819509650495566e-07,
"logits/chosen": -3.1890625953674316,
"logits/rejected": -3.114062547683716,
"logps/chosen": -539.4000244140625,
"logps/rejected": -620.4000244140625,
"loss": 2.8089,
"rewards/accuracies": 0.5289062261581421,
"rewards/chosen": 7.287499904632568,
"rewards/margins": 8.315625190734863,
"rewards/rejected": -1.0333983898162842,
"step": 1050
},
{
"epoch": 0.49765258215962443,
"grad_norm": 217.0188288235901,
"learning_rate": 2.7934272300469483e-07,
"logits/chosen": -3.1312499046325684,
"logits/rejected": -3.0875000953674316,
"logps/chosen": -581.5999755859375,
"logps/rejected": -634.4000244140625,
"loss": 2.9255,
"rewards/accuracies": 0.50390625,
"rewards/chosen": 7.106249809265137,
"rewards/margins": 8.217187881469727,
"rewards/rejected": -1.108007788658142,
"step": 1060
},
{
"epoch": 0.5023474178403756,
"grad_norm": 391.863680049624,
"learning_rate": 2.7673448095983303e-07,
"logits/chosen": -3.2093749046325684,
"logits/rejected": -3.192187547683716,
"logps/chosen": -550.5999755859375,
"logps/rejected": -623.2000122070312,
"loss": 2.692,
"rewards/accuracies": 0.4945312440395355,
"rewards/chosen": 6.865624904632568,
"rewards/margins": 6.509375095367432,
"rewards/rejected": 0.34440916776657104,
"step": 1070
},
{
"epoch": 0.5070422535211268,
"grad_norm": 242.99585174045663,
"learning_rate": 2.741262389149713e-07,
"logits/chosen": -3.159374952316284,
"logits/rejected": -3.078125,
"logps/chosen": -564.0,
"logps/rejected": -651.5999755859375,
"loss": 2.9599,
"rewards/accuracies": 0.5218750238418579,
"rewards/chosen": 6.462500095367432,
"rewards/margins": 7.978125095367432,
"rewards/rejected": -1.518164038658142,
"step": 1080
},
{
"epoch": 0.5117370892018779,
"grad_norm": 333.24219218814267,
"learning_rate": 2.7151799687010953e-07,
"logits/chosen": -3.203125,
"logits/rejected": -3.1312499046325684,
"logps/chosen": -564.2000122070312,
"logps/rejected": -659.5999755859375,
"loss": 2.8861,
"rewards/accuracies": 0.5367187261581421,
"rewards/chosen": 6.493750095367432,
"rewards/margins": 8.649999618530273,
"rewards/rejected": -2.144726514816284,
"step": 1090
},
{
"epoch": 0.5164319248826291,
"grad_norm": 334.21800919068465,
"learning_rate": 2.6890975482524773e-07,
"logits/chosen": -3.114062547683716,
"logits/rejected": -3.104687452316284,
"logps/chosen": -588.4000244140625,
"logps/rejected": -649.5999755859375,
"loss": 3.4208,
"rewards/accuracies": 0.4859375059604645,
"rewards/chosen": 6.010156154632568,
"rewards/margins": 8.053125381469727,
"rewards/rejected": -2.0474610328674316,
"step": 1100
},
{
"epoch": 0.5211267605633803,
"grad_norm": 242.1193531722806,
"learning_rate": 2.66301512780386e-07,
"logits/chosen": -3.184375047683716,
"logits/rejected": -3.082812547683716,
"logps/chosen": -526.0,
"logps/rejected": -645.5999755859375,
"loss": 2.8418,
"rewards/accuracies": 0.49296873807907104,
"rewards/chosen": 6.581250190734863,
"rewards/margins": 9.096875190734863,
"rewards/rejected": -2.513476610183716,
"step": 1110
},
{
"epoch": 0.5258215962441315,
"grad_norm": 202.26450736391664,
"learning_rate": 2.6369327073552424e-07,
"logits/chosen": -3.262500047683716,
"logits/rejected": -3.206249952316284,
"logps/chosen": -535.5999755859375,
"logps/rejected": -607.0,
"loss": 3.1496,
"rewards/accuracies": 0.514843761920929,
"rewards/chosen": 6.915625095367432,
"rewards/margins": 7.053124904632568,
"rewards/rejected": -0.13876953721046448,
"step": 1120
},
{
"epoch": 0.5305164319248826,
"grad_norm": 233.54850849241447,
"learning_rate": 2.6108502869066244e-07,
"logits/chosen": -3.1796875,
"logits/rejected": -3.109375,
"logps/chosen": -576.4000244140625,
"logps/rejected": -680.0,
"loss": 3.7393,
"rewards/accuracies": 0.5,
"rewards/chosen": 6.479687690734863,
"rewards/margins": 7.728125095367432,
"rewards/rejected": -1.2414062023162842,
"step": 1130
},
{
"epoch": 0.5352112676056338,
"grad_norm": 715.8986582014574,
"learning_rate": 2.584767866458007e-07,
"logits/chosen": -3.253124952316284,
"logits/rejected": -3.168750047683716,
"logps/chosen": -544.7999877929688,
"logps/rejected": -636.2000122070312,
"loss": 2.5833,
"rewards/accuracies": 0.4945312440395355,
"rewards/chosen": 6.440625190734863,
"rewards/margins": 7.7578125,
"rewards/rejected": -1.3078124523162842,
"step": 1140
},
{
"epoch": 0.539906103286385,
"grad_norm": 266.9534201019325,
"learning_rate": 2.5586854460093895e-07,
"logits/chosen": -3.1312499046325684,
"logits/rejected": -3.137500047683716,
"logps/chosen": -562.0,
"logps/rejected": -662.0,
"loss": 3.3017,
"rewards/accuracies": 0.522656261920929,
"rewards/chosen": 6.409375190734863,
"rewards/margins": 8.106249809265137,
"rewards/rejected": -1.7031738758087158,
"step": 1150
},
{
"epoch": 0.5446009389671361,
"grad_norm": 224.08328944189464,
"learning_rate": 2.5326030255607715e-07,
"logits/chosen": -3.2125000953674316,
"logits/rejected": -3.1781249046325684,
"logps/chosen": -558.2000122070312,
"logps/rejected": -636.4000244140625,
"loss": 2.9908,
"rewards/accuracies": 0.510937511920929,
"rewards/chosen": 6.909375190734863,
"rewards/margins": 8.153124809265137,
"rewards/rejected": -1.2466309070587158,
"step": 1160
},
{
"epoch": 0.5492957746478874,
"grad_norm": 279.2533762941732,
"learning_rate": 2.506520605112154e-07,
"logits/chosen": -3.2265625,
"logits/rejected": -3.2093749046325684,
"logps/chosen": -549.5999755859375,
"logps/rejected": -637.5999755859375,
"loss": 3.8627,
"rewards/accuracies": 0.46171873807907104,
"rewards/chosen": 6.125,
"rewards/margins": 6.453906059265137,
"rewards/rejected": -0.3257812559604645,
"step": 1170
},
{
"epoch": 0.5539906103286385,
"grad_norm": 240.7984295206075,
"learning_rate": 2.4804381846635365e-07,
"logits/chosen": -3.2593750953674316,
"logits/rejected": -3.2249999046325684,
"logps/chosen": -569.5999755859375,
"logps/rejected": -641.0,
"loss": 3.4113,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": 5.7890625,
"rewards/margins": 7.547656059265137,
"rewards/rejected": -1.764062523841858,
"step": 1180
},
{
"epoch": 0.5586854460093896,
"grad_norm": 230.7573724070305,
"learning_rate": 2.454355764214919e-07,
"logits/chosen": -3.184375047683716,
"logits/rejected": -3.174999952316284,
"logps/chosen": -535.7999877929688,
"logps/rejected": -666.2000122070312,
"loss": 2.8176,
"rewards/accuracies": 0.4859375059604645,
"rewards/chosen": 6.625,
"rewards/margins": 8.231249809265137,
"rewards/rejected": -1.6062500476837158,
"step": 1190
},
{
"epoch": 0.5633802816901409,
"grad_norm": 258.81010276738493,
"learning_rate": 2.4282733437663016e-07,
"logits/chosen": -3.229687452316284,
"logits/rejected": -3.2515625953674316,
"logps/chosen": -554.4000244140625,
"logps/rejected": -615.2000122070312,
"loss": 3.2114,
"rewards/accuracies": 0.48046875,
"rewards/chosen": 7.240624904632568,
"rewards/margins": 7.359375,
"rewards/rejected": -0.12626953423023224,
"step": 1200
},
{
"epoch": 0.568075117370892,
"grad_norm": 270.47734606922626,
"learning_rate": 2.4021909233176836e-07,
"logits/chosen": -3.2046875953674316,
"logits/rejected": -3.2046875953674316,
"logps/chosen": -524.7999877929688,
"logps/rejected": -637.0,
"loss": 2.8873,
"rewards/accuracies": 0.5054687261581421,
"rewards/chosen": 6.981249809265137,
"rewards/margins": 8.546875,
"rewards/rejected": -1.56201171875,
"step": 1210
},
{
"epoch": 0.5727699530516432,
"grad_norm": 233.33661543891296,
"learning_rate": 2.376108502869066e-07,
"logits/chosen": -3.231250047683716,
"logits/rejected": -3.2328124046325684,
"logps/chosen": -563.2000122070312,
"logps/rejected": -664.5999755859375,
"loss": 3.0451,
"rewards/accuracies": 0.5015624761581421,
"rewards/chosen": 6.662499904632568,
"rewards/margins": 7.839062690734863,
"rewards/rejected": -1.1779296398162842,
"step": 1220
},
{
"epoch": 0.5774647887323944,
"grad_norm": 266.1090049394165,
"learning_rate": 2.3500260824204484e-07,
"logits/chosen": -3.1656250953674316,
"logits/rejected": -3.0859375,
"logps/chosen": -594.5999755859375,
"logps/rejected": -684.0,
"loss": 3.1565,
"rewards/accuracies": 0.48906248807907104,
"rewards/chosen": 6.103125095367432,
"rewards/margins": 8.472070693969727,
"rewards/rejected": -2.381542921066284,
"step": 1230
},
{
"epoch": 0.5821596244131455,
"grad_norm": 305.9487457845243,
"learning_rate": 2.323943661971831e-07,
"logits/chosen": -3.239062547683716,
"logits/rejected": -3.1812500953674316,
"logps/chosen": -554.2000122070312,
"logps/rejected": -642.4000244140625,
"loss": 3.7207,
"rewards/accuracies": 0.4906249940395355,
"rewards/chosen": 6.474999904632568,
"rewards/margins": 7.390625,
"rewards/rejected": -0.9126952886581421,
"step": 1240
},
{
"epoch": 0.5868544600938967,
"grad_norm": 257.48418816374345,
"learning_rate": 2.2978612415232132e-07,
"logits/chosen": -3.1890625953674316,
"logits/rejected": -3.1656250953674316,
"logps/chosen": -588.0,
"logps/rejected": -669.5999755859375,
"loss": 2.5614,
"rewards/accuracies": 0.47968751192092896,
"rewards/chosen": 7.290625095367432,
"rewards/margins": 9.240625381469727,
"rewards/rejected": -1.9429442882537842,
"step": 1250
},
{
"epoch": 0.5915492957746479,
"grad_norm": 270.4721169154024,
"learning_rate": 2.2717788210745957e-07,
"logits/chosen": -3.1656250953674316,
"logits/rejected": -3.125,
"logps/chosen": -550.2000122070312,
"logps/rejected": -659.5999755859375,
"loss": 3.9904,
"rewards/accuracies": 0.504687488079071,
"rewards/chosen": 6.263281345367432,
"rewards/margins": 8.123437881469727,
"rewards/rejected": -1.851953148841858,
"step": 1260
},
{
"epoch": 0.596244131455399,
"grad_norm": 284.5477615847768,
"learning_rate": 2.245696400625978e-07,
"logits/chosen": -3.221874952316284,
"logits/rejected": -3.198437452316284,
"logps/chosen": -567.7999877929688,
"logps/rejected": -655.5999755859375,
"loss": 3.2818,
"rewards/accuracies": 0.48046875,
"rewards/chosen": 6.384375095367432,
"rewards/margins": 7.204687595367432,
"rewards/rejected": -0.8228515386581421,
"step": 1270
},
{
"epoch": 0.6009389671361502,
"grad_norm": 220.06269466811972,
"learning_rate": 2.2196139801773602e-07,
"logits/chosen": -3.1859374046325684,
"logits/rejected": -3.1187500953674316,
"logps/chosen": -541.2000122070312,
"logps/rejected": -616.5999755859375,
"loss": 2.9687,
"rewards/accuracies": 0.49140626192092896,
"rewards/chosen": 6.400000095367432,
"rewards/margins": 7.564062595367432,
"rewards/rejected": -1.1668212413787842,
"step": 1280
},
{
"epoch": 0.6056338028169014,
"grad_norm": 336.72704282740347,
"learning_rate": 2.1935315597287428e-07,
"logits/chosen": -3.1578125953674316,
"logits/rejected": -3.171875,
"logps/chosen": -564.7999877929688,
"logps/rejected": -620.4000244140625,
"loss": 3.6345,
"rewards/accuracies": 0.526562511920929,
"rewards/chosen": 6.578125,
"rewards/margins": 6.42578125,
"rewards/rejected": 0.14431151747703552,
"step": 1290
},
{
"epoch": 0.6103286384976526,
"grad_norm": 261.5702252277783,
"learning_rate": 2.167449139280125e-07,
"logits/chosen": -3.2015624046325684,
"logits/rejected": -3.1656250953674316,
"logps/chosen": -564.0,
"logps/rejected": -655.5999755859375,
"loss": 3.1705,
"rewards/accuracies": 0.510937511920929,
"rewards/chosen": 7.034375190734863,
"rewards/margins": 7.515625,
"rewards/rejected": -0.4818359315395355,
"step": 1300
},
{
"epoch": 0.6150234741784038,
"grad_norm": 237.58920682165183,
"learning_rate": 2.1413667188315073e-07,
"logits/chosen": -3.2593750953674316,
"logits/rejected": -3.315624952316284,
"logps/chosen": -533.0,
"logps/rejected": -596.2000122070312,
"loss": 3.2553,
"rewards/accuracies": 0.48828125,
"rewards/chosen": 6.599999904632568,
"rewards/margins": 7.442187309265137,
"rewards/rejected": -0.837646484375,
"step": 1310
},
{
"epoch": 0.6197183098591549,
"grad_norm": 284.0284893595022,
"learning_rate": 2.1152842983828898e-07,
"logits/chosen": -3.129687547683716,
"logits/rejected": -3.140625,
"logps/chosen": -574.5999755859375,
"logps/rejected": -635.4000244140625,
"loss": 3.9295,
"rewards/accuracies": 0.5132812261581421,
"rewards/chosen": 5.603125095367432,
"rewards/margins": 7.658593654632568,
"rewards/rejected": -2.0562500953674316,
"step": 1320
},
{
"epoch": 0.6244131455399061,
"grad_norm": 231.7736405189656,
"learning_rate": 2.089201877934272e-07,
"logits/chosen": -3.176562547683716,
"logits/rejected": -3.151562452316284,
"logps/chosen": -581.2000122070312,
"logps/rejected": -687.4000244140625,
"loss": 3.4721,
"rewards/accuracies": 0.508593738079071,
"rewards/chosen": 6.467187404632568,
"rewards/margins": 8.949999809265137,
"rewards/rejected": -2.483105421066284,
"step": 1330
},
{
"epoch": 0.6291079812206573,
"grad_norm": 223.96093756368634,
"learning_rate": 2.0631194574856543e-07,
"logits/chosen": -3.3046875,
"logits/rejected": -3.2593750953674316,
"logps/chosen": -526.7999877929688,
"logps/rejected": -586.4000244140625,
"loss": 2.8135,
"rewards/accuracies": 0.515625,
"rewards/chosen": 7.831250190734863,
"rewards/margins": 8.024999618530273,
"rewards/rejected": -0.20263671875,
"step": 1340
},
{
"epoch": 0.6338028169014085,
"grad_norm": 262.76467763294795,
"learning_rate": 2.0370370370370369e-07,
"logits/chosen": -3.1484375,
"logits/rejected": -3.129687547683716,
"logps/chosen": -562.0,
"logps/rejected": -690.7999877929688,
"loss": 3.8592,
"rewards/accuracies": 0.507031261920929,
"rewards/chosen": 7.050000190734863,
"rewards/margins": 8.587499618530273,
"rewards/rejected": -1.530175805091858,
"step": 1350
},
{
"epoch": 0.6384976525821596,
"grad_norm": 342.0540956930175,
"learning_rate": 2.010954616588419e-07,
"logits/chosen": -3.2046875953674316,
"logits/rejected": -3.153125047683716,
"logps/chosen": -547.7999877929688,
"logps/rejected": -628.0,
"loss": 2.9989,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 7.368750095367432,
"rewards/margins": 8.465624809265137,
"rewards/rejected": -1.0910155773162842,
"step": 1360
},
{
"epoch": 0.6431924882629108,
"grad_norm": 251.3399864703215,
"learning_rate": 1.9848721961398017e-07,
"logits/chosen": -3.260937452316284,
"logits/rejected": -3.246875047683716,
"logps/chosen": -551.4000244140625,
"logps/rejected": -613.2000122070312,
"loss": 2.5591,
"rewards/accuracies": 0.48046875,
"rewards/chosen": 7.159375190734863,
"rewards/margins": 8.4375,
"rewards/rejected": -1.281640648841858,
"step": 1370
},
{
"epoch": 0.647887323943662,
"grad_norm": 250.2966758097825,
"learning_rate": 1.958789775691184e-07,
"logits/chosen": -3.2046875953674316,
"logits/rejected": -3.140625,
"logps/chosen": -537.7999877929688,
"logps/rejected": -640.0,
"loss": 2.6926,
"rewards/accuracies": 0.4859375059604645,
"rewards/chosen": 7.137499809265137,
"rewards/margins": 9.2578125,
"rewards/rejected": -2.123828172683716,
"step": 1380
},
{
"epoch": 0.6525821596244131,
"grad_norm": 253.91294873585701,
"learning_rate": 1.9327073552425662e-07,
"logits/chosen": -3.1968750953674316,
"logits/rejected": -3.143749952316284,
"logps/chosen": -533.2000122070312,
"logps/rejected": -613.7999877929688,
"loss": 3.4994,
"rewards/accuracies": 0.49140626192092896,
"rewards/chosen": 6.696875095367432,
"rewards/margins": 7.971875190734863,
"rewards/rejected": -1.2712891101837158,
"step": 1390
},
{
"epoch": 0.6572769953051644,
"grad_norm": 223.04763809135872,
"learning_rate": 1.906624934793949e-07,
"logits/chosen": -3.1546874046325684,
"logits/rejected": -3.190624952316284,
"logps/chosen": -552.0,
"logps/rejected": -663.5999755859375,
"loss": 2.8825,
"rewards/accuracies": 0.532031238079071,
"rewards/chosen": 7.21875,
"rewards/margins": 9.815625190734863,
"rewards/rejected": -2.5973877906799316,
"step": 1400
},
{
"epoch": 0.6619718309859155,
"grad_norm": 216.17712546384078,
"learning_rate": 1.8805425143453312e-07,
"logits/chosen": -3.221874952316284,
"logits/rejected": -3.0859375,
"logps/chosen": -533.5999755859375,
"logps/rejected": -697.2000122070312,
"loss": 3.6138,
"rewards/accuracies": 0.5,
"rewards/chosen": 6.456250190734863,
"rewards/margins": 8.978124618530273,
"rewards/rejected": -2.5322265625,
"step": 1410
},
{
"epoch": 0.6666666666666666,
"grad_norm": 282.7401512724836,
"learning_rate": 1.8544600938967138e-07,
"logits/chosen": -3.200000047683716,
"logits/rejected": -3.1937499046325684,
"logps/chosen": -546.0,
"logps/rejected": -619.4000244140625,
"loss": 3.106,
"rewards/accuracies": 0.5210937261581421,
"rewards/chosen": 7.259375095367432,
"rewards/margins": 8.28125,
"rewards/rejected": -1.0185546875,
"step": 1420
},
{
"epoch": 0.6713615023474179,
"grad_norm": 214.2973371198878,
"learning_rate": 1.828377673448096e-07,
"logits/chosen": -3.1703124046325684,
"logits/rejected": -3.1937499046325684,
"logps/chosen": -586.2000122070312,
"logps/rejected": -639.5999755859375,
"loss": 3.944,
"rewards/accuracies": 0.5210937261581421,
"rewards/chosen": 6.467187404632568,
"rewards/margins": 8.125,
"rewards/rejected": -1.652929663658142,
"step": 1430
},
{
"epoch": 0.676056338028169,
"grad_norm": 219.0548111069885,
"learning_rate": 1.8022952529994783e-07,
"logits/chosen": -3.231250047683716,
"logits/rejected": -3.1640625,
"logps/chosen": -556.4000244140625,
"logps/rejected": -638.2000122070312,
"loss": 2.6025,
"rewards/accuracies": 0.532031238079071,
"rewards/chosen": 7.456250190734863,
"rewards/margins": 10.785937309265137,
"rewards/rejected": -3.3238282203674316,
"step": 1440
},
{
"epoch": 0.6807511737089202,
"grad_norm": 285.2721726652972,
"learning_rate": 1.7762128325508608e-07,
"logits/chosen": -3.21875,
"logits/rejected": -3.2171874046325684,
"logps/chosen": -576.0,
"logps/rejected": -679.5999755859375,
"loss": 3.5055,
"rewards/accuracies": 0.5101562738418579,
"rewards/chosen": 6.837500095367432,
"rewards/margins": 8.471875190734863,
"rewards/rejected": -1.632421851158142,
"step": 1450
},
{
"epoch": 0.6854460093896714,
"grad_norm": 235.54935518418984,
"learning_rate": 1.750130412102243e-07,
"logits/chosen": -3.1109375953674316,
"logits/rejected": -3.135937452316284,
"logps/chosen": -604.0,
"logps/rejected": -658.4000244140625,
"loss": 3.2522,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 6.840624809265137,
"rewards/margins": 9.0625,
"rewards/rejected": -2.216357469558716,
"step": 1460
},
{
"epoch": 0.6901408450704225,
"grad_norm": 230.7248053656623,
"learning_rate": 1.7240479916536254e-07,
"logits/chosen": -3.104687452316284,
"logits/rejected": -3.1031250953674316,
"logps/chosen": -543.0,
"logps/rejected": -674.4000244140625,
"loss": 2.7177,
"rewards/accuracies": 0.5210937261581421,
"rewards/chosen": 7.393750190734863,
"rewards/margins": 9.737500190734863,
"rewards/rejected": -2.353222608566284,
"step": 1470
},
{
"epoch": 0.6948356807511737,
"grad_norm": 190.86847796342923,
"learning_rate": 1.697965571205008e-07,
"logits/chosen": -3.265625,
"logits/rejected": -3.1953125,
"logps/chosen": -545.0,
"logps/rejected": -615.2000122070312,
"loss": 2.9373,
"rewards/accuracies": 0.52734375,
"rewards/chosen": 6.996874809265137,
"rewards/margins": 8.471875190734863,
"rewards/rejected": -1.469628930091858,
"step": 1480
},
{
"epoch": 0.6995305164319249,
"grad_norm": 292.41862144071786,
"learning_rate": 1.6718831507563902e-07,
"logits/chosen": -3.1968750953674316,
"logits/rejected": -3.200000047683716,
"logps/chosen": -555.0,
"logps/rejected": -603.7999877929688,
"loss": 3.2203,
"rewards/accuracies": 0.516406238079071,
"rewards/chosen": 6.784375190734863,
"rewards/margins": 7.028124809265137,
"rewards/rejected": -0.242431640625,
"step": 1490
},
{
"epoch": 0.704225352112676,
"grad_norm": 248.77580575495298,
"learning_rate": 1.6458007303077727e-07,
"logits/chosen": -3.2015624046325684,
"logits/rejected": -3.215625047683716,
"logps/chosen": -535.5999755859375,
"logps/rejected": -602.7999877929688,
"loss": 3.1053,
"rewards/accuracies": 0.49531251192092896,
"rewards/chosen": 7.059374809265137,
"rewards/margins": 7.209374904632568,
"rewards/rejected": -0.15102538466453552,
"step": 1500
},
{
"epoch": 0.7089201877934272,
"grad_norm": 254.34351835854568,
"learning_rate": 1.619718309859155e-07,
"logits/chosen": -3.2015624046325684,
"logits/rejected": -3.1156249046325684,
"logps/chosen": -560.0,
"logps/rejected": -657.7999877929688,
"loss": 3.6322,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 6.771874904632568,
"rewards/margins": 8.5625,
"rewards/rejected": -1.796289086341858,
"step": 1510
},
{
"epoch": 0.7136150234741784,
"grad_norm": 234.67184609027018,
"learning_rate": 1.5936358894105372e-07,
"logits/chosen": -3.221874952316284,
"logits/rejected": -3.1859374046325684,
"logps/chosen": -558.2000122070312,
"logps/rejected": -647.2000122070312,
"loss": 2.9855,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 7.949999809265137,
"rewards/margins": 8.662500381469727,
"rewards/rejected": -0.7110351324081421,
"step": 1520
},
{
"epoch": 0.7183098591549296,
"grad_norm": 238.33477876523006,
"learning_rate": 1.5675534689619197e-07,
"logits/chosen": -3.2249999046325684,
"logits/rejected": -3.1953125,
"logps/chosen": -560.4000244140625,
"logps/rejected": -598.4000244140625,
"loss": 3.1396,
"rewards/accuracies": 0.51953125,
"rewards/chosen": 6.5625,
"rewards/margins": 7.431250095367432,
"rewards/rejected": -0.86181640625,
"step": 1530
},
{
"epoch": 0.7230046948356808,
"grad_norm": 281.3784430411394,
"learning_rate": 1.541471048513302e-07,
"logits/chosen": -3.1578125953674316,
"logits/rejected": -3.1390624046325684,
"logps/chosen": -564.0,
"logps/rejected": -642.7999877929688,
"loss": 2.8664,
"rewards/accuracies": 0.524218738079071,
"rewards/chosen": 7.71875,
"rewards/margins": 8.193750381469727,
"rewards/rejected": -0.4874023497104645,
"step": 1540
},
{
"epoch": 0.7276995305164319,
"grad_norm": 206.24345933603442,
"learning_rate": 1.5153886280646843e-07,
"logits/chosen": -3.176562547683716,
"logits/rejected": -3.1859374046325684,
"logps/chosen": -525.2000122070312,
"logps/rejected": -594.5999755859375,
"loss": 3.2805,
"rewards/accuracies": 0.5054687261581421,
"rewards/chosen": 6.890625,
"rewards/margins": 7.2734375,
"rewards/rejected": -0.3929687440395355,
"step": 1550
},
{
"epoch": 0.7323943661971831,
"grad_norm": 223.79553591154388,
"learning_rate": 1.4893062076160668e-07,
"logits/chosen": -3.2437500953674316,
"logits/rejected": -3.2578125,
"logps/chosen": -544.4000244140625,
"logps/rejected": -647.4000244140625,
"loss": 3.0315,
"rewards/accuracies": 0.5171874761581421,
"rewards/chosen": 7.109375,
"rewards/margins": 9.737500190734863,
"rewards/rejected": -2.630078077316284,
"step": 1560
},
{
"epoch": 0.7370892018779343,
"grad_norm": 180.42701263391848,
"learning_rate": 1.463223787167449e-07,
"logits/chosen": -3.2109375,
"logits/rejected": -3.1781249046325684,
"logps/chosen": -560.0,
"logps/rejected": -628.0,
"loss": 3.1893,
"rewards/accuracies": 0.5132812261581421,
"rewards/chosen": 7.106249809265137,
"rewards/margins": 7.204687595367432,
"rewards/rejected": -0.11171875149011612,
"step": 1570
},
{
"epoch": 0.7417840375586855,
"grad_norm": 270.2632681106118,
"learning_rate": 1.4371413667188313e-07,
"logits/chosen": -3.2421875,
"logits/rejected": -3.200000047683716,
"logps/chosen": -544.2000122070312,
"logps/rejected": -668.4000244140625,
"loss": 3.052,
"rewards/accuracies": 0.542187511920929,
"rewards/chosen": 7.303124904632568,
"rewards/margins": 8.615625381469727,
"rewards/rejected": -1.320703148841858,
"step": 1580
},
{
"epoch": 0.7464788732394366,
"grad_norm": 222.47759597309053,
"learning_rate": 1.4110589462702139e-07,
"logits/chosen": -3.2718749046325684,
"logits/rejected": -3.2562499046325684,
"logps/chosen": -540.7999877929688,
"logps/rejected": -607.4000244140625,
"loss": 3.094,
"rewards/accuracies": 0.5140625238418579,
"rewards/chosen": 7.271874904632568,
"rewards/margins": 7.453125,
"rewards/rejected": -0.18339844048023224,
"step": 1590
},
{
"epoch": 0.7511737089201878,
"grad_norm": 229.4125834407109,
"learning_rate": 1.384976525821596e-07,
"logits/chosen": -3.3046875,
"logits/rejected": -3.2046875953674316,
"logps/chosen": -559.0,
"logps/rejected": -656.0,
"loss": 3.2896,
"rewards/accuracies": 0.49921876192092896,
"rewards/chosen": 6.934374809265137,
"rewards/margins": 8.587499618530273,
"rewards/rejected": -1.6577637195587158,
"step": 1600
},
{
"epoch": 0.755868544600939,
"grad_norm": 288.27015054065197,
"learning_rate": 1.3588941053729787e-07,
"logits/chosen": -3.176562547683716,
"logits/rejected": -3.2015624046325684,
"logps/chosen": -565.5999755859375,
"logps/rejected": -662.7999877929688,
"loss": 2.8581,
"rewards/accuracies": 0.5234375,
"rewards/chosen": 7.037499904632568,
"rewards/margins": 8.634374618530273,
"rewards/rejected": -1.6044433116912842,
"step": 1610
},
{
"epoch": 0.7605633802816901,
"grad_norm": 249.0861315317849,
"learning_rate": 1.332811684924361e-07,
"logits/chosen": -3.2203125953674316,
"logits/rejected": -3.176562547683716,
"logps/chosen": -571.0,
"logps/rejected": -634.7999877929688,
"loss": 3.0383,
"rewards/accuracies": 0.500781238079071,
"rewards/chosen": 7.037499904632568,
"rewards/margins": 8.121874809265137,
"rewards/rejected": -1.0910155773162842,
"step": 1620
},
{
"epoch": 0.7652582159624414,
"grad_norm": 336.27211809292953,
"learning_rate": 1.3067292644757432e-07,
"logits/chosen": -3.0859375,
"logits/rejected": -3.096874952316284,
"logps/chosen": -583.5999755859375,
"logps/rejected": -684.0,
"loss": 3.377,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 6.793749809265137,
"rewards/margins": 8.328125,
"rewards/rejected": -1.5402343273162842,
"step": 1630
},
{
"epoch": 0.7699530516431925,
"grad_norm": 296.7983824668959,
"learning_rate": 1.2806468440271257e-07,
"logits/chosen": -3.1875,
"logits/rejected": -3.1390624046325684,
"logps/chosen": -540.2000122070312,
"logps/rejected": -625.0,
"loss": 2.6146,
"rewards/accuracies": 0.49921876192092896,
"rewards/chosen": 7.290625095367432,
"rewards/margins": 7.873437404632568,
"rewards/rejected": -0.5884033441543579,
"step": 1640
},
{
"epoch": 0.7746478873239436,
"grad_norm": 251.3318467521815,
"learning_rate": 1.254564423578508e-07,
"logits/chosen": -3.198437452316284,
"logits/rejected": -3.1546874046325684,
"logps/chosen": -547.7999877929688,
"logps/rejected": -640.0,
"loss": 2.6398,
"rewards/accuracies": 0.53125,
"rewards/chosen": 7.775000095367432,
"rewards/margins": 9.34375,
"rewards/rejected": -1.5627930164337158,
"step": 1650
},
{
"epoch": 0.7793427230046949,
"grad_norm": 220.5249442173988,
"learning_rate": 1.2284820031298902e-07,
"logits/chosen": -3.207812547683716,
"logits/rejected": -3.137500047683716,
"logps/chosen": -583.5999755859375,
"logps/rejected": -655.5999755859375,
"loss": 2.8037,
"rewards/accuracies": 0.5367187261581421,
"rewards/chosen": 7.190625190734863,
"rewards/margins": 10.106249809265137,
"rewards/rejected": -2.914843797683716,
"step": 1660
},
{
"epoch": 0.784037558685446,
"grad_norm": 275.17795087812124,
"learning_rate": 1.2023995826812728e-07,
"logits/chosen": -3.192187547683716,
"logits/rejected": -3.1421875953674316,
"logps/chosen": -550.4000244140625,
"logps/rejected": -651.2000122070312,
"loss": 3.0396,
"rewards/accuracies": 0.514843761920929,
"rewards/chosen": 7.331250190734863,
"rewards/margins": 8.3671875,
"rewards/rejected": -1.035742163658142,
"step": 1670
},
{
"epoch": 0.7887323943661971,
"grad_norm": 227.48462965497947,
"learning_rate": 1.176317162232655e-07,
"logits/chosen": -3.2265625,
"logits/rejected": -3.221874952316284,
"logps/chosen": -539.0,
"logps/rejected": -660.2000122070312,
"loss": 3.0071,
"rewards/accuracies": 0.5335937738418579,
"rewards/chosen": 7.021874904632568,
"rewards/margins": 8.871874809265137,
"rewards/rejected": -1.8449218273162842,
"step": 1680
},
{
"epoch": 0.7934272300469484,
"grad_norm": 244.60056821194243,
"learning_rate": 1.1502347417840374e-07,
"logits/chosen": -3.1734375953674316,
"logits/rejected": -3.120312452316284,
"logps/chosen": -551.4000244140625,
"logps/rejected": -642.4000244140625,
"loss": 3.3663,
"rewards/accuracies": 0.51171875,
"rewards/chosen": 6.449999809265137,
"rewards/margins": 7.964062690734863,
"rewards/rejected": -1.507226586341858,
"step": 1690
},
{
"epoch": 0.7981220657276995,
"grad_norm": 260.2800758003632,
"learning_rate": 1.1241523213354198e-07,
"logits/chosen": -3.239062547683716,
"logits/rejected": -3.1796875,
"logps/chosen": -547.4000244140625,
"logps/rejected": -656.7999877929688,
"loss": 2.7516,
"rewards/accuracies": 0.5015624761581421,
"rewards/chosen": 7.525000095367432,
"rewards/margins": 8.653124809265137,
"rewards/rejected": -1.129980444908142,
"step": 1700
},
{
"epoch": 0.8028169014084507,
"grad_norm": 537.1358038182277,
"learning_rate": 1.0980699008868022e-07,
"logits/chosen": -3.2093749046325684,
"logits/rejected": -3.1640625,
"logps/chosen": -566.0,
"logps/rejected": -623.2000122070312,
"loss": 3.5979,
"rewards/accuracies": 0.516406238079071,
"rewards/chosen": 7.125,
"rewards/margins": 7.8046875,
"rewards/rejected": -0.683398425579071,
"step": 1710
},
{
"epoch": 0.8075117370892019,
"grad_norm": 226.49875791421053,
"learning_rate": 1.0719874804381846e-07,
"logits/chosen": -3.1546874046325684,
"logits/rejected": -3.15625,
"logps/chosen": -553.4000244140625,
"logps/rejected": -593.0,
"loss": 2.4992,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 7.696875095367432,
"rewards/margins": 8.106249809265137,
"rewards/rejected": -0.42167967557907104,
"step": 1720
},
{
"epoch": 0.812206572769953,
"grad_norm": 208.7589975935314,
"learning_rate": 1.045905059989567e-07,
"logits/chosen": -3.1953125,
"logits/rejected": -3.2125000953674316,
"logps/chosen": -523.2000122070312,
"logps/rejected": -582.4000244140625,
"loss": 3.0689,
"rewards/accuracies": 0.507031261920929,
"rewards/chosen": 7.025000095367432,
"rewards/margins": 7.3125,
"rewards/rejected": -0.28300780057907104,
"step": 1730
},
{
"epoch": 0.8169014084507042,
"grad_norm": 283.0112631119428,
"learning_rate": 1.0198226395409494e-07,
"logits/chosen": -3.1421875953674316,
"logits/rejected": -3.082812547683716,
"logps/chosen": -532.5999755859375,
"logps/rejected": -614.0,
"loss": 3.0318,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 7.181250095367432,
"rewards/margins": 7.328125,
"rewards/rejected": -0.13925781846046448,
"step": 1740
},
{
"epoch": 0.8215962441314554,
"grad_norm": 214.30492446037974,
"learning_rate": 9.937402190923318e-08,
"logits/chosen": -3.129687547683716,
"logits/rejected": -3.1156249046325684,
"logps/chosen": -527.2000122070312,
"logps/rejected": -649.5999755859375,
"loss": 2.7622,
"rewards/accuracies": 0.507031261920929,
"rewards/chosen": 7.393750190734863,
"rewards/margins": 9.978124618530273,
"rewards/rejected": -2.5888671875,
"step": 1750
},
{
"epoch": 0.8262910798122066,
"grad_norm": 239.2913674845102,
"learning_rate": 9.676577986437141e-08,
"logits/chosen": -3.1546874046325684,
"logits/rejected": -3.0843749046325684,
"logps/chosen": -581.2000122070312,
"logps/rejected": -643.5999755859375,
"loss": 3.0143,
"rewards/accuracies": 0.5179687738418579,
"rewards/chosen": 7.084374904632568,
"rewards/margins": 9.653124809265137,
"rewards/rejected": -2.5667967796325684,
"step": 1760
},
{
"epoch": 0.8309859154929577,
"grad_norm": 261.7289874476712,
"learning_rate": 9.415753781950965e-08,
"logits/chosen": -3.206249952316284,
"logits/rejected": -3.1953125,
"logps/chosen": -535.7999877929688,
"logps/rejected": -627.0,
"loss": 3.1962,
"rewards/accuracies": 0.52734375,
"rewards/chosen": 7.068749904632568,
"rewards/margins": 8.881250381469727,
"rewards/rejected": -1.8058593273162842,
"step": 1770
},
{
"epoch": 0.8356807511737089,
"grad_norm": 278.27984402382043,
"learning_rate": 9.154929577464789e-08,
"logits/chosen": -3.2328124046325684,
"logits/rejected": -3.1875,
"logps/chosen": -548.0,
"logps/rejected": -632.0,
"loss": 2.6488,
"rewards/accuracies": 0.5367187261581421,
"rewards/chosen": 7.543749809265137,
"rewards/margins": 10.512499809265137,
"rewards/rejected": -2.9665770530700684,
"step": 1780
},
{
"epoch": 0.8403755868544601,
"grad_norm": 869.0934065585606,
"learning_rate": 8.894105372978613e-08,
"logits/chosen": -3.1484375,
"logits/rejected": -3.15625,
"logps/chosen": -572.7999877929688,
"logps/rejected": -643.0,
"loss": 3.476,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 7.753125190734863,
"rewards/margins": 8.867968559265137,
"rewards/rejected": -1.105078101158142,
"step": 1790
},
{
"epoch": 0.8450704225352113,
"grad_norm": 177.18669328761814,
"learning_rate": 8.633281168492435e-08,
"logits/chosen": -3.2421875,
"logits/rejected": -3.237499952316284,
"logps/chosen": -546.2000122070312,
"logps/rejected": -620.4000244140625,
"loss": 2.5794,
"rewards/accuracies": 0.542187511920929,
"rewards/chosen": 7.628125190734863,
"rewards/margins": 9.643750190734863,
"rewards/rejected": -1.998046875,
"step": 1800
},
{
"epoch": 0.8497652582159625,
"grad_norm": 320.75249929826015,
"learning_rate": 8.372456964006259e-08,
"logits/chosen": -3.145312547683716,
"logits/rejected": -3.0953125953674316,
"logps/chosen": -550.5999755859375,
"logps/rejected": -637.2000122070312,
"loss": 3.1061,
"rewards/accuracies": 0.510937511920929,
"rewards/chosen": 6.485937595367432,
"rewards/margins": 8.915624618530273,
"rewards/rejected": -2.4320311546325684,
"step": 1810
},
{
"epoch": 0.8544600938967136,
"grad_norm": 298.35819523439216,
"learning_rate": 8.111632759520083e-08,
"logits/chosen": -3.1859374046325684,
"logits/rejected": -3.1624999046325684,
"logps/chosen": -586.5999755859375,
"logps/rejected": -648.7999877929688,
"loss": 2.7858,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 7.328125,
"rewards/margins": 10.743749618530273,
"rewards/rejected": -3.421337842941284,
"step": 1820
},
{
"epoch": 0.8591549295774648,
"grad_norm": 260.70360175198056,
"learning_rate": 7.850808555033907e-08,
"logits/chosen": -3.198437452316284,
"logits/rejected": -3.120312452316284,
"logps/chosen": -554.0,
"logps/rejected": -660.7999877929688,
"loss": 2.9332,
"rewards/accuracies": 0.4945312440395355,
"rewards/chosen": 7.34375,
"rewards/margins": 10.267187118530273,
"rewards/rejected": -2.930468797683716,
"step": 1830
},
{
"epoch": 0.863849765258216,
"grad_norm": 336.5324358151134,
"learning_rate": 7.58998435054773e-08,
"logits/chosen": -3.2203125953674316,
"logits/rejected": -3.1484375,
"logps/chosen": -565.4000244140625,
"logps/rejected": -649.2000122070312,
"loss": 3.133,
"rewards/accuracies": 0.52734375,
"rewards/chosen": 7.131249904632568,
"rewards/margins": 9.384374618530273,
"rewards/rejected": -2.2672362327575684,
"step": 1840
},
{
"epoch": 0.8685446009389671,
"grad_norm": 311.72767948233286,
"learning_rate": 7.329160146061554e-08,
"logits/chosen": -3.239062547683716,
"logits/rejected": -3.285937547683716,
"logps/chosen": -541.4000244140625,
"logps/rejected": -631.5999755859375,
"loss": 3.441,
"rewards/accuracies": 0.520312488079071,
"rewards/chosen": 7.084374904632568,
"rewards/margins": 7.295312404632568,
"rewards/rejected": -0.20380859076976776,
"step": 1850
},
{
"epoch": 0.8732394366197183,
"grad_norm": 295.9764703306369,
"learning_rate": 7.068335941575378e-08,
"logits/chosen": -3.143749952316284,
"logits/rejected": -3.174999952316284,
"logps/chosen": -544.5999755859375,
"logps/rejected": -616.7999877929688,
"loss": 3.3042,
"rewards/accuracies": 0.47968751192092896,
"rewards/chosen": 5.900000095367432,
"rewards/margins": 7.462500095367432,
"rewards/rejected": -1.560156226158142,
"step": 1860
},
{
"epoch": 0.8779342723004695,
"grad_norm": 338.46224187494545,
"learning_rate": 6.807511737089202e-08,
"logits/chosen": -3.2359375953674316,
"logits/rejected": -3.1937499046325684,
"logps/chosen": -540.7999877929688,
"logps/rejected": -641.2000122070312,
"loss": 2.9456,
"rewards/accuracies": 0.51171875,
"rewards/chosen": 6.884375095367432,
"rewards/margins": 8.481249809265137,
"rewards/rejected": -1.6007812023162842,
"step": 1870
},
{
"epoch": 0.8826291079812206,
"grad_norm": 221.38172987684965,
"learning_rate": 6.546687532603024e-08,
"logits/chosen": -3.2093749046325684,
"logits/rejected": -3.176562547683716,
"logps/chosen": -545.7999877929688,
"logps/rejected": -642.0,
"loss": 2.7403,
"rewards/accuracies": 0.516406238079071,
"rewards/chosen": 7.334374904632568,
"rewards/margins": 8.307812690734863,
"rewards/rejected": -0.9735351800918579,
"step": 1880
},
{
"epoch": 0.8873239436619719,
"grad_norm": 240.72999782160588,
"learning_rate": 6.285863328116848e-08,
"logits/chosen": -3.190624952316284,
"logits/rejected": -3.1578125953674316,
"logps/chosen": -499.20001220703125,
"logps/rejected": -654.4000244140625,
"loss": 2.6771,
"rewards/accuracies": 0.522656261920929,
"rewards/chosen": 7.665625095367432,
"rewards/margins": 10.246874809265137,
"rewards/rejected": -2.581835985183716,
"step": 1890
},
{
"epoch": 0.892018779342723,
"grad_norm": 232.53227092972915,
"learning_rate": 6.025039123630672e-08,
"logits/chosen": -3.200000047683716,
"logits/rejected": -3.1578125953674316,
"logps/chosen": -558.4000244140625,
"logps/rejected": -610.0,
"loss": 2.9898,
"rewards/accuracies": 0.48359376192092896,
"rewards/chosen": 7.087500095367432,
"rewards/margins": 8.743749618530273,
"rewards/rejected": -1.6506836414337158,
"step": 1900
},
{
"epoch": 0.8967136150234741,
"grad_norm": 259.5993100857981,
"learning_rate": 5.764214919144496e-08,
"logits/chosen": -3.2109375,
"logits/rejected": -3.2249999046325684,
"logps/chosen": -567.2000122070312,
"logps/rejected": -604.0,
"loss": 3.2302,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 7.474999904632568,
"rewards/margins": 7.65625,
"rewards/rejected": -0.17148438096046448,
"step": 1910
},
{
"epoch": 0.9014084507042254,
"grad_norm": 272.03270384560665,
"learning_rate": 5.50339071465832e-08,
"logits/chosen": -3.1953125,
"logits/rejected": -3.21875,
"logps/chosen": -561.0,
"logps/rejected": -589.4000244140625,
"loss": 3.044,
"rewards/accuracies": 0.5101562738418579,
"rewards/chosen": 7.71875,
"rewards/margins": 7.587500095367432,
"rewards/rejected": 0.13071289658546448,
"step": 1920
},
{
"epoch": 0.9061032863849765,
"grad_norm": 347.3443264035594,
"learning_rate": 5.2425665101721436e-08,
"logits/chosen": -3.2046875953674316,
"logits/rejected": -3.089062452316284,
"logps/chosen": -564.2000122070312,
"logps/rejected": -682.7999877929688,
"loss": 3.1516,
"rewards/accuracies": 0.500781238079071,
"rewards/chosen": 6.857812404632568,
"rewards/margins": 8.010937690734863,
"rewards/rejected": -1.1521484851837158,
"step": 1930
},
{
"epoch": 0.9107981220657277,
"grad_norm": 262.9951136929276,
"learning_rate": 4.9817423056859675e-08,
"logits/chosen": -3.2171874046325684,
"logits/rejected": -3.1656250953674316,
"logps/chosen": -559.0,
"logps/rejected": -652.4000244140625,
"loss": 2.9059,
"rewards/accuracies": 0.5218750238418579,
"rewards/chosen": 7.0625,
"rewards/margins": 9.043749809265137,
"rewards/rejected": -1.981054663658142,
"step": 1940
},
{
"epoch": 0.9154929577464789,
"grad_norm": 253.21404195030644,
"learning_rate": 4.720918101199791e-08,
"logits/chosen": -3.1390624046325684,
"logits/rejected": -3.075000047683716,
"logps/chosen": -573.2000122070312,
"logps/rejected": -708.0,
"loss": 3.2897,
"rewards/accuracies": 0.514843761920929,
"rewards/chosen": 6.673437595367432,
"rewards/margins": 9.121874809265137,
"rewards/rejected": -2.451367139816284,
"step": 1950
},
{
"epoch": 0.92018779342723,
"grad_norm": 415.35862908474485,
"learning_rate": 4.460093896713615e-08,
"logits/chosen": -3.2265625,
"logits/rejected": -3.1421875953674316,
"logps/chosen": -549.2000122070312,
"logps/rejected": -660.7999877929688,
"loss": 3.0769,
"rewards/accuracies": 0.5210937261581421,
"rewards/chosen": 7.340624809265137,
"rewards/margins": 8.334375381469727,
"rewards/rejected": -0.9888671636581421,
"step": 1960
},
{
"epoch": 0.9248826291079812,
"grad_norm": 211.19149426581953,
"learning_rate": 4.199269692227438e-08,
"logits/chosen": -3.1937499046325684,
"logits/rejected": -3.2109375,
"logps/chosen": -544.7999877929688,
"logps/rejected": -597.2000122070312,
"loss": 3.3169,
"rewards/accuracies": 0.514843761920929,
"rewards/chosen": 7.521874904632568,
"rewards/margins": 7.584374904632568,
"rewards/rejected": -0.05854492262005806,
"step": 1970
},
{
"epoch": 0.9295774647887324,
"grad_norm": 246.97599476259882,
"learning_rate": 3.938445487741262e-08,
"logits/chosen": -3.1031250953674316,
"logits/rejected": -3.059375047683716,
"logps/chosen": -574.5999755859375,
"logps/rejected": -638.0,
"loss": 2.8105,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 7.303124904632568,
"rewards/margins": 9.528124809265137,
"rewards/rejected": -2.2249999046325684,
"step": 1980
},
{
"epoch": 0.9342723004694836,
"grad_norm": 279.61160915300826,
"learning_rate": 3.677621283255086e-08,
"logits/chosen": -3.246875047683716,
"logits/rejected": -3.2359375953674316,
"logps/chosen": -567.7999877929688,
"logps/rejected": -628.4000244140625,
"loss": 3.3836,
"rewards/accuracies": 0.5289062261581421,
"rewards/chosen": 7.5,
"rewards/margins": 8.337499618530273,
"rewards/rejected": -0.842089831829071,
"step": 1990
},
{
"epoch": 0.9389671361502347,
"grad_norm": 205.70295434604552,
"learning_rate": 3.41679707876891e-08,
"logits/chosen": -3.2203125953674316,
"logits/rejected": -3.125,
"logps/chosen": -554.4000244140625,
"logps/rejected": -696.4000244140625,
"loss": 3.1675,
"rewards/accuracies": 0.5289062261581421,
"rewards/chosen": 7.099999904632568,
"rewards/margins": 10.25,
"rewards/rejected": -3.1507811546325684,
"step": 2000
},
{
"epoch": 0.9436619718309859,
"grad_norm": 254.40619070914613,
"learning_rate": 3.155972874282733e-08,
"logits/chosen": -3.237499952316284,
"logits/rejected": -3.184375047683716,
"logps/chosen": -533.2000122070312,
"logps/rejected": -630.4000244140625,
"loss": 3.343,
"rewards/accuracies": 0.507031261920929,
"rewards/chosen": 6.965624809265137,
"rewards/margins": 7.915625095367432,
"rewards/rejected": -0.9505859613418579,
"step": 2010
},
{
"epoch": 0.9483568075117371,
"grad_norm": 220.33132929742834,
"learning_rate": 2.8951486697965573e-08,
"logits/chosen": -3.1937499046325684,
"logits/rejected": -3.1656250953674316,
"logps/chosen": -565.7999877929688,
"logps/rejected": -644.7999877929688,
"loss": 3.3804,
"rewards/accuracies": 0.508593738079071,
"rewards/chosen": 7.599999904632568,
"rewards/margins": 7.759375095367432,
"rewards/rejected": -0.16523437201976776,
"step": 2020
},
{
"epoch": 0.9530516431924883,
"grad_norm": 272.37246310322604,
"learning_rate": 2.634324465310381e-08,
"logits/chosen": -3.203125,
"logits/rejected": -3.1484375,
"logps/chosen": -542.0,
"logps/rejected": -635.5999755859375,
"loss": 3.0197,
"rewards/accuracies": 0.5179687738418579,
"rewards/chosen": 6.753125190734863,
"rewards/margins": 8.303125381469727,
"rewards/rejected": -1.5498046875,
"step": 2030
},
{
"epoch": 0.9577464788732394,
"grad_norm": 245.31063589482577,
"learning_rate": 2.3735002608242045e-08,
"logits/chosen": -3.21875,
"logits/rejected": -3.184375047683716,
"logps/chosen": -564.0,
"logps/rejected": -606.7999877929688,
"loss": 3.171,
"rewards/accuracies": 0.5179687738418579,
"rewards/chosen": 6.828125,
"rewards/margins": 8.393750190734863,
"rewards/rejected": -1.563085913658142,
"step": 2040
},
{
"epoch": 0.9624413145539906,
"grad_norm": 245.63046415376243,
"learning_rate": 2.1126760563380282e-08,
"logits/chosen": -3.231250047683716,
"logits/rejected": -3.265625,
"logps/chosen": -545.7999877929688,
"logps/rejected": -580.0,
"loss": 3.0817,
"rewards/accuracies": 0.5078125,
"rewards/chosen": 7.0625,
"rewards/margins": 7.109375,
"rewards/rejected": -0.04501952975988388,
"step": 2050
},
{
"epoch": 0.9671361502347418,
"grad_norm": 1711.8411407572087,
"learning_rate": 1.8518518518518518e-08,
"logits/chosen": -3.2171874046325684,
"logits/rejected": -3.1234374046325684,
"logps/chosen": -570.4000244140625,
"logps/rejected": -638.0,
"loss": 3.162,
"rewards/accuracies": 0.514843761920929,
"rewards/chosen": 7.546875,
"rewards/margins": 8.324999809265137,
"rewards/rejected": -0.786914050579071,
"step": 2060
},
{
"epoch": 0.971830985915493,
"grad_norm": 267.19856736301995,
"learning_rate": 1.5910276473656755e-08,
"logits/chosen": -3.168750047683716,
"logits/rejected": -3.0703125,
"logps/chosen": -549.2000122070312,
"logps/rejected": -635.5999755859375,
"loss": 3.026,
"rewards/accuracies": 0.508593738079071,
"rewards/chosen": 7.5078125,
"rewards/margins": 10.239843368530273,
"rewards/rejected": -2.7281250953674316,
"step": 2070
},
{
"epoch": 0.9765258215962441,
"grad_norm": 216.4347507416067,
"learning_rate": 1.3302034428794991e-08,
"logits/chosen": -3.253124952316284,
"logits/rejected": -3.2671875953674316,
"logps/chosen": -523.7999877929688,
"logps/rejected": -592.5999755859375,
"loss": 3.0064,
"rewards/accuracies": 0.5140625238418579,
"rewards/chosen": 7.137499809265137,
"rewards/margins": 7.170312404632568,
"rewards/rejected": -0.02773437462747097,
"step": 2080
},
{
"epoch": 0.9812206572769953,
"grad_norm": 260.8350621859381,
"learning_rate": 1.0693792383933229e-08,
"logits/chosen": -3.2515625953674316,
"logits/rejected": -3.2796874046325684,
"logps/chosen": -555.5999755859375,
"logps/rejected": -642.4000244140625,
"loss": 2.6919,
"rewards/accuracies": 0.510937511920929,
"rewards/chosen": 7.987500190734863,
"rewards/margins": 8.418749809265137,
"rewards/rejected": -0.4306640625,
"step": 2090
},
{
"epoch": 0.9859154929577465,
"grad_norm": 472.6432154571831,
"learning_rate": 8.085550339071465e-09,
"logits/chosen": -3.167187452316284,
"logits/rejected": -3.1343750953674316,
"logps/chosen": -573.4000244140625,
"logps/rejected": -673.7999877929688,
"loss": 2.7546,
"rewards/accuracies": 0.530468761920929,
"rewards/chosen": 7.675000190734863,
"rewards/margins": 8.987500190734863,
"rewards/rejected": -1.3237793445587158,
"step": 2100
},
{
"epoch": 0.9906103286384976,
"grad_norm": 572.7812509906279,
"learning_rate": 5.4773082942097025e-09,
"logits/chosen": -3.214062452316284,
"logits/rejected": -3.2125000953674316,
"logps/chosen": -541.4000244140625,
"logps/rejected": -610.7999877929688,
"loss": 3.079,
"rewards/accuracies": 0.522656261920929,
"rewards/chosen": 7.303124904632568,
"rewards/margins": 8.268750190734863,
"rewards/rejected": -0.9698241949081421,
"step": 2110
},
{
"epoch": 0.9953051643192489,
"grad_norm": 253.50766618560678,
"learning_rate": 2.8690662493479393e-09,
"logits/chosen": -3.206249952316284,
"logits/rejected": -3.120312452316284,
"logps/chosen": -548.7999877929688,
"logps/rejected": -624.7999877929688,
"loss": 3.3381,
"rewards/accuracies": 0.520312488079071,
"rewards/chosen": 7.053124904632568,
"rewards/margins": 8.865625381469727,
"rewards/rejected": -1.8049805164337158,
"step": 2120
},
{
"epoch": 1.0,
"grad_norm": 349.3079567367046,
"learning_rate": 2.608242044861763e-10,
"logits/chosen": -3.160937547683716,
"logits/rejected": -3.1968750953674316,
"logps/chosen": -544.7999877929688,
"logps/rejected": -640.7999877929688,
"loss": 3.374,
"rewards/accuracies": 0.5139768719673157,
"rewards/chosen": 7.634375095367432,
"rewards/margins": 11.021875381469727,
"rewards/rejected": -3.389453172683716,
"step": 2130
},
{
"epoch": 1.0,
"step": 2130,
"total_flos": 0.0,
"train_loss": 3.0317540988116196,
"train_runtime": 18550.6661,
"train_samples_per_second": 14.694,
"train_steps_per_second": 0.115
}
],
"logging_steps": 10,
"max_steps": 2130,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}