Files
Qwen3-8B-SOCIALIQA-DPO/checkpoint-1984/trainer_state.json

3309 lines
112 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 1984,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005040957781978576,
"grad_norm": 25.75,
"learning_rate": 3e-08,
"logits/chosen": -0.07891461253166199,
"logits/rejected": 0.004119270481169224,
"logps/chosen": -62.022430419921875,
"logps/rejected": -65.60428619384766,
"loss": 0.6949,
"rewards/accuracies": 0.3687500059604645,
"rewards/chosen": 0.0029429681599140167,
"rewards/margins": -0.0032222606241703033,
"rewards/rejected": 0.006165228318423033,
"step": 10
},
{
"epoch": 0.010081915563957152,
"grad_norm": 19.5,
"learning_rate": 6.333333333333333e-08,
"logits/chosen": -0.058114223182201385,
"logits/rejected": 0.02890823781490326,
"logps/chosen": -61.93231201171875,
"logps/rejected": -65.11712646484375,
"loss": 0.6907,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.008925376459956169,
"rewards/margins": 0.005283808801323175,
"rewards/rejected": 0.0036415669601410627,
"step": 20
},
{
"epoch": 0.015122873345935728,
"grad_norm": 25.0,
"learning_rate": 9.666666666666666e-08,
"logits/chosen": -0.04496127367019653,
"logits/rejected": 0.017042722553014755,
"logps/chosen": -61.5279541015625,
"logps/rejected": -65.24421691894531,
"loss": 0.691,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0035877160262316465,
"rewards/margins": 0.004559223540127277,
"rewards/rejected": -0.0009715079213492572,
"step": 30
},
{
"epoch": 0.020163831127914304,
"grad_norm": 29.25,
"learning_rate": 1.3e-07,
"logits/chosen": -0.0775897353887558,
"logits/rejected": -0.00422413507476449,
"logps/chosen": -61.86296463012695,
"logps/rejected": -65.46966552734375,
"loss": 0.6852,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.006382000632584095,
"rewards/margins": 0.016284234821796417,
"rewards/rejected": -0.009902234189212322,
"step": 40
},
{
"epoch": 0.02520478890989288,
"grad_norm": 32.0,
"learning_rate": 1.6333333333333331e-07,
"logits/chosen": -0.10396875441074371,
"logits/rejected": -0.028726909309625626,
"logps/chosen": -61.53949737548828,
"logps/rejected": -65.4787826538086,
"loss": 0.6777,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.0072999573312699795,
"rewards/margins": 0.03146541863679886,
"rewards/rejected": -0.02416546270251274,
"step": 50
},
{
"epoch": 0.030245746691871456,
"grad_norm": 28.125,
"learning_rate": 1.9666666666666665e-07,
"logits/chosen": -0.15684179961681366,
"logits/rejected": -0.08579285442829132,
"logps/chosen": -62.388755798339844,
"logps/rejected": -65.70417785644531,
"loss": 0.6749,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.004609875846654177,
"rewards/margins": 0.03752168267965317,
"rewards/rejected": -0.04213155806064606,
"step": 60
},
{
"epoch": 0.03528670447385003,
"grad_norm": 32.75,
"learning_rate": 2.3e-07,
"logits/chosen": -0.14031846821308136,
"logits/rejected": -0.054399728775024414,
"logps/chosen": -62.0086669921875,
"logps/rejected": -66.37232971191406,
"loss": 0.6629,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -0.01638166978955269,
"rewards/margins": 0.0622851625084877,
"rewards/rejected": -0.07866682857275009,
"step": 70
},
{
"epoch": 0.04032766225582861,
"grad_norm": 30.25,
"learning_rate": 2.633333333333333e-07,
"logits/chosen": -0.19501006603240967,
"logits/rejected": -0.1102059856057167,
"logps/chosen": -62.68883514404297,
"logps/rejected": -66.286865234375,
"loss": 0.6564,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.03311951085925102,
"rewards/margins": 0.07680130004882812,
"rewards/rejected": -0.10992081463336945,
"step": 80
},
{
"epoch": 0.045368620037807186,
"grad_norm": 32.0,
"learning_rate": 2.966666666666667e-07,
"logits/chosen": -0.24648718535900116,
"logits/rejected": -0.16621707379817963,
"logps/chosen": -62.35799026489258,
"logps/rejected": -67.0835189819336,
"loss": 0.6371,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.039066143333911896,
"rewards/margins": 0.1181197538971901,
"rewards/rejected": -0.1571858823299408,
"step": 90
},
{
"epoch": 0.05040957781978576,
"grad_norm": 26.0,
"learning_rate": 3.3e-07,
"logits/chosen": -0.2515576481819153,
"logits/rejected": -0.19324862957000732,
"logps/chosen": -62.55232620239258,
"logps/rejected": -67.55314636230469,
"loss": 0.6217,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -0.06582323461771011,
"rewards/margins": 0.15158048272132874,
"rewards/rejected": -0.21740372478961945,
"step": 100
},
{
"epoch": 0.05040957781978576,
"eval_logits/chosen": -0.3329714834690094,
"eval_logits/rejected": -0.27068039774894714,
"eval_logps/chosen": -62.82643508911133,
"eval_logps/rejected": -67.86115264892578,
"eval_loss": 0.6163578629493713,
"eval_rewards/accuracies": 0.8772454857826233,
"eval_rewards/chosen": -0.08201639354228973,
"eval_rewards/margins": 0.16533808410167694,
"eval_rewards/rejected": -0.24735447764396667,
"eval_runtime": 71.6288,
"eval_samples_per_second": 23.315,
"eval_steps_per_second": 23.315,
"step": 100
},
{
"epoch": 0.055450535601764335,
"grad_norm": 34.5,
"learning_rate": 3.6333333333333333e-07,
"logits/chosen": -0.38161173462867737,
"logits/rejected": -0.3071076273918152,
"logps/chosen": -62.72125244140625,
"logps/rejected": -68.94508361816406,
"loss": 0.5933,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.07404644042253494,
"rewards/margins": 0.2169165164232254,
"rewards/rejected": -0.29096299409866333,
"step": 110
},
{
"epoch": 0.06049149338374291,
"grad_norm": 25.375,
"learning_rate": 3.9666666666666665e-07,
"logits/chosen": -0.40286582708358765,
"logits/rejected": -0.34470418095588684,
"logps/chosen": -63.3223762512207,
"logps/rejected": -68.89364624023438,
"loss": 0.5858,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -0.1222512498497963,
"rewards/margins": 0.23771443963050842,
"rewards/rejected": -0.3599656820297241,
"step": 120
},
{
"epoch": 0.06553245116572148,
"grad_norm": 23.125,
"learning_rate": 4.2999999999999996e-07,
"logits/chosen": -0.4781390130519867,
"logits/rejected": -0.4242860674858093,
"logps/chosen": -63.421775817871094,
"logps/rejected": -69.99397277832031,
"loss": 0.5631,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.15090402960777283,
"rewards/margins": 0.29352977871894836,
"rewards/rejected": -0.4444337785243988,
"step": 130
},
{
"epoch": 0.07057340894770006,
"grad_norm": 23.25,
"learning_rate": 4.633333333333333e-07,
"logits/chosen": -0.5767303705215454,
"logits/rejected": -0.5158231854438782,
"logps/chosen": -64.02821350097656,
"logps/rejected": -70.48841857910156,
"loss": 0.5487,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -0.18629249930381775,
"rewards/margins": 0.34241610765457153,
"rewards/rejected": -0.5287086367607117,
"step": 140
},
{
"epoch": 0.07561436672967864,
"grad_norm": 23.125,
"learning_rate": 4.966666666666666e-07,
"logits/chosen": -0.6403996348381042,
"logits/rejected": -0.6139777898788452,
"logps/chosen": -63.662750244140625,
"logps/rejected": -71.99102020263672,
"loss": 0.5035,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.180726557970047,
"rewards/margins": 0.4561690390110016,
"rewards/rejected": -0.6368955373764038,
"step": 150
},
{
"epoch": 0.08065532451165722,
"grad_norm": 24.0,
"learning_rate": 4.975463467829879e-07,
"logits/chosen": -0.6926871538162231,
"logits/rejected": -0.6587594747543335,
"logps/chosen": -63.4420051574707,
"logps/rejected": -72.86981201171875,
"loss": 0.4668,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.1938735544681549,
"rewards/margins": 0.5592324137687683,
"rewards/rejected": -0.7531059980392456,
"step": 160
},
{
"epoch": 0.0856962822936358,
"grad_norm": 23.125,
"learning_rate": 4.948200654307524e-07,
"logits/chosen": -0.7586608529090881,
"logits/rejected": -0.737440824508667,
"logps/chosen": -64.16552734375,
"logps/rejected": -72.98350524902344,
"loss": 0.4687,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.22095069289207458,
"rewards/margins": 0.5723496675491333,
"rewards/rejected": -0.7933003306388855,
"step": 170
},
{
"epoch": 0.09073724007561437,
"grad_norm": 22.5,
"learning_rate": 4.920937840785169e-07,
"logits/chosen": -0.8342425227165222,
"logits/rejected": -0.8266761898994446,
"logps/chosen": -64.02726745605469,
"logps/rejected": -74.54716491699219,
"loss": 0.438,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -0.22078147530555725,
"rewards/margins": 0.6804903745651245,
"rewards/rejected": -0.9012719392776489,
"step": 180
},
{
"epoch": 0.09577819785759294,
"grad_norm": 22.125,
"learning_rate": 4.893675027262814e-07,
"logits/chosen": -0.8242881894111633,
"logits/rejected": -0.8162325620651245,
"logps/chosen": -64.03907012939453,
"logps/rejected": -75.18043518066406,
"loss": 0.4107,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.21485848724842072,
"rewards/margins": 0.7709285616874695,
"rewards/rejected": -0.985787034034729,
"step": 190
},
{
"epoch": 0.10081915563957151,
"grad_norm": 19.375,
"learning_rate": 4.866412213740458e-07,
"logits/chosen": -0.8690522909164429,
"logits/rejected": -0.8740390539169312,
"logps/chosen": -63.641624450683594,
"logps/rejected": -75.82588195800781,
"loss": 0.3957,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.1833191215991974,
"rewards/margins": 0.8537132143974304,
"rewards/rejected": -1.0370323657989502,
"step": 200
},
{
"epoch": 0.10081915563957151,
"eval_logits/chosen": -0.8788526058197021,
"eval_logits/rejected": -0.8778823018074036,
"eval_logps/chosen": -64.24597930908203,
"eval_logps/rejected": -75.78997802734375,
"eval_loss": 0.41245341300964355,
"eval_rewards/accuracies": 0.8772454857826233,
"eval_rewards/chosen": -0.22397060692310333,
"eval_rewards/margins": 0.8162661790847778,
"eval_rewards/rejected": -1.0402368307113647,
"eval_runtime": 73.4377,
"eval_samples_per_second": 22.74,
"eval_steps_per_second": 22.74,
"step": 200
},
{
"epoch": 0.10586011342155009,
"grad_norm": 16.375,
"learning_rate": 4.839149400218102e-07,
"logits/chosen": -0.9034843444824219,
"logits/rejected": -0.9223787188529968,
"logps/chosen": -63.76435089111328,
"logps/rejected": -76.31708526611328,
"loss": 0.3748,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -0.18892225623130798,
"rewards/margins": 0.9179438352584839,
"rewards/rejected": -1.1068661212921143,
"step": 210
},
{
"epoch": 0.11090107120352867,
"grad_norm": 19.25,
"learning_rate": 4.811886586695747e-07,
"logits/chosen": -0.9078966379165649,
"logits/rejected": -0.9262178540229797,
"logps/chosen": -63.7225456237793,
"logps/rejected": -76.8105239868164,
"loss": 0.3902,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.18831631541252136,
"rewards/margins": 0.9241956472396851,
"rewards/rejected": -1.1125118732452393,
"step": 220
},
{
"epoch": 0.11594202898550725,
"grad_norm": 20.5,
"learning_rate": 4.784623773173392e-07,
"logits/chosen": -0.9634265899658203,
"logits/rejected": -0.9702705144882202,
"logps/chosen": -63.542388916015625,
"logps/rejected": -77.10391998291016,
"loss": 0.3663,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.19163846969604492,
"rewards/margins": 0.9870219230651855,
"rewards/rejected": -1.1786603927612305,
"step": 230
},
{
"epoch": 0.12098298676748583,
"grad_norm": 19.125,
"learning_rate": 4.7573609596510354e-07,
"logits/chosen": -0.920444130897522,
"logits/rejected": -0.9512295722961426,
"logps/chosen": -63.34272384643555,
"logps/rejected": -77.4976577758789,
"loss": 0.3607,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.14525040984153748,
"rewards/margins": 1.072211503982544,
"rewards/rejected": -1.2174618244171143,
"step": 240
},
{
"epoch": 0.1260239445494644,
"grad_norm": 19.125,
"learning_rate": 4.7300981461286803e-07,
"logits/chosen": -0.9372714757919312,
"logits/rejected": -0.9608826637268066,
"logps/chosen": -63.630760192871094,
"logps/rejected": -77.2890625,
"loss": 0.3813,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.14888228476047516,
"rewards/margins": 1.0466125011444092,
"rewards/rejected": -1.1954947710037231,
"step": 250
},
{
"epoch": 0.13106490233144297,
"grad_norm": 16.125,
"learning_rate": 4.7028353326063247e-07,
"logits/chosen": -0.9241989254951477,
"logits/rejected": -0.9565703272819519,
"logps/chosen": -62.576759338378906,
"logps/rejected": -78.99192810058594,
"loss": 0.2995,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.06501423567533493,
"rewards/margins": 1.2511422634124756,
"rewards/rejected": -1.3161563873291016,
"step": 260
},
{
"epoch": 0.13610586011342155,
"grad_norm": 18.375,
"learning_rate": 4.675572519083969e-07,
"logits/chosen": -0.9289599657058716,
"logits/rejected": -0.959551990032196,
"logps/chosen": -64.06565856933594,
"logps/rejected": -78.49452209472656,
"loss": 0.365,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.16143986582756042,
"rewards/margins": 1.1138832569122314,
"rewards/rejected": -1.2753230333328247,
"step": 270
},
{
"epoch": 0.14114681789540012,
"grad_norm": 14.75,
"learning_rate": 4.648309705561614e-07,
"logits/chosen": -0.9531835317611694,
"logits/rejected": -0.9772550463676453,
"logps/chosen": -63.0753173828125,
"logps/rejected": -78.2010726928711,
"loss": 0.3479,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -0.13170328736305237,
"rewards/margins": 1.1704909801483154,
"rewards/rejected": -1.3021942377090454,
"step": 280
},
{
"epoch": 0.1461877756773787,
"grad_norm": 14.125,
"learning_rate": 4.6210468920392583e-07,
"logits/chosen": -0.9216547012329102,
"logits/rejected": -0.9838453531265259,
"logps/chosen": -62.03778076171875,
"logps/rejected": -79.33902740478516,
"loss": 0.2986,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.029102539643645287,
"rewards/margins": 1.3270413875579834,
"rewards/rejected": -1.3561439514160156,
"step": 290
},
{
"epoch": 0.15122873345935728,
"grad_norm": 18.375,
"learning_rate": 4.5937840785169027e-07,
"logits/chosen": -0.9648447036743164,
"logits/rejected": -1.0225803852081299,
"logps/chosen": -61.83771514892578,
"logps/rejected": -79.71820831298828,
"loss": 0.2767,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.035991497337818146,
"rewards/margins": 1.3972457647323608,
"rewards/rejected": -1.4332373142242432,
"step": 300
},
{
"epoch": 0.15122873345935728,
"eval_logits/chosen": -0.977104663848877,
"eval_logits/rejected": -1.0141139030456543,
"eval_logps/chosen": -63.36873245239258,
"eval_logps/rejected": -79.43486022949219,
"eval_loss": 0.3420425057411194,
"eval_rewards/accuracies": 0.8808383345603943,
"eval_rewards/chosen": -0.1362457126379013,
"eval_rewards/margins": 1.2684792280197144,
"eval_rewards/rejected": -1.4047249555587769,
"eval_runtime": 70.6339,
"eval_samples_per_second": 23.643,
"eval_steps_per_second": 23.643,
"step": 300
},
{
"epoch": 0.15626969124133586,
"grad_norm": 13.75,
"learning_rate": 4.566521264994547e-07,
"logits/chosen": -0.9742618799209595,
"logits/rejected": -1.0176128149032593,
"logps/chosen": -63.59131622314453,
"logps/rejected": -79.64852905273438,
"loss": 0.3397,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.14291712641716003,
"rewards/margins": 1.2807643413543701,
"rewards/rejected": -1.4236814975738525,
"step": 310
},
{
"epoch": 0.16131064902331443,
"grad_norm": 21.625,
"learning_rate": 4.5392584514721915e-07,
"logits/chosen": -0.9295794367790222,
"logits/rejected": -0.9652513265609741,
"logps/chosen": -62.136070251464844,
"logps/rejected": -80.7836685180664,
"loss": 0.2774,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -0.03530962020158768,
"rewards/margins": 1.468548059463501,
"rewards/rejected": -1.5038578510284424,
"step": 320
},
{
"epoch": 0.166351606805293,
"grad_norm": 14.4375,
"learning_rate": 4.5119956379498364e-07,
"logits/chosen": -0.9235115051269531,
"logits/rejected": -0.9803950190544128,
"logps/chosen": -62.91417694091797,
"logps/rejected": -80.86149597167969,
"loss": 0.292,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.07058247923851013,
"rewards/margins": 1.4306252002716064,
"rewards/rejected": -1.5012075901031494,
"step": 330
},
{
"epoch": 0.1713925645872716,
"grad_norm": 22.875,
"learning_rate": 4.484732824427481e-07,
"logits/chosen": -0.9881145358085632,
"logits/rejected": -1.0382649898529053,
"logps/chosen": -62.81303787231445,
"logps/rejected": -81.26265716552734,
"loss": 0.278,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -0.061654943972826004,
"rewards/margins": 1.5001782178878784,
"rewards/rejected": -1.561833143234253,
"step": 340
},
{
"epoch": 0.17643352236925017,
"grad_norm": 10.25,
"learning_rate": 4.457470010905125e-07,
"logits/chosen": -0.9716188311576843,
"logits/rejected": -1.0232843160629272,
"logps/chosen": -63.168006896972656,
"logps/rejected": -79.84440612792969,
"loss": 0.3661,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.10739569365978241,
"rewards/margins": 1.3512216806411743,
"rewards/rejected": -1.4586174488067627,
"step": 350
},
{
"epoch": 0.18147448015122875,
"grad_norm": 16.125,
"learning_rate": 4.43020719738277e-07,
"logits/chosen": -0.9138787388801575,
"logits/rejected": -0.9851690530776978,
"logps/chosen": -61.79378128051758,
"logps/rejected": -81.6797866821289,
"loss": 0.2695,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.00970448087900877,
"rewards/margins": 1.6192023754119873,
"rewards/rejected": -1.6094980239868164,
"step": 360
},
{
"epoch": 0.18651543793320732,
"grad_norm": 12.0625,
"learning_rate": 4.402944383860414e-07,
"logits/chosen": -0.932845950126648,
"logits/rejected": -0.98590487241745,
"logps/chosen": -62.03528594970703,
"logps/rejected": -81.13326263427734,
"loss": 0.2783,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.0034210742451250553,
"rewards/margins": 1.5679662227630615,
"rewards/rejected": -1.5713872909545898,
"step": 370
},
{
"epoch": 0.19155639571518587,
"grad_norm": 9.625,
"learning_rate": 4.375681570338059e-07,
"logits/chosen": -0.9335094690322876,
"logits/rejected": -0.991866946220398,
"logps/chosen": -61.888038635253906,
"logps/rejected": -80.99397277832031,
"loss": 0.3039,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.0033101304434239864,
"rewards/margins": 1.5600807666778564,
"rewards/rejected": -1.5567705631256104,
"step": 380
},
{
"epoch": 0.19659735349716445,
"grad_norm": 24.5,
"learning_rate": 4.348418756815703e-07,
"logits/chosen": -0.9722970724105835,
"logits/rejected": -1.0437185764312744,
"logps/chosen": -62.06909942626953,
"logps/rejected": -80.99043273925781,
"loss": 0.2994,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.017526980489492416,
"rewards/margins": 1.57229483127594,
"rewards/rejected": -1.5898219347000122,
"step": 390
},
{
"epoch": 0.20163831127914303,
"grad_norm": 14.3125,
"learning_rate": 4.3211559432933476e-07,
"logits/chosen": -0.9651460647583008,
"logits/rejected": -1.0433061122894287,
"logps/chosen": -61.74712371826172,
"logps/rejected": -82.36299133300781,
"loss": 0.2577,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.03247659280896187,
"rewards/margins": 1.6827892065048218,
"rewards/rejected": -1.6503126621246338,
"step": 400
},
{
"epoch": 0.20163831127914303,
"eval_logits/chosen": -0.9533343315124512,
"eval_logits/rejected": -1.0109245777130127,
"eval_logps/chosen": -62.41316604614258,
"eval_logps/rejected": -81.344482421875,
"eval_loss": 0.3170239329338074,
"eval_rewards/accuracies": 0.8856287598609924,
"eval_rewards/chosen": -0.040688931941986084,
"eval_rewards/margins": 1.5549986362457275,
"eval_rewards/rejected": -1.5956875085830688,
"eval_runtime": 71.3317,
"eval_samples_per_second": 23.412,
"eval_steps_per_second": 23.412,
"step": 400
},
{
"epoch": 0.2066792690611216,
"grad_norm": 24.25,
"learning_rate": 4.2938931297709925e-07,
"logits/chosen": -0.9435351490974426,
"logits/rejected": -1.0250452756881714,
"logps/chosen": -61.98693084716797,
"logps/rejected": -82.12709045410156,
"loss": 0.2696,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.0036094195675104856,
"rewards/margins": 1.654160737991333,
"rewards/rejected": -1.650551199913025,
"step": 410
},
{
"epoch": 0.21172022684310018,
"grad_norm": 11.5625,
"learning_rate": 4.266630316248637e-07,
"logits/chosen": -0.9397176504135132,
"logits/rejected": -1.0133641958236694,
"logps/chosen": -61.4076042175293,
"logps/rejected": -83.43376159667969,
"loss": 0.258,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.0406290665268898,
"rewards/margins": 1.7928526401519775,
"rewards/rejected": -1.7522236108779907,
"step": 420
},
{
"epoch": 0.21676118462507876,
"grad_norm": 17.75,
"learning_rate": 4.239367502726281e-07,
"logits/chosen": -0.9263733625411987,
"logits/rejected": -1.007187843322754,
"logps/chosen": -60.838951110839844,
"logps/rejected": -82.61154174804688,
"loss": 0.2287,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 0.09760002046823502,
"rewards/margins": 1.8123610019683838,
"rewards/rejected": -1.7147607803344727,
"step": 430
},
{
"epoch": 0.22180214240705734,
"grad_norm": 18.375,
"learning_rate": 4.2121046892039257e-07,
"logits/chosen": -0.9172664880752563,
"logits/rejected": -0.98748779296875,
"logps/chosen": -61.5716667175293,
"logps/rejected": -82.24566650390625,
"loss": 0.2758,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.008749181404709816,
"rewards/margins": 1.7142765522003174,
"rewards/rejected": -1.7055273056030273,
"step": 440
},
{
"epoch": 0.22684310018903592,
"grad_norm": 28.75,
"learning_rate": 4.18484187568157e-07,
"logits/chosen": -0.9780591726303101,
"logits/rejected": -1.02444326877594,
"logps/chosen": -62.16161346435547,
"logps/rejected": -81.88623046875,
"loss": 0.2933,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.024681804701685905,
"rewards/margins": 1.6520192623138428,
"rewards/rejected": -1.6767011880874634,
"step": 450
},
{
"epoch": 0.2318840579710145,
"grad_norm": 15.375,
"learning_rate": 4.1575790621592144e-07,
"logits/chosen": -0.9287646412849426,
"logits/rejected": -0.9916049838066101,
"logps/chosen": -61.4310188293457,
"logps/rejected": -82.45499420166016,
"loss": 0.2939,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.04269082099199295,
"rewards/margins": 1.7475837469100952,
"rewards/rejected": -1.704892873764038,
"step": 460
},
{
"epoch": 0.23692501575299307,
"grad_norm": 13.75,
"learning_rate": 4.1303162486368593e-07,
"logits/chosen": -0.9281194806098938,
"logits/rejected": -1.0078294277191162,
"logps/chosen": -60.9696159362793,
"logps/rejected": -82.53529357910156,
"loss": 0.2433,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.07616719603538513,
"rewards/margins": 1.8099626302719116,
"rewards/rejected": -1.7337955236434937,
"step": 470
},
{
"epoch": 0.24196597353497165,
"grad_norm": 24.125,
"learning_rate": 4.1030534351145037e-07,
"logits/chosen": -0.9563441276550293,
"logits/rejected": -1.0490442514419556,
"logps/chosen": -61.71998977661133,
"logps/rejected": -81.29328918457031,
"loss": 0.3422,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.017765596508979797,
"rewards/margins": 1.6064462661743164,
"rewards/rejected": -1.5886808633804321,
"step": 480
},
{
"epoch": 0.24700693131695023,
"grad_norm": 18.75,
"learning_rate": 4.075790621592148e-07,
"logits/chosen": -0.9509885907173157,
"logits/rejected": -1.0152177810668945,
"logps/chosen": -60.49787139892578,
"logps/rejected": -82.62560272216797,
"loss": 0.2712,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.13019904494285583,
"rewards/margins": 1.8366725444793701,
"rewards/rejected": -1.7064735889434814,
"step": 490
},
{
"epoch": 0.2520478890989288,
"grad_norm": 20.0,
"learning_rate": 4.0485278080697925e-07,
"logits/chosen": -0.9022938013076782,
"logits/rejected": -0.9922693371772766,
"logps/chosen": -61.09675216674805,
"logps/rejected": -82.68385314941406,
"loss": 0.2781,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.08039947599172592,
"rewards/margins": 1.8100239038467407,
"rewards/rejected": -1.7296245098114014,
"step": 500
},
{
"epoch": 0.2520478890989288,
"eval_logits/chosen": -0.9490191340446472,
"eval_logits/rejected": -1.0179123878479004,
"eval_logps/chosen": -61.8110466003418,
"eval_logps/rejected": -82.53450775146484,
"eval_loss": 0.30638858675956726,
"eval_rewards/accuracies": 0.8856287598609924,
"eval_rewards/chosen": 0.019522832706570625,
"eval_rewards/margins": 1.734211802482605,
"eval_rewards/rejected": -1.7146891355514526,
"eval_runtime": 72.5423,
"eval_samples_per_second": 23.021,
"eval_steps_per_second": 23.021,
"step": 500
},
{
"epoch": 0.2570888468809074,
"grad_norm": 32.25,
"learning_rate": 4.021264994547437e-07,
"logits/chosen": -0.9350666999816895,
"logits/rejected": -1.0141990184783936,
"logps/chosen": -61.0509033203125,
"logps/rejected": -82.5855712890625,
"loss": 0.2704,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.08123103529214859,
"rewards/margins": 1.7908775806427002,
"rewards/rejected": -1.7096465826034546,
"step": 510
},
{
"epoch": 0.26212980466288593,
"grad_norm": 25.75,
"learning_rate": 3.994002181025082e-07,
"logits/chosen": -0.9520455598831177,
"logits/rejected": -1.0035117864608765,
"logps/chosen": -60.493560791015625,
"logps/rejected": -82.90555572509766,
"loss": 0.2678,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.1298852264881134,
"rewards/margins": 1.8753770589828491,
"rewards/rejected": -1.7454917430877686,
"step": 520
},
{
"epoch": 0.26717076244486454,
"grad_norm": 9.375,
"learning_rate": 3.966739367502726e-07,
"logits/chosen": -0.9617950320243835,
"logits/rejected": -1.0328407287597656,
"logps/chosen": -61.49128341674805,
"logps/rejected": -81.98192596435547,
"loss": 0.3276,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.05107005685567856,
"rewards/margins": 1.7408430576324463,
"rewards/rejected": -1.6897728443145752,
"step": 530
},
{
"epoch": 0.2722117202268431,
"grad_norm": 24.375,
"learning_rate": 3.9394765539803705e-07,
"logits/chosen": -0.9145433306694031,
"logits/rejected": -0.9903620481491089,
"logps/chosen": -60.99120330810547,
"logps/rejected": -83.13854217529297,
"loss": 0.2787,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.09986008703708649,
"rewards/margins": 1.8553259372711182,
"rewards/rejected": -1.7554658651351929,
"step": 540
},
{
"epoch": 0.2772526780088217,
"grad_norm": 24.125,
"learning_rate": 3.9122137404580155e-07,
"logits/chosen": -0.9315235018730164,
"logits/rejected": -1.0184853076934814,
"logps/chosen": -61.380882263183594,
"logps/rejected": -83.67707061767578,
"loss": 0.2747,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.0887889489531517,
"rewards/margins": 1.894553780555725,
"rewards/rejected": -1.8057647943496704,
"step": 550
},
{
"epoch": 0.28229363579080025,
"grad_norm": 11.875,
"learning_rate": 3.8849509269356593e-07,
"logits/chosen": -0.8931863903999329,
"logits/rejected": -0.9775617718696594,
"logps/chosen": -60.56890869140625,
"logps/rejected": -83.23638916015625,
"loss": 0.2602,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.1255245953798294,
"rewards/margins": 1.8955485820770264,
"rewards/rejected": -1.770024061203003,
"step": 560
},
{
"epoch": 0.28733459357277885,
"grad_norm": 13.0,
"learning_rate": 3.8576881134133037e-07,
"logits/chosen": -0.9146768450737,
"logits/rejected": -1.018970251083374,
"logps/chosen": -60.158790588378906,
"logps/rejected": -83.05947875976562,
"loss": 0.258,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.1570456475019455,
"rewards/margins": 1.9360195398330688,
"rewards/rejected": -1.778973937034607,
"step": 570
},
{
"epoch": 0.2923755513547574,
"grad_norm": 25.0,
"learning_rate": 3.8304252998909486e-07,
"logits/chosen": -0.899819552898407,
"logits/rejected": -0.990888774394989,
"logps/chosen": -61.66679763793945,
"logps/rejected": -81.74058532714844,
"loss": 0.3258,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.05306190997362137,
"rewards/margins": 1.7001397609710693,
"rewards/rejected": -1.6470777988433838,
"step": 580
},
{
"epoch": 0.297416509136736,
"grad_norm": 15.1875,
"learning_rate": 3.803162486368593e-07,
"logits/chosen": -0.9033399820327759,
"logits/rejected": -0.9803060293197632,
"logps/chosen": -60.785057067871094,
"logps/rejected": -83.88690185546875,
"loss": 0.2538,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.10141563415527344,
"rewards/margins": 1.9382518529891968,
"rewards/rejected": -1.8368362188339233,
"step": 590
},
{
"epoch": 0.30245746691871456,
"grad_norm": 15.25,
"learning_rate": 3.775899672846238e-07,
"logits/chosen": -0.9281272888183594,
"logits/rejected": -1.005056619644165,
"logps/chosen": -62.294593811035156,
"logps/rejected": -82.4925308227539,
"loss": 0.3627,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.008905407041311264,
"rewards/margins": 1.7188129425048828,
"rewards/rejected": -1.7099075317382812,
"step": 600
},
{
"epoch": 0.30245746691871456,
"eval_logits/chosen": -0.9133957028388977,
"eval_logits/rejected": -0.9896692037582397,
"eval_logps/chosen": -61.07933807373047,
"eval_logps/rejected": -82.90621948242188,
"eval_loss": 0.30104419589042664,
"eval_rewards/accuracies": 0.8844311237335205,
"eval_rewards/chosen": 0.09269363433122635,
"eval_rewards/margins": 1.8445546627044678,
"eval_rewards/rejected": -1.7518609762191772,
"eval_runtime": 73.1888,
"eval_samples_per_second": 22.818,
"eval_steps_per_second": 22.818,
"step": 600
},
{
"epoch": 0.3074984247006931,
"grad_norm": 30.5,
"learning_rate": 3.7486368593238823e-07,
"logits/chosen": -0.9185993075370789,
"logits/rejected": -1.0144493579864502,
"logps/chosen": -61.656349182128906,
"logps/rejected": -81.76588439941406,
"loss": 0.3439,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.051804833114147186,
"rewards/margins": 1.708714485168457,
"rewards/rejected": -1.6569095849990845,
"step": 610
},
{
"epoch": 0.3125393824826717,
"grad_norm": 18.0,
"learning_rate": 3.721374045801526e-07,
"logits/chosen": -0.9089874029159546,
"logits/rejected": -0.9995435476303101,
"logps/chosen": -60.14899444580078,
"logps/rejected": -82.75482177734375,
"loss": 0.2812,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.16136237978935242,
"rewards/margins": 1.9296493530273438,
"rewards/rejected": -1.768286943435669,
"step": 620
},
{
"epoch": 0.31758034026465026,
"grad_norm": 28.0,
"learning_rate": 3.694111232279171e-07,
"logits/chosen": -0.9505274891853333,
"logits/rejected": -1.028907060623169,
"logps/chosen": -60.5921516418457,
"logps/rejected": -83.64252471923828,
"loss": 0.2844,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.13824741542339325,
"rewards/margins": 1.9466102123260498,
"rewards/rejected": -1.808362603187561,
"step": 630
},
{
"epoch": 0.32262129804662887,
"grad_norm": 22.625,
"learning_rate": 3.6668484187568154e-07,
"logits/chosen": -0.9480802416801453,
"logits/rejected": -1.0301778316497803,
"logps/chosen": -60.25774002075195,
"logps/rejected": -82.69186401367188,
"loss": 0.2888,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.15573999285697937,
"rewards/margins": 1.877264380455017,
"rewards/rejected": -1.7215244770050049,
"step": 640
},
{
"epoch": 0.3276622558286074,
"grad_norm": 28.25,
"learning_rate": 3.63958560523446e-07,
"logits/chosen": -0.9399679899215698,
"logits/rejected": -0.9984865188598633,
"logps/chosen": -61.39536666870117,
"logps/rejected": -82.51799774169922,
"loss": 0.3268,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.06745560467243195,
"rewards/margins": 1.796059250831604,
"rewards/rejected": -1.7286036014556885,
"step": 650
},
{
"epoch": 0.332703213610586,
"grad_norm": 30.125,
"learning_rate": 3.6123227917121047e-07,
"logits/chosen": -0.9196429252624512,
"logits/rejected": -1.0079903602600098,
"logps/chosen": -60.20268630981445,
"logps/rejected": -84.8389663696289,
"loss": 0.2459,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.16887502372264862,
"rewards/margins": 2.105717420578003,
"rewards/rejected": -1.9368425607681274,
"step": 660
},
{
"epoch": 0.3377441713925646,
"grad_norm": 26.75,
"learning_rate": 3.585059978189749e-07,
"logits/chosen": -0.9201717376708984,
"logits/rejected": -1.001564621925354,
"logps/chosen": -60.38823318481445,
"logps/rejected": -83.207275390625,
"loss": 0.2703,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.17042402923107147,
"rewards/margins": 1.9416764974594116,
"rewards/rejected": -1.7712528705596924,
"step": 670
},
{
"epoch": 0.3427851291745432,
"grad_norm": 10.3125,
"learning_rate": 3.557797164667394e-07,
"logits/chosen": -0.8961941003799438,
"logits/rejected": -0.9779438972473145,
"logps/chosen": -59.930824279785156,
"logps/rejected": -83.56443786621094,
"loss": 0.2806,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.18050600588321686,
"rewards/margins": 1.9991801977157593,
"rewards/rejected": -1.818674087524414,
"step": 680
},
{
"epoch": 0.34782608695652173,
"grad_norm": 22.25,
"learning_rate": 3.530534351145038e-07,
"logits/chosen": -0.9111859202384949,
"logits/rejected": -1.0103601217269897,
"logps/chosen": -59.5981559753418,
"logps/rejected": -84.48451232910156,
"loss": 0.2155,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.22089195251464844,
"rewards/margins": 2.098818063735962,
"rewards/rejected": -1.8779258728027344,
"step": 690
},
{
"epoch": 0.35286704473850034,
"grad_norm": 8.4375,
"learning_rate": 3.503271537622682e-07,
"logits/chosen": -0.8836026191711426,
"logits/rejected": -0.9876821637153625,
"logps/chosen": -59.78227615356445,
"logps/rejected": -84.5053939819336,
"loss": 0.2411,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.20316052436828613,
"rewards/margins": 2.080014944076538,
"rewards/rejected": -1.8768543004989624,
"step": 700
},
{
"epoch": 0.35286704473850034,
"eval_logits/chosen": -0.9257686138153076,
"eval_logits/rejected": -1.0098994970321655,
"eval_logps/chosen": -60.61675262451172,
"eval_logps/rejected": -83.34400939941406,
"eval_loss": 0.2979341447353363,
"eval_rewards/accuracies": 0.886227548122406,
"eval_rewards/chosen": 0.13895215094089508,
"eval_rewards/margins": 1.9345914125442505,
"eval_rewards/rejected": -1.7956393957138062,
"eval_runtime": 73.2808,
"eval_samples_per_second": 22.789,
"eval_steps_per_second": 22.789,
"step": 700
},
{
"epoch": 0.3579080025204789,
"grad_norm": 11.8125,
"learning_rate": 3.476008724100327e-07,
"logits/chosen": -0.8866975903511047,
"logits/rejected": -0.9669508934020996,
"logps/chosen": -60.970123291015625,
"logps/rejected": -83.34266662597656,
"loss": 0.3138,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.0989324077963829,
"rewards/margins": 1.9064470529556274,
"rewards/rejected": -1.8075145483016968,
"step": 710
},
{
"epoch": 0.3629489603024575,
"grad_norm": 12.125,
"learning_rate": 3.4487459105779716e-07,
"logits/chosen": -0.8704292178153992,
"logits/rejected": -0.9828470349311829,
"logps/chosen": -58.7971076965332,
"logps/rejected": -83.87742614746094,
"loss": 0.2295,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.2823036015033722,
"rewards/margins": 2.1361794471740723,
"rewards/rejected": -1.8538758754730225,
"step": 720
},
{
"epoch": 0.36798991808443604,
"grad_norm": 27.75,
"learning_rate": 3.421483097055616e-07,
"logits/chosen": -0.8949785232543945,
"logits/rejected": -0.9850034713745117,
"logps/chosen": -59.54541778564453,
"logps/rejected": -85.09830474853516,
"loss": 0.1848,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.2564947009086609,
"rewards/margins": 2.1756319999694824,
"rewards/rejected": -1.9191375970840454,
"step": 730
},
{
"epoch": 0.37303087586641465,
"grad_norm": 30.625,
"learning_rate": 3.394220283533261e-07,
"logits/chosen": -0.9334227442741394,
"logits/rejected": -1.010840654373169,
"logps/chosen": -60.2901725769043,
"logps/rejected": -84.69157409667969,
"loss": 0.2416,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.15744934976100922,
"rewards/margins": 2.060612678527832,
"rewards/rejected": -1.9031633138656616,
"step": 740
},
{
"epoch": 0.3780718336483932,
"grad_norm": 20.0,
"learning_rate": 3.3669574700109047e-07,
"logits/chosen": -0.9105051159858704,
"logits/rejected": -1.0369561910629272,
"logps/chosen": -57.91829299926758,
"logps/rejected": -85.75810241699219,
"loss": 0.1725,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.35346147418022156,
"rewards/margins": 2.3853201866149902,
"rewards/rejected": -2.0318589210510254,
"step": 750
},
{
"epoch": 0.38311279143037175,
"grad_norm": 24.375,
"learning_rate": 3.339694656488549e-07,
"logits/chosen": -0.8942914009094238,
"logits/rejected": -1.0089881420135498,
"logps/chosen": -60.97709274291992,
"logps/rejected": -84.78167724609375,
"loss": 0.2952,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.10825882852077484,
"rewards/margins": 2.0440988540649414,
"rewards/rejected": -1.9358398914337158,
"step": 760
},
{
"epoch": 0.38815374921235035,
"grad_norm": 21.875,
"learning_rate": 3.312431842966194e-07,
"logits/chosen": -0.8745867609977722,
"logits/rejected": -0.9834270477294922,
"logps/chosen": -60.55878829956055,
"logps/rejected": -84.3352279663086,
"loss": 0.2894,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.13102281093597412,
"rewards/margins": 2.0422165393829346,
"rewards/rejected": -1.91119384765625,
"step": 770
},
{
"epoch": 0.3931947069943289,
"grad_norm": 25.125,
"learning_rate": 3.2851690294438384e-07,
"logits/chosen": -0.8937376141548157,
"logits/rejected": -0.9868464469909668,
"logps/chosen": -59.818359375,
"logps/rejected": -85.10429382324219,
"loss": 0.2596,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.20219922065734863,
"rewards/margins": 2.1438941955566406,
"rewards/rejected": -1.9416948556900024,
"step": 780
},
{
"epoch": 0.3982356647763075,
"grad_norm": 32.25,
"learning_rate": 3.2579062159214833e-07,
"logits/chosen": -0.92058265209198,
"logits/rejected": -1.0123975276947021,
"logps/chosen": -60.107933044433594,
"logps/rejected": -84.3521957397461,
"loss": 0.294,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.1681869924068451,
"rewards/margins": 2.0432465076446533,
"rewards/rejected": -1.8750594854354858,
"step": 790
},
{
"epoch": 0.40327662255828606,
"grad_norm": 15.0625,
"learning_rate": 3.2306434023991277e-07,
"logits/chosen": -0.896182656288147,
"logits/rejected": -0.9941380620002747,
"logps/chosen": -61.2158203125,
"logps/rejected": -82.98789978027344,
"loss": 0.3677,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.1341857612133026,
"rewards/margins": 1.88623046875,
"rewards/rejected": -1.752044677734375,
"step": 800
},
{
"epoch": 0.40327662255828606,
"eval_logits/chosen": -0.911089301109314,
"eval_logits/rejected": -0.9992658495903015,
"eval_logps/chosen": -60.45991897583008,
"eval_logps/rejected": -83.99901580810547,
"eval_loss": 0.2951066493988037,
"eval_rewards/accuracies": 0.8886227607727051,
"eval_rewards/chosen": 0.1546352505683899,
"eval_rewards/margins": 2.0157766342163086,
"eval_rewards/rejected": -1.861141324043274,
"eval_runtime": 71.0897,
"eval_samples_per_second": 23.491,
"eval_steps_per_second": 23.491,
"step": 800
},
{
"epoch": 0.40831758034026466,
"grad_norm": 16.625,
"learning_rate": 3.2033805888767715e-07,
"logits/chosen": -0.9287660717964172,
"logits/rejected": -1.0095508098602295,
"logps/chosen": -60.53706741333008,
"logps/rejected": -83.99447631835938,
"loss": 0.2838,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.1390010416507721,
"rewards/margins": 1.9923112392425537,
"rewards/rejected": -1.8533103466033936,
"step": 810
},
{
"epoch": 0.4133585381222432,
"grad_norm": 33.0,
"learning_rate": 3.1761177753544164e-07,
"logits/chosen": -0.9320980310440063,
"logits/rejected": -1.0176376104354858,
"logps/chosen": -59.87189483642578,
"logps/rejected": -84.4596176147461,
"loss": 0.2483,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.1814599335193634,
"rewards/margins": 2.1097805500030518,
"rewards/rejected": -1.9283206462860107,
"step": 820
},
{
"epoch": 0.4183994959042218,
"grad_norm": 12.9375,
"learning_rate": 3.148854961832061e-07,
"logits/chosen": -0.9213443994522095,
"logits/rejected": -1.0197076797485352,
"logps/chosen": -59.96300506591797,
"logps/rejected": -84.28089141845703,
"loss": 0.2707,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.1989632546901703,
"rewards/margins": 2.1028213500976562,
"rewards/rejected": -1.9038581848144531,
"step": 830
},
{
"epoch": 0.42344045368620037,
"grad_norm": 5.90625,
"learning_rate": 3.121592148309705e-07,
"logits/chosen": -0.914442241191864,
"logits/rejected": -0.9979844093322754,
"logps/chosen": -59.945274353027344,
"logps/rejected": -83.28790283203125,
"loss": 0.2793,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.17169013619422913,
"rewards/margins": 2.031377077102661,
"rewards/rejected": -1.8596868515014648,
"step": 840
},
{
"epoch": 0.428481411468179,
"grad_norm": 13.4375,
"learning_rate": 3.09432933478735e-07,
"logits/chosen": -0.9071733355522156,
"logits/rejected": -1.0141820907592773,
"logps/chosen": -60.1943244934082,
"logps/rejected": -84.26617431640625,
"loss": 0.2751,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.1853206306695938,
"rewards/margins": 2.0591628551483154,
"rewards/rejected": -1.8738422393798828,
"step": 850
},
{
"epoch": 0.4335223692501575,
"grad_norm": 12.875,
"learning_rate": 3.0670665212649945e-07,
"logits/chosen": -0.9004698991775513,
"logits/rejected": -0.9726663827896118,
"logps/chosen": -60.701194763183594,
"logps/rejected": -83.21146392822266,
"loss": 0.3464,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.12978795170783997,
"rewards/margins": 1.9474796056747437,
"rewards/rejected": -1.8176918029785156,
"step": 860
},
{
"epoch": 0.43856332703213613,
"grad_norm": 13.125,
"learning_rate": 3.0398037077426394e-07,
"logits/chosen": -0.8763822317123413,
"logits/rejected": -0.9599603414535522,
"logps/chosen": -60.947471618652344,
"logps/rejected": -84.69038391113281,
"loss": 0.3711,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.12584984302520752,
"rewards/margins": 2.0357413291931152,
"rewards/rejected": -1.9098914861679077,
"step": 870
},
{
"epoch": 0.4436042848141147,
"grad_norm": 12.0,
"learning_rate": 3.0125408942202833e-07,
"logits/chosen": -0.9101651906967163,
"logits/rejected": -1.014981985092163,
"logps/chosen": -59.863014221191406,
"logps/rejected": -83.40458679199219,
"loss": 0.3145,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.2130097895860672,
"rewards/margins": 2.023256301879883,
"rewards/rejected": -1.810246467590332,
"step": 880
},
{
"epoch": 0.44864524259609323,
"grad_norm": 8.5,
"learning_rate": 2.9852780806979277e-07,
"logits/chosen": -0.8842908143997192,
"logits/rejected": -1.0119690895080566,
"logps/chosen": -58.0797233581543,
"logps/rejected": -84.85250091552734,
"loss": 0.1971,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.34588780999183655,
"rewards/margins": 2.315692186355591,
"rewards/rejected": -1.9698044061660767,
"step": 890
},
{
"epoch": 0.45368620037807184,
"grad_norm": 21.125,
"learning_rate": 2.9580152671755726e-07,
"logits/chosen": -0.861519455909729,
"logits/rejected": -0.9699891805648804,
"logps/chosen": -58.99384689331055,
"logps/rejected": -84.5435791015625,
"loss": 0.2497,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.2806755006313324,
"rewards/margins": 2.2056772708892822,
"rewards/rejected": -1.925001859664917,
"step": 900
},
{
"epoch": 0.45368620037807184,
"eval_logits/chosen": -0.9106476902961731,
"eval_logits/rejected": -1.0023449659347534,
"eval_logps/chosen": -60.15101623535156,
"eval_logps/rejected": -84.17161560058594,
"eval_loss": 0.29339829087257385,
"eval_rewards/accuracies": 0.886227548122406,
"eval_rewards/chosen": 0.185525581240654,
"eval_rewards/margins": 2.0639255046844482,
"eval_rewards/rejected": -1.8783999681472778,
"eval_runtime": 70.7305,
"eval_samples_per_second": 23.611,
"eval_steps_per_second": 23.611,
"step": 900
},
{
"epoch": 0.4587271581600504,
"grad_norm": 21.5,
"learning_rate": 2.930752453653217e-07,
"logits/chosen": -0.8937481641769409,
"logits/rejected": -0.9957500696182251,
"logps/chosen": -59.79345703125,
"logps/rejected": -83.85224914550781,
"loss": 0.3282,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.231471985578537,
"rewards/margins": 2.0565638542175293,
"rewards/rejected": -1.8250917196273804,
"step": 910
},
{
"epoch": 0.463768115942029,
"grad_norm": 18.75,
"learning_rate": 2.9034896401308613e-07,
"logits/chosen": -0.8847681283950806,
"logits/rejected": -0.9873048663139343,
"logps/chosen": -59.479347229003906,
"logps/rejected": -86.35008239746094,
"loss": 0.2292,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.2320963442325592,
"rewards/margins": 2.310370683670044,
"rewards/rejected": -2.0782742500305176,
"step": 920
},
{
"epoch": 0.46880907372400754,
"grad_norm": 22.25,
"learning_rate": 2.876226826608506e-07,
"logits/chosen": -0.915035605430603,
"logits/rejected": -1.0067721605300903,
"logps/chosen": -59.74500274658203,
"logps/rejected": -82.90184020996094,
"loss": 0.3143,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.18378940224647522,
"rewards/margins": 1.998040795326233,
"rewards/rejected": -1.814251184463501,
"step": 930
},
{
"epoch": 0.47385003150598615,
"grad_norm": 33.75,
"learning_rate": 2.84896401308615e-07,
"logits/chosen": -0.9178056716918945,
"logits/rejected": -1.0208479166030884,
"logps/chosen": -59.2294807434082,
"logps/rejected": -84.78729248046875,
"loss": 0.2393,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.25158706307411194,
"rewards/margins": 2.1938788890838623,
"rewards/rejected": -1.9422919750213623,
"step": 940
},
{
"epoch": 0.4788909892879647,
"grad_norm": 34.75,
"learning_rate": 2.8217011995637945e-07,
"logits/chosen": -0.8805239796638489,
"logits/rejected": -0.9753271341323853,
"logps/chosen": -59.95941162109375,
"logps/rejected": -84.86714172363281,
"loss": 0.2666,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.19955232739448547,
"rewards/margins": 2.1361844539642334,
"rewards/rejected": -1.936632513999939,
"step": 950
},
{
"epoch": 0.4839319470699433,
"grad_norm": 23.5,
"learning_rate": 2.7944383860414394e-07,
"logits/chosen": -0.8861673474311829,
"logits/rejected": -0.9975956082344055,
"logps/chosen": -59.12894821166992,
"logps/rejected": -85.26177978515625,
"loss": 0.2303,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.2717575132846832,
"rewards/margins": 2.239091396331787,
"rewards/rejected": -1.9673335552215576,
"step": 960
},
{
"epoch": 0.48897290485192185,
"grad_norm": 19.625,
"learning_rate": 2.767175572519084e-07,
"logits/chosen": -0.9173597097396851,
"logits/rejected": -1.0085917711257935,
"logps/chosen": -59.86204147338867,
"logps/rejected": -84.22465515136719,
"loss": 0.2877,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.20364825427532196,
"rewards/margins": 2.122812509536743,
"rewards/rejected": -1.9191640615463257,
"step": 970
},
{
"epoch": 0.49401386263390046,
"grad_norm": 20.125,
"learning_rate": 2.7399127589967287e-07,
"logits/chosen": -0.8748540878295898,
"logits/rejected": -0.9848943948745728,
"logps/chosen": -58.594139099121094,
"logps/rejected": -84.79752349853516,
"loss": 0.2749,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.31691378355026245,
"rewards/margins": 2.249189615249634,
"rewards/rejected": -1.9322757720947266,
"step": 980
},
{
"epoch": 0.499054820415879,
"grad_norm": 11.5625,
"learning_rate": 2.712649945474373e-07,
"logits/chosen": -0.8857355117797852,
"logits/rejected": -0.9768050909042358,
"logps/chosen": -59.4735107421875,
"logps/rejected": -84.06812286376953,
"loss": 0.2732,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.2545910179615021,
"rewards/margins": 2.087873935699463,
"rewards/rejected": -1.8332828283309937,
"step": 990
},
{
"epoch": 0.5040957781978576,
"grad_norm": 35.0,
"learning_rate": 2.685387131952017e-07,
"logits/chosen": -0.8867548108100891,
"logits/rejected": -1.0059692859649658,
"logps/chosen": -59.520362854003906,
"logps/rejected": -84.2868881225586,
"loss": 0.282,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.25644558668136597,
"rewards/margins": 2.160256862640381,
"rewards/rejected": -1.9038112163543701,
"step": 1000
},
{
"epoch": 0.5040957781978576,
"eval_logits/chosen": -0.9006710052490234,
"eval_logits/rejected": -0.9953308701515198,
"eval_logps/chosen": -59.89181900024414,
"eval_logps/rejected": -84.33870697021484,
"eval_loss": 0.29226598143577576,
"eval_rewards/accuracies": 0.8874251246452332,
"eval_rewards/chosen": 0.21144606173038483,
"eval_rewards/margins": 2.106555700302124,
"eval_rewards/rejected": -1.8951095342636108,
"eval_runtime": 71.1587,
"eval_samples_per_second": 23.469,
"eval_steps_per_second": 23.469,
"step": 1000
},
{
"epoch": 0.5091367359798362,
"grad_norm": 13.375,
"learning_rate": 2.658124318429662e-07,
"logits/chosen": -0.8491979837417603,
"logits/rejected": -0.9555536508560181,
"logps/chosen": -59.57502365112305,
"logps/rejected": -83.54940795898438,
"loss": 0.3238,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.27595460414886475,
"rewards/margins": 2.066524028778076,
"rewards/rejected": -1.7905696630477905,
"step": 1010
},
{
"epoch": 0.5141776937618148,
"grad_norm": 14.375,
"learning_rate": 2.630861504907306e-07,
"logits/chosen": -0.871178150177002,
"logits/rejected": -0.9892231225967407,
"logps/chosen": -59.257232666015625,
"logps/rejected": -86.75993347167969,
"loss": 0.1845,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 0.26664021611213684,
"rewards/margins": 2.3842711448669434,
"rewards/rejected": -2.11763072013855,
"step": 1020
},
{
"epoch": 0.5192186515437933,
"grad_norm": 8.1875,
"learning_rate": 2.6035986913849506e-07,
"logits/chosen": -0.9025937914848328,
"logits/rejected": -0.9637918472290039,
"logps/chosen": -59.708038330078125,
"logps/rejected": -83.67890930175781,
"loss": 0.3105,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.22001886367797852,
"rewards/margins": 2.053025722503662,
"rewards/rejected": -1.8330070972442627,
"step": 1030
},
{
"epoch": 0.5242596093257719,
"grad_norm": 27.625,
"learning_rate": 2.5763358778625955e-07,
"logits/chosen": -0.8830963373184204,
"logits/rejected": -0.9901777505874634,
"logps/chosen": -58.61798095703125,
"logps/rejected": -84.18465423583984,
"loss": 0.2793,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.33424392342567444,
"rewards/margins": 2.2203192710876465,
"rewards/rejected": -1.8860752582550049,
"step": 1040
},
{
"epoch": 0.5293005671077504,
"grad_norm": 11.875,
"learning_rate": 2.54907306434024e-07,
"logits/chosen": -0.8822509050369263,
"logits/rejected": -1.008209466934204,
"logps/chosen": -60.164794921875,
"logps/rejected": -84.7709732055664,
"loss": 0.2963,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.22792068123817444,
"rewards/margins": 2.1442337036132812,
"rewards/rejected": -1.91631281375885,
"step": 1050
},
{
"epoch": 0.5343415248897291,
"grad_norm": 7.46875,
"learning_rate": 2.521810250817885e-07,
"logits/chosen": -0.8839899897575378,
"logits/rejected": -1.0129756927490234,
"logps/chosen": -59.81916046142578,
"logps/rejected": -85.43331909179688,
"loss": 0.2275,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.21425476670265198,
"rewards/margins": 2.2582240104675293,
"rewards/rejected": -2.04396915435791,
"step": 1060
},
{
"epoch": 0.5393824826717076,
"grad_norm": 15.5625,
"learning_rate": 2.4945474372955287e-07,
"logits/chosen": -0.82720947265625,
"logits/rejected": -0.907505989074707,
"logps/chosen": -59.22252655029297,
"logps/rejected": -83.96528625488281,
"loss": 0.2807,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.2757384181022644,
"rewards/margins": 2.1067721843719482,
"rewards/rejected": -1.8310333490371704,
"step": 1070
},
{
"epoch": 0.5444234404536862,
"grad_norm": 26.75,
"learning_rate": 2.467284623773173e-07,
"logits/chosen": -0.8436921834945679,
"logits/rejected": -0.942459225654602,
"logps/chosen": -59.066001892089844,
"logps/rejected": -84.55767059326172,
"loss": 0.2846,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.2789579927921295,
"rewards/margins": 2.1750502586364746,
"rewards/rejected": -1.896092176437378,
"step": 1080
},
{
"epoch": 0.5494643982356647,
"grad_norm": 26.75,
"learning_rate": 2.440021810250818e-07,
"logits/chosen": -0.9082972407341003,
"logits/rejected": -1.0078270435333252,
"logps/chosen": -58.6378059387207,
"logps/rejected": -86.2591323852539,
"loss": 0.224,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.31546300649642944,
"rewards/margins": 2.3998687267303467,
"rewards/rejected": -2.0844056606292725,
"step": 1090
},
{
"epoch": 0.5545053560176434,
"grad_norm": 21.75,
"learning_rate": 2.4127589967284623e-07,
"logits/chosen": -0.8725331425666809,
"logits/rejected": -0.9712456464767456,
"logps/chosen": -58.98591995239258,
"logps/rejected": -84.76383972167969,
"loss": 0.2557,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.29521211981773376,
"rewards/margins": 2.201046943664551,
"rewards/rejected": -1.9058347940444946,
"step": 1100
},
{
"epoch": 0.5545053560176434,
"eval_logits/chosen": -0.9060633182525635,
"eval_logits/rejected": -1.0051296949386597,
"eval_logps/chosen": -59.636837005615234,
"eval_logps/rejected": -84.44316864013672,
"eval_loss": 0.2921236455440521,
"eval_rewards/accuracies": 0.8874251246452332,
"eval_rewards/chosen": 0.23694448173046112,
"eval_rewards/margins": 2.142500638961792,
"eval_rewards/rejected": -1.9055562019348145,
"eval_runtime": 72.2931,
"eval_samples_per_second": 23.1,
"eval_steps_per_second": 23.1,
"step": 1100
},
{
"epoch": 0.5595463137996219,
"grad_norm": 19.25,
"learning_rate": 2.3854961832061067e-07,
"logits/chosen": -0.8947960734367371,
"logits/rejected": -1.0010361671447754,
"logps/chosen": -58.5375862121582,
"logps/rejected": -84.85458374023438,
"loss": 0.2887,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.3221047520637512,
"rewards/margins": 2.255513906478882,
"rewards/rejected": -1.9334090948104858,
"step": 1110
},
{
"epoch": 0.5645872715816005,
"grad_norm": 12.5625,
"learning_rate": 2.358233369683751e-07,
"logits/chosen": -0.8644863367080688,
"logits/rejected": -0.971636950969696,
"logps/chosen": -58.843421936035156,
"logps/rejected": -84.93423461914062,
"loss": 0.2627,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.3323742747306824,
"rewards/margins": 2.243507146835327,
"rewards/rejected": -1.9111328125,
"step": 1120
},
{
"epoch": 0.569628229363579,
"grad_norm": 16.25,
"learning_rate": 2.3309705561613957e-07,
"logits/chosen": -0.8721977472305298,
"logits/rejected": -0.9687450528144836,
"logps/chosen": -59.44976806640625,
"logps/rejected": -82.7881851196289,
"loss": 0.3487,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.25436902046203613,
"rewards/margins": 2.020705461502075,
"rewards/rejected": -1.766336441040039,
"step": 1130
},
{
"epoch": 0.5746691871455577,
"grad_norm": 22.25,
"learning_rate": 2.3037077426390404e-07,
"logits/chosen": -0.889614462852478,
"logits/rejected": -1.0026637315750122,
"logps/chosen": -58.558433532714844,
"logps/rejected": -84.0979232788086,
"loss": 0.2587,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.33784663677215576,
"rewards/margins": 2.1925177574157715,
"rewards/rejected": -1.8546711206436157,
"step": 1140
},
{
"epoch": 0.5797101449275363,
"grad_norm": 27.5,
"learning_rate": 2.2764449291166848e-07,
"logits/chosen": -0.9427415132522583,
"logits/rejected": -1.0442800521850586,
"logps/chosen": -58.565757751464844,
"logps/rejected": -84.64384460449219,
"loss": 0.2617,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.3408879339694977,
"rewards/margins": 2.2868194580078125,
"rewards/rejected": -1.9459314346313477,
"step": 1150
},
{
"epoch": 0.5847511027095148,
"grad_norm": 20.625,
"learning_rate": 2.2491821155943292e-07,
"logits/chosen": -0.8788965344429016,
"logits/rejected": -0.9990353584289551,
"logps/chosen": -58.735572814941406,
"logps/rejected": -84.48644256591797,
"loss": 0.2425,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.3037385046482086,
"rewards/margins": 2.239537477493286,
"rewards/rejected": -1.9357990026474,
"step": 1160
},
{
"epoch": 0.5897920604914934,
"grad_norm": 20.0,
"learning_rate": 2.2219193020719738e-07,
"logits/chosen": -0.874742865562439,
"logits/rejected": -0.980475902557373,
"logps/chosen": -58.799041748046875,
"logps/rejected": -85.21891784667969,
"loss": 0.2426,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.3289235234260559,
"rewards/margins": 2.2946279048919678,
"rewards/rejected": -1.965704321861267,
"step": 1170
},
{
"epoch": 0.594833018273472,
"grad_norm": 29.5,
"learning_rate": 2.1946564885496182e-07,
"logits/chosen": -0.875636875629425,
"logits/rejected": -0.9794307947158813,
"logps/chosen": -59.88359451293945,
"logps/rejected": -84.40530395507812,
"loss": 0.2653,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.22515201568603516,
"rewards/margins": 2.1366682052612305,
"rewards/rejected": -1.9115161895751953,
"step": 1180
},
{
"epoch": 0.5998739760554506,
"grad_norm": 9.875,
"learning_rate": 2.1673936750272628e-07,
"logits/chosen": -0.8744710683822632,
"logits/rejected": -0.9897273182868958,
"logps/chosen": -57.4598388671875,
"logps/rejected": -86.65351867675781,
"loss": 0.1861,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.42013850808143616,
"rewards/margins": 2.5188748836517334,
"rewards/rejected": -2.098736524581909,
"step": 1190
},
{
"epoch": 0.6049149338374291,
"grad_norm": 11.1875,
"learning_rate": 2.1401308615049072e-07,
"logits/chosen": -0.8216146230697632,
"logits/rejected": -0.9422334432601929,
"logps/chosen": -58.2786979675293,
"logps/rejected": -86.90044403076172,
"loss": 0.2244,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.3587431311607361,
"rewards/margins": 2.450103282928467,
"rewards/rejected": -2.091360569000244,
"step": 1200
},
{
"epoch": 0.6049149338374291,
"eval_logits/chosen": -0.8834900856018066,
"eval_logits/rejected": -0.9822787642478943,
"eval_logps/chosen": -59.49919891357422,
"eval_logps/rejected": -84.61227416992188,
"eval_loss": 0.2913074493408203,
"eval_rewards/accuracies": 0.8892215490341187,
"eval_rewards/chosen": 0.2507072687149048,
"eval_rewards/margins": 2.1731746196746826,
"eval_rewards/rejected": -1.9224671125411987,
"eval_runtime": 72.5811,
"eval_samples_per_second": 23.009,
"eval_steps_per_second": 23.009,
"step": 1200
},
{
"epoch": 0.6099558916194077,
"grad_norm": 7.84375,
"learning_rate": 2.1128680479825516e-07,
"logits/chosen": -0.8642328977584839,
"logits/rejected": -0.9527546167373657,
"logps/chosen": -59.0882682800293,
"logps/rejected": -84.75911712646484,
"loss": 0.2518,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.28083863854408264,
"rewards/margins": 2.2171006202697754,
"rewards/rejected": -1.936261773109436,
"step": 1210
},
{
"epoch": 0.6149968494013862,
"grad_norm": 7.21875,
"learning_rate": 2.0856052344601963e-07,
"logits/chosen": -0.8976170420646667,
"logits/rejected": -0.9783880114555359,
"logps/chosen": -60.150733947753906,
"logps/rejected": -84.77664947509766,
"loss": 0.3227,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.18833044171333313,
"rewards/margins": 2.1451687812805176,
"rewards/rejected": -1.9568383693695068,
"step": 1220
},
{
"epoch": 0.6200378071833649,
"grad_norm": 16.875,
"learning_rate": 2.058342420937841e-07,
"logits/chosen": -0.8992708325386047,
"logits/rejected": -0.9903928637504578,
"logps/chosen": -58.91829299926758,
"logps/rejected": -84.00004577636719,
"loss": 0.3041,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.2912471294403076,
"rewards/margins": 2.173732280731201,
"rewards/rejected": -1.8824853897094727,
"step": 1230
},
{
"epoch": 0.6250787649653434,
"grad_norm": 17.125,
"learning_rate": 2.031079607415485e-07,
"logits/chosen": -0.9080449938774109,
"logits/rejected": -1.0125758647918701,
"logps/chosen": -59.39638137817383,
"logps/rejected": -84.40299224853516,
"loss": 0.322,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.2707884907722473,
"rewards/margins": 2.1304423809051514,
"rewards/rejected": -1.8596540689468384,
"step": 1240
},
{
"epoch": 0.630119722747322,
"grad_norm": 6.84375,
"learning_rate": 2.0038167938931297e-07,
"logits/chosen": -0.8706466555595398,
"logits/rejected": -0.9584230184555054,
"logps/chosen": -58.84540939331055,
"logps/rejected": -83.47660064697266,
"loss": 0.3036,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.30719193816185,
"rewards/margins": 2.1285505294799805,
"rewards/rejected": -1.8213586807250977,
"step": 1250
},
{
"epoch": 0.6351606805293005,
"grad_norm": 25.75,
"learning_rate": 1.9765539803707743e-07,
"logits/chosen": -0.8904057741165161,
"logits/rejected": -0.9874610900878906,
"logps/chosen": -59.1779899597168,
"logps/rejected": -84.42768859863281,
"loss": 0.3073,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.2561061382293701,
"rewards/margins": 2.1732170581817627,
"rewards/rejected": -1.917110800743103,
"step": 1260
},
{
"epoch": 0.6402016383112792,
"grad_norm": 16.75,
"learning_rate": 1.9492911668484184e-07,
"logits/chosen": -0.8631388545036316,
"logits/rejected": -0.9788234829902649,
"logps/chosen": -58.17034912109375,
"logps/rejected": -86.0931167602539,
"loss": 0.2073,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.3628115653991699,
"rewards/margins": 2.3852200508117676,
"rewards/rejected": -2.0224084854125977,
"step": 1270
},
{
"epoch": 0.6452425960932577,
"grad_norm": 7.78125,
"learning_rate": 1.922028353326063e-07,
"logits/chosen": -0.886549174785614,
"logits/rejected": -1.004765510559082,
"logps/chosen": -58.592750549316406,
"logps/rejected": -84.68618774414062,
"loss": 0.2295,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.322457492351532,
"rewards/margins": 2.2541778087615967,
"rewards/rejected": -1.9317200183868408,
"step": 1280
},
{
"epoch": 0.6502835538752363,
"grad_norm": 13.6875,
"learning_rate": 1.8947655398037077e-07,
"logits/chosen": -0.8659318685531616,
"logits/rejected": -0.9652425646781921,
"logps/chosen": -60.613441467285156,
"logps/rejected": -84.11669158935547,
"loss": 0.3235,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.18255123496055603,
"rewards/margins": 2.038862705230713,
"rewards/rejected": -1.8563114404678345,
"step": 1290
},
{
"epoch": 0.6553245116572148,
"grad_norm": 23.375,
"learning_rate": 1.8675027262813524e-07,
"logits/chosen": -0.8807634115219116,
"logits/rejected": -1.0138846635818481,
"logps/chosen": -58.674583435058594,
"logps/rejected": -85.5340576171875,
"loss": 0.2442,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.3219856321811676,
"rewards/margins": 2.348407030105591,
"rewards/rejected": -2.026421308517456,
"step": 1300
},
{
"epoch": 0.6553245116572148,
"eval_logits/chosen": -0.9014637470245361,
"eval_logits/rejected": -1.0028033256530762,
"eval_logps/chosen": -59.48044204711914,
"eval_logps/rejected": -84.73458099365234,
"eval_loss": 0.29055994749069214,
"eval_rewards/accuracies": 0.8856287598609924,
"eval_rewards/chosen": 0.25258320569992065,
"eval_rewards/margins": 2.1872806549072266,
"eval_rewards/rejected": -1.9346975088119507,
"eval_runtime": 70.3028,
"eval_samples_per_second": 23.754,
"eval_steps_per_second": 23.754,
"step": 1300
},
{
"epoch": 0.6603654694391935,
"grad_norm": 10.0625,
"learning_rate": 1.8402399127589965e-07,
"logits/chosen": -0.8660598993301392,
"logits/rejected": -0.965117335319519,
"logps/chosen": -58.19904708862305,
"logps/rejected": -85.5027084350586,
"loss": 0.2252,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.3830675482749939,
"rewards/margins": 2.3513522148132324,
"rewards/rejected": -1.9682846069335938,
"step": 1310
},
{
"epoch": 0.665406427221172,
"grad_norm": 19.75,
"learning_rate": 1.8129770992366411e-07,
"logits/chosen": -0.8987786173820496,
"logits/rejected": -0.9898836016654968,
"logps/chosen": -58.44793701171875,
"logps/rejected": -85.73442077636719,
"loss": 0.2653,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.3300415873527527,
"rewards/margins": 2.320068836212158,
"rewards/rejected": -1.9900271892547607,
"step": 1320
},
{
"epoch": 0.6704473850031506,
"grad_norm": 26.625,
"learning_rate": 1.7857142857142858e-07,
"logits/chosen": -0.8488761186599731,
"logits/rejected": -0.9688565135002136,
"logps/chosen": -58.56892776489258,
"logps/rejected": -84.93611907958984,
"loss": 0.2507,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.33988556265830994,
"rewards/margins": 2.270322799682617,
"rewards/rejected": -1.9304373264312744,
"step": 1330
},
{
"epoch": 0.6754883427851291,
"grad_norm": 14.25,
"learning_rate": 1.7584514721919302e-07,
"logits/chosen": -0.8936493992805481,
"logits/rejected": -0.9925310015678406,
"logps/chosen": -57.687774658203125,
"logps/rejected": -86.47679138183594,
"loss": 0.1893,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.372401624917984,
"rewards/margins": 2.4566822052001953,
"rewards/rejected": -2.084280490875244,
"step": 1340
},
{
"epoch": 0.6805293005671077,
"grad_norm": 12.6875,
"learning_rate": 1.7311886586695746e-07,
"logits/chosen": -0.8652948141098022,
"logits/rejected": -0.9912670254707336,
"logps/chosen": -59.039756774902344,
"logps/rejected": -84.45045471191406,
"loss": 0.2659,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.2832827866077423,
"rewards/margins": 2.2053096294403076,
"rewards/rejected": -1.9220268726348877,
"step": 1350
},
{
"epoch": 0.6855702583490864,
"grad_norm": 8.9375,
"learning_rate": 1.7039258451472192e-07,
"logits/chosen": -0.830104649066925,
"logits/rejected": -0.9250701069831848,
"logps/chosen": -59.46904754638672,
"logps/rejected": -84.41246032714844,
"loss": 0.292,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.28448396921157837,
"rewards/margins": 2.1490626335144043,
"rewards/rejected": -1.8645786046981812,
"step": 1360
},
{
"epoch": 0.6906112161310649,
"grad_norm": 16.25,
"learning_rate": 1.6766630316248636e-07,
"logits/chosen": -0.9015452265739441,
"logits/rejected": -0.9891805648803711,
"logps/chosen": -59.392555236816406,
"logps/rejected": -84.06771087646484,
"loss": 0.31,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.22625617682933807,
"rewards/margins": 2.116712808609009,
"rewards/rejected": -1.8904565572738647,
"step": 1370
},
{
"epoch": 0.6956521739130435,
"grad_norm": 27.25,
"learning_rate": 1.6494002181025082e-07,
"logits/chosen": -0.8577421307563782,
"logits/rejected": -0.9831596612930298,
"logps/chosen": -59.06340408325195,
"logps/rejected": -84.84831237792969,
"loss": 0.2915,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.3013680577278137,
"rewards/margins": 2.284956455230713,
"rewards/rejected": -1.983588457107544,
"step": 1380
},
{
"epoch": 0.700693131695022,
"grad_norm": 28.375,
"learning_rate": 1.6221374045801526e-07,
"logits/chosen": -0.8520832061767578,
"logits/rejected": -0.9697486162185669,
"logps/chosen": -59.848876953125,
"logps/rejected": -85.40953063964844,
"loss": 0.3043,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.2575295567512512,
"rewards/margins": 2.2461721897125244,
"rewards/rejected": -1.9886424541473389,
"step": 1390
},
{
"epoch": 0.7057340894770007,
"grad_norm": 14.1875,
"learning_rate": 1.594874591057797e-07,
"logits/chosen": -0.8636363744735718,
"logits/rejected": -0.9646850824356079,
"logps/chosen": -59.90193557739258,
"logps/rejected": -84.7515869140625,
"loss": 0.2823,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.22327394783496857,
"rewards/margins": 2.1452901363372803,
"rewards/rejected": -1.9220161437988281,
"step": 1400
},
{
"epoch": 0.7057340894770007,
"eval_logits/chosen": -0.8801774382591248,
"eval_logits/rejected": -0.9814472794532776,
"eval_logps/chosen": -59.3859748840332,
"eval_logps/rejected": -84.68141174316406,
"eval_loss": 0.29091569781303406,
"eval_rewards/accuracies": 0.8892215490341187,
"eval_rewards/chosen": 0.2620302140712738,
"eval_rewards/margins": 2.191411256790161,
"eval_rewards/rejected": -1.9293811321258545,
"eval_runtime": 70.8997,
"eval_samples_per_second": 23.554,
"eval_steps_per_second": 23.554,
"step": 1400
},
{
"epoch": 0.7107750472589792,
"grad_norm": 15.9375,
"learning_rate": 1.5676117775354416e-07,
"logits/chosen": -0.9009099006652832,
"logits/rejected": -0.9855210185050964,
"logps/chosen": -59.35606002807617,
"logps/rejected": -83.26178741455078,
"loss": 0.291,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.2705304026603699,
"rewards/margins": 2.0694515705108643,
"rewards/rejected": -1.7989212274551392,
"step": 1410
},
{
"epoch": 0.7158160050409578,
"grad_norm": 16.375,
"learning_rate": 1.540348964013086e-07,
"logits/chosen": -0.8992331624031067,
"logits/rejected": -0.9897601008415222,
"logps/chosen": -58.76881790161133,
"logps/rejected": -85.6570816040039,
"loss": 0.2559,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.3151175379753113,
"rewards/margins": 2.322927951812744,
"rewards/rejected": -2.007810354232788,
"step": 1420
},
{
"epoch": 0.7208569628229363,
"grad_norm": 20.5,
"learning_rate": 1.5130861504907304e-07,
"logits/chosen": -0.8569045066833496,
"logits/rejected": -0.9533321261405945,
"logps/chosen": -58.429908752441406,
"logps/rejected": -84.6368179321289,
"loss": 0.2602,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.35319021344184875,
"rewards/margins": 2.2607696056365967,
"rewards/rejected": -1.9075794219970703,
"step": 1430
},
{
"epoch": 0.725897920604915,
"grad_norm": 18.5,
"learning_rate": 1.485823336968375e-07,
"logits/chosen": -0.8877646327018738,
"logits/rejected": -0.9839455485343933,
"logps/chosen": -59.098785400390625,
"logps/rejected": -84.78980255126953,
"loss": 0.2648,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.26629897952079773,
"rewards/margins": 2.2212090492248535,
"rewards/rejected": -1.9549100399017334,
"step": 1440
},
{
"epoch": 0.7309388783868935,
"grad_norm": 21.75,
"learning_rate": 1.4585605234460197e-07,
"logits/chosen": -0.8866798281669617,
"logits/rejected": -0.9940204620361328,
"logps/chosen": -58.56513214111328,
"logps/rejected": -85.08773040771484,
"loss": 0.2643,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.32275161147117615,
"rewards/margins": 2.3079631328582764,
"rewards/rejected": -1.9852116107940674,
"step": 1450
},
{
"epoch": 0.7359798361688721,
"grad_norm": 39.25,
"learning_rate": 1.4312977099236638e-07,
"logits/chosen": -0.9192501306533813,
"logits/rejected": -1.0168657302856445,
"logps/chosen": -59.260459899902344,
"logps/rejected": -83.24293518066406,
"loss": 0.3559,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.2648276388645172,
"rewards/margins": 2.0748324394226074,
"rewards/rejected": -1.8100048303604126,
"step": 1460
},
{
"epoch": 0.7410207939508506,
"grad_norm": 15.6875,
"learning_rate": 1.4040348964013085e-07,
"logits/chosen": -0.8752067685127258,
"logits/rejected": -0.991533100605011,
"logps/chosen": -58.654205322265625,
"logps/rejected": -85.35607147216797,
"loss": 0.3037,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.31618744134902954,
"rewards/margins": 2.2922956943511963,
"rewards/rejected": -1.976108193397522,
"step": 1470
},
{
"epoch": 0.7460617517328293,
"grad_norm": 16.75,
"learning_rate": 1.376772082878953e-07,
"logits/chosen": -0.8595184087753296,
"logits/rejected": -0.9737972021102905,
"logps/chosen": -58.14202880859375,
"logps/rejected": -84.88631439208984,
"loss": 0.2482,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.375678151845932,
"rewards/margins": 2.3022727966308594,
"rewards/rejected": -1.9265944957733154,
"step": 1480
},
{
"epoch": 0.7511027095148078,
"grad_norm": 22.5,
"learning_rate": 1.3495092693565978e-07,
"logits/chosen": -0.8643430471420288,
"logits/rejected": -0.9914228320121765,
"logps/chosen": -57.81464767456055,
"logps/rejected": -85.24693298339844,
"loss": 0.2338,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.395480215549469,
"rewards/margins": 2.3924388885498047,
"rewards/rejected": -1.9969587326049805,
"step": 1490
},
{
"epoch": 0.7561436672967864,
"grad_norm": 17.5,
"learning_rate": 1.322246455834242e-07,
"logits/chosen": -0.8798907995223999,
"logits/rejected": -0.9934282302856445,
"logps/chosen": -58.668060302734375,
"logps/rejected": -85.68354034423828,
"loss": 0.2743,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.3188169598579407,
"rewards/margins": 2.338467836380005,
"rewards/rejected": -2.019650936126709,
"step": 1500
},
{
"epoch": 0.7561436672967864,
"eval_logits/chosen": -0.8938608169555664,
"eval_logits/rejected": -0.9942108988761902,
"eval_logps/chosen": -59.54497528076172,
"eval_logps/rejected": -84.85215759277344,
"eval_loss": 0.29076066613197327,
"eval_rewards/accuracies": 0.8868263363838196,
"eval_rewards/chosen": 0.24613051116466522,
"eval_rewards/margins": 2.1925852298736572,
"eval_rewards/rejected": -1.946454644203186,
"eval_runtime": 72.3546,
"eval_samples_per_second": 23.081,
"eval_steps_per_second": 23.081,
"step": 1500
},
{
"epoch": 0.7611846250787649,
"grad_norm": 18.875,
"learning_rate": 1.2949836423118865e-07,
"logits/chosen": -0.8818739652633667,
"logits/rejected": -1.0159088373184204,
"logps/chosen": -57.401039123535156,
"logps/rejected": -87.64556121826172,
"loss": 0.1741,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.4094136357307434,
"rewards/margins": 2.643099784851074,
"rewards/rejected": -2.2336859703063965,
"step": 1510
},
{
"epoch": 0.7662255828607435,
"grad_norm": 20.625,
"learning_rate": 1.2677208287895312e-07,
"logits/chosen": -0.8670439720153809,
"logits/rejected": -0.9854429364204407,
"logps/chosen": -58.494041442871094,
"logps/rejected": -85.44778442382812,
"loss": 0.2207,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.35261982679367065,
"rewards/margins": 2.3209452629089355,
"rewards/rejected": -1.9683253765106201,
"step": 1520
},
{
"epoch": 0.7712665406427222,
"grad_norm": 21.25,
"learning_rate": 1.2404580152671756e-07,
"logits/chosen": -0.8820363879203796,
"logits/rejected": -1.001293420791626,
"logps/chosen": -59.6056022644043,
"logps/rejected": -85.07635498046875,
"loss": 0.3197,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.25649771094322205,
"rewards/margins": 2.231887102127075,
"rewards/rejected": -1.9753892421722412,
"step": 1530
},
{
"epoch": 0.7763074984247007,
"grad_norm": 25.25,
"learning_rate": 1.21319520174482e-07,
"logits/chosen": -0.9083970785140991,
"logits/rejected": -1.029203176498413,
"logps/chosen": -58.7678108215332,
"logps/rejected": -84.92195129394531,
"loss": 0.2812,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.2764410376548767,
"rewards/margins": 2.2596542835235596,
"rewards/rejected": -1.983213186264038,
"step": 1540
},
{
"epoch": 0.7813484562066793,
"grad_norm": 35.5,
"learning_rate": 1.1859323882224645e-07,
"logits/chosen": -0.848700225353241,
"logits/rejected": -0.9551340341567993,
"logps/chosen": -59.077735900878906,
"logps/rejected": -86.9412612915039,
"loss": 0.2465,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.3101661801338196,
"rewards/margins": 2.430529832839966,
"rewards/rejected": -2.120363712310791,
"step": 1550
},
{
"epoch": 0.7863894139886578,
"grad_norm": 26.75,
"learning_rate": 1.1586695747001091e-07,
"logits/chosen": -0.8691787719726562,
"logits/rejected": -0.977543830871582,
"logps/chosen": -59.203521728515625,
"logps/rejected": -84.72015380859375,
"loss": 0.2758,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.28436484932899475,
"rewards/margins": 2.2119786739349365,
"rewards/rejected": -1.9276138544082642,
"step": 1560
},
{
"epoch": 0.7914303717706365,
"grad_norm": 14.0625,
"learning_rate": 1.1314067611777535e-07,
"logits/chosen": -0.8494786024093628,
"logits/rejected": -0.9657021760940552,
"logps/chosen": -58.790069580078125,
"logps/rejected": -85.69227600097656,
"loss": 0.2438,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.30834752321243286,
"rewards/margins": 2.33400297164917,
"rewards/rejected": -2.0256552696228027,
"step": 1570
},
{
"epoch": 0.796471329552615,
"grad_norm": 9.9375,
"learning_rate": 1.1041439476553979e-07,
"logits/chosen": -0.898410975933075,
"logits/rejected": -0.9827295541763306,
"logps/chosen": -59.0511360168457,
"logps/rejected": -84.7381591796875,
"loss": 0.3078,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.2760066092014313,
"rewards/margins": 2.209929943084717,
"rewards/rejected": -1.9339231252670288,
"step": 1580
},
{
"epoch": 0.8015122873345936,
"grad_norm": 11.375,
"learning_rate": 1.0768811341330425e-07,
"logits/chosen": -0.8804594874382019,
"logits/rejected": -1.0089373588562012,
"logps/chosen": -57.877601623535156,
"logps/rejected": -86.3949966430664,
"loss": 0.2237,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.39419177174568176,
"rewards/margins": 2.4698028564453125,
"rewards/rejected": -2.075611114501953,
"step": 1590
},
{
"epoch": 0.8065532451165721,
"grad_norm": 13.5625,
"learning_rate": 1.0496183206106869e-07,
"logits/chosen": -0.885684609413147,
"logits/rejected": -1.007638931274414,
"logps/chosen": -57.24601364135742,
"logps/rejected": -85.00395202636719,
"loss": 0.2164,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.43452557921409607,
"rewards/margins": 2.38159441947937,
"rewards/rejected": -1.9470688104629517,
"step": 1600
},
{
"epoch": 0.8065532451165721,
"eval_logits/chosen": -0.8853088617324829,
"eval_logits/rejected": -0.9853202700614929,
"eval_logps/chosen": -59.518455505371094,
"eval_logps/rejected": -84.9808349609375,
"eval_loss": 0.2906578481197357,
"eval_rewards/accuracies": 0.886227548122406,
"eval_rewards/chosen": 0.24878181517124176,
"eval_rewards/margins": 2.2081050872802734,
"eval_rewards/rejected": -1.959323525428772,
"eval_runtime": 72.4075,
"eval_samples_per_second": 23.064,
"eval_steps_per_second": 23.064,
"step": 1600
},
{
"epoch": 0.8115942028985508,
"grad_norm": 22.0,
"learning_rate": 1.0223555070883315e-07,
"logits/chosen": -0.8960712552070618,
"logits/rejected": -1.01849365234375,
"logps/chosen": -59.022682189941406,
"logps/rejected": -85.11215209960938,
"loss": 0.234,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.30705201625823975,
"rewards/margins": 2.2740187644958496,
"rewards/rejected": -1.966966986656189,
"step": 1610
},
{
"epoch": 0.8166351606805293,
"grad_norm": 15.3125,
"learning_rate": 9.950926935659759e-08,
"logits/chosen": -0.8351577520370483,
"logits/rejected": -0.950478196144104,
"logps/chosen": -61.142723083496094,
"logps/rejected": -83.6924819946289,
"loss": 0.3591,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.1616726666688919,
"rewards/margins": 1.9606307744979858,
"rewards/rejected": -1.7989578247070312,
"step": 1620
},
{
"epoch": 0.8216761184625079,
"grad_norm": 20.125,
"learning_rate": 9.678298800436204e-08,
"logits/chosen": -0.8941848874092102,
"logits/rejected": -0.9981807470321655,
"logps/chosen": -59.347694396972656,
"logps/rejected": -83.65950012207031,
"loss": 0.3604,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.27368858456611633,
"rewards/margins": 2.1373450756073,
"rewards/rejected": -1.8636566400527954,
"step": 1630
},
{
"epoch": 0.8267170762444864,
"grad_norm": 22.0,
"learning_rate": 9.40567066521265e-08,
"logits/chosen": -0.8934575915336609,
"logits/rejected": -1.0223013162612915,
"logps/chosen": -58.4257698059082,
"logps/rejected": -85.53926086425781,
"loss": 0.2539,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.34174391627311707,
"rewards/margins": 2.34285569190979,
"rewards/rejected": -2.0011115074157715,
"step": 1640
},
{
"epoch": 0.831758034026465,
"grad_norm": 36.0,
"learning_rate": 9.133042529989095e-08,
"logits/chosen": -0.834098219871521,
"logits/rejected": -0.95869380235672,
"logps/chosen": -60.06525802612305,
"logps/rejected": -84.23382568359375,
"loss": 0.3107,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.22542810440063477,
"rewards/margins": 2.11167573928833,
"rewards/rejected": -1.8862476348876953,
"step": 1650
},
{
"epoch": 0.8367989918084436,
"grad_norm": 13.375,
"learning_rate": 8.860414394765539e-08,
"logits/chosen": -0.8735988736152649,
"logits/rejected": -0.9876992106437683,
"logps/chosen": -58.84014892578125,
"logps/rejected": -85.5834732055664,
"loss": 0.2594,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.31726759672164917,
"rewards/margins": 2.3258700370788574,
"rewards/rejected": -2.0086026191711426,
"step": 1660
},
{
"epoch": 0.8418399495904222,
"grad_norm": 32.25,
"learning_rate": 8.587786259541985e-08,
"logits/chosen": -0.845401406288147,
"logits/rejected": -0.9550067782402039,
"logps/chosen": -58.90953826904297,
"logps/rejected": -85.50616455078125,
"loss": 0.293,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.3285283148288727,
"rewards/margins": 2.3436825275421143,
"rewards/rejected": -2.0151543617248535,
"step": 1670
},
{
"epoch": 0.8468809073724007,
"grad_norm": 29.625,
"learning_rate": 8.315158124318429e-08,
"logits/chosen": -0.846836268901825,
"logits/rejected": -0.9689435958862305,
"logps/chosen": -59.2591552734375,
"logps/rejected": -84.11135864257812,
"loss": 0.3235,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.30486565828323364,
"rewards/margins": 2.1752853393554688,
"rewards/rejected": -1.8704197406768799,
"step": 1680
},
{
"epoch": 0.8519218651543793,
"grad_norm": 18.25,
"learning_rate": 8.042529989094875e-08,
"logits/chosen": -0.8394562005996704,
"logits/rejected": -0.9826558828353882,
"logps/chosen": -58.2289924621582,
"logps/rejected": -85.12105560302734,
"loss": 0.2287,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.386637419462204,
"rewards/margins": 2.350778341293335,
"rewards/rejected": -1.9641408920288086,
"step": 1690
},
{
"epoch": 0.856962822936358,
"grad_norm": 14.1875,
"learning_rate": 7.769901853871319e-08,
"logits/chosen": -0.8795615434646606,
"logits/rejected": -0.9935046434402466,
"logps/chosen": -56.979454040527344,
"logps/rejected": -86.98414611816406,
"loss": 0.1638,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.47562724351882935,
"rewards/margins": 2.5870258808135986,
"rewards/rejected": -2.111398220062256,
"step": 1700
},
{
"epoch": 0.856962822936358,
"eval_logits/chosen": -0.8943283557891846,
"eval_logits/rejected": -0.9974154829978943,
"eval_logps/chosen": -59.26322937011719,
"eval_logps/rejected": -84.75007629394531,
"eval_loss": 0.29021164774894714,
"eval_rewards/accuracies": 0.8892215490341187,
"eval_rewards/chosen": 0.2743041515350342,
"eval_rewards/margins": 2.2105515003204346,
"eval_rewards/rejected": -1.9362471103668213,
"eval_runtime": 70.6147,
"eval_samples_per_second": 23.649,
"eval_steps_per_second": 23.649,
"step": 1700
},
{
"epoch": 0.8620037807183365,
"grad_norm": 31.125,
"learning_rate": 7.497273718647764e-08,
"logits/chosen": -0.9100006818771362,
"logits/rejected": -1.028938889503479,
"logps/chosen": -58.05731964111328,
"logps/rejected": -84.74702453613281,
"loss": 0.2487,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.3694465756416321,
"rewards/margins": 2.2923641204833984,
"rewards/rejected": -1.9229179620742798,
"step": 1710
},
{
"epoch": 0.867044738500315,
"grad_norm": 16.625,
"learning_rate": 7.22464558342421e-08,
"logits/chosen": -0.8805274963378906,
"logits/rejected": -0.9727686047554016,
"logps/chosen": -58.530845642089844,
"logps/rejected": -84.3791275024414,
"loss": 0.2809,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.3172784447669983,
"rewards/margins": 2.2292685508728027,
"rewards/rejected": -1.9119901657104492,
"step": 1720
},
{
"epoch": 0.8720856962822936,
"grad_norm": 24.125,
"learning_rate": 6.952017448200655e-08,
"logits/chosen": -0.903597354888916,
"logits/rejected": -1.0219088792800903,
"logps/chosen": -58.6904411315918,
"logps/rejected": -84.50187683105469,
"loss": 0.252,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.3121050298213959,
"rewards/margins": 2.2391486167907715,
"rewards/rejected": -1.9270436763763428,
"step": 1730
},
{
"epoch": 0.8771266540642723,
"grad_norm": 8.8125,
"learning_rate": 6.679389312977098e-08,
"logits/chosen": -0.8927151560783386,
"logits/rejected": -0.9953739047050476,
"logps/chosen": -59.15825271606445,
"logps/rejected": -85.2041244506836,
"loss": 0.2347,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.29449015855789185,
"rewards/margins": 2.263671398162842,
"rewards/rejected": -1.9691814184188843,
"step": 1740
},
{
"epoch": 0.8821676118462508,
"grad_norm": 8.25,
"learning_rate": 6.406761177753544e-08,
"logits/chosen": -0.857822597026825,
"logits/rejected": -0.9733842015266418,
"logps/chosen": -58.579246520996094,
"logps/rejected": -85.5455551147461,
"loss": 0.2629,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.3299351930618286,
"rewards/margins": 2.326323986053467,
"rewards/rejected": -1.9963890314102173,
"step": 1750
},
{
"epoch": 0.8872085696282294,
"grad_norm": 12.5,
"learning_rate": 6.134133042529989e-08,
"logits/chosen": -0.9069849848747253,
"logits/rejected": -1.0028917789459229,
"logps/chosen": -58.58331298828125,
"logps/rejected": -86.31389617919922,
"loss": 0.2365,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.3112293779850006,
"rewards/margins": 2.403074026107788,
"rewards/rejected": -2.0918445587158203,
"step": 1760
},
{
"epoch": 0.8922495274102079,
"grad_norm": 26.375,
"learning_rate": 5.861504907306434e-08,
"logits/chosen": -0.8625677227973938,
"logits/rejected": -0.9746575355529785,
"logps/chosen": -57.2651252746582,
"logps/rejected": -85.93891906738281,
"loss": 0.2094,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.417421817779541,
"rewards/margins": 2.4767680168151855,
"rewards/rejected": -2.0593464374542236,
"step": 1770
},
{
"epoch": 0.8972904851921865,
"grad_norm": 29.875,
"learning_rate": 5.588876772082879e-08,
"logits/chosen": -0.8995459675788879,
"logits/rejected": -1.0156245231628418,
"logps/chosen": -58.09168243408203,
"logps/rejected": -85.68585968017578,
"loss": 0.2295,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.3603942096233368,
"rewards/margins": 2.3807685375213623,
"rewards/rejected": -2.020374298095703,
"step": 1780
},
{
"epoch": 0.9023314429741651,
"grad_norm": 11.75,
"learning_rate": 5.316248636859324e-08,
"logits/chosen": -0.8375666737556458,
"logits/rejected": -0.9661946296691895,
"logps/chosen": -59.528541564941406,
"logps/rejected": -84.20811462402344,
"loss": 0.3174,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.2562711834907532,
"rewards/margins": 2.144699811935425,
"rewards/rejected": -1.8884284496307373,
"step": 1790
},
{
"epoch": 0.9073724007561437,
"grad_norm": 26.25,
"learning_rate": 5.043620501635769e-08,
"logits/chosen": -0.8782273530960083,
"logits/rejected": -0.9897629618644714,
"logps/chosen": -58.66547393798828,
"logps/rejected": -84.6033706665039,
"loss": 0.2588,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.3361870348453522,
"rewards/margins": 2.2399086952209473,
"rewards/rejected": -1.903721570968628,
"step": 1800
},
{
"epoch": 0.9073724007561437,
"eval_logits/chosen": -0.891097903251648,
"eval_logits/rejected": -0.9927994608879089,
"eval_logps/chosen": -59.322288513183594,
"eval_logps/rejected": -84.85601806640625,
"eval_loss": 0.2902388572692871,
"eval_rewards/accuracies": 0.8886227607727051,
"eval_rewards/chosen": 0.2683981657028198,
"eval_rewards/margins": 2.215238094329834,
"eval_rewards/rejected": -1.9468399286270142,
"eval_runtime": 73.6165,
"eval_samples_per_second": 22.685,
"eval_steps_per_second": 22.685,
"step": 1800
},
{
"epoch": 0.9124133585381222,
"grad_norm": 23.125,
"learning_rate": 4.770992366412214e-08,
"logits/chosen": -0.9097870588302612,
"logits/rejected": -1.0100575685501099,
"logps/chosen": -58.4896354675293,
"logps/rejected": -85.04032897949219,
"loss": 0.2725,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.3351896405220032,
"rewards/margins": 2.258465528488159,
"rewards/rejected": -1.9232757091522217,
"step": 1810
},
{
"epoch": 0.9174543163201008,
"grad_norm": 10.0625,
"learning_rate": 4.498364231188658e-08,
"logits/chosen": -0.9119001626968384,
"logits/rejected": -0.9829280972480774,
"logps/chosen": -59.015846252441406,
"logps/rejected": -85.16139221191406,
"loss": 0.2527,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.29453665018081665,
"rewards/margins": 2.282649517059326,
"rewards/rejected": -1.9881130456924438,
"step": 1820
},
{
"epoch": 0.9224952741020794,
"grad_norm": 37.0,
"learning_rate": 4.225736095965103e-08,
"logits/chosen": -0.8899482488632202,
"logits/rejected": -1.003125786781311,
"logps/chosen": -58.26552200317383,
"logps/rejected": -84.43729400634766,
"loss": 0.2777,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.3805728554725647,
"rewards/margins": 2.2908310890197754,
"rewards/rejected": -1.9102582931518555,
"step": 1830
},
{
"epoch": 0.927536231884058,
"grad_norm": 28.5,
"learning_rate": 3.953107960741548e-08,
"logits/chosen": -0.8962399363517761,
"logits/rejected": -1.0039699077606201,
"logps/chosen": -59.460975646972656,
"logps/rejected": -84.71418762207031,
"loss": 0.3002,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.2653732895851135,
"rewards/margins": 2.18709135055542,
"rewards/rejected": -1.9217180013656616,
"step": 1840
},
{
"epoch": 0.9325771896660365,
"grad_norm": 25.375,
"learning_rate": 3.680479825517993e-08,
"logits/chosen": -0.8921709060668945,
"logits/rejected": -0.9834814071655273,
"logps/chosen": -59.8410530090332,
"logps/rejected": -84.02848052978516,
"loss": 0.2857,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.2075158655643463,
"rewards/margins": 2.0984721183776855,
"rewards/rejected": -1.890955924987793,
"step": 1850
},
{
"epoch": 0.9376181474480151,
"grad_norm": 17.0,
"learning_rate": 3.407851690294438e-08,
"logits/chosen": -0.8759490847587585,
"logits/rejected": -0.9916941523551941,
"logps/chosen": -58.60956954956055,
"logps/rejected": -84.21783447265625,
"loss": 0.2933,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.34195294976234436,
"rewards/margins": 2.2317111492156982,
"rewards/rejected": -1.8897583484649658,
"step": 1860
},
{
"epoch": 0.9426591052299937,
"grad_norm": 16.125,
"learning_rate": 3.135223555070883e-08,
"logits/chosen": -0.8751128315925598,
"logits/rejected": -0.9996623992919922,
"logps/chosen": -58.18467330932617,
"logps/rejected": -85.35218811035156,
"loss": 0.2311,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.3810577392578125,
"rewards/margins": 2.361508369445801,
"rewards/rejected": -1.9804503917694092,
"step": 1870
},
{
"epoch": 0.9477000630119723,
"grad_norm": 20.625,
"learning_rate": 2.862595419847328e-08,
"logits/chosen": -0.8567900657653809,
"logits/rejected": -0.96808922290802,
"logps/chosen": -58.792396545410156,
"logps/rejected": -84.65996551513672,
"loss": 0.2498,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.33555370569229126,
"rewards/margins": 2.2720112800598145,
"rewards/rejected": -1.9364579916000366,
"step": 1880
},
{
"epoch": 0.9527410207939508,
"grad_norm": 16.25,
"learning_rate": 2.589967284623773e-08,
"logits/chosen": -0.8800870776176453,
"logits/rejected": -1.0060070753097534,
"logps/chosen": -57.40453338623047,
"logps/rejected": -85.77619171142578,
"loss": 0.2096,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.4209045469760895,
"rewards/margins": 2.4640016555786133,
"rewards/rejected": -2.0430970191955566,
"step": 1890
},
{
"epoch": 0.9577819785759294,
"grad_norm": 32.75,
"learning_rate": 2.317339149400218e-08,
"logits/chosen": -0.8548176884651184,
"logits/rejected": -0.9700508117675781,
"logps/chosen": -59.573455810546875,
"logps/rejected": -84.24436950683594,
"loss": 0.2916,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.2536848485469818,
"rewards/margins": 2.134523391723633,
"rewards/rejected": -1.8808386325836182,
"step": 1900
},
{
"epoch": 0.9577819785759294,
"eval_logits/chosen": -0.8832784295082092,
"eval_logits/rejected": -0.9866493344306946,
"eval_logps/chosen": -59.40935516357422,
"eval_logps/rejected": -84.93498229980469,
"eval_loss": 0.2900025248527527,
"eval_rewards/accuracies": 0.8886227607727051,
"eval_rewards/chosen": 0.25969186425209045,
"eval_rewards/margins": 2.2144293785095215,
"eval_rewards/rejected": -1.9547375440597534,
"eval_runtime": 70.3437,
"eval_samples_per_second": 23.741,
"eval_steps_per_second": 23.741,
"step": 1900
},
{
"epoch": 0.962822936357908,
"grad_norm": 22.125,
"learning_rate": 2.044711014176663e-08,
"logits/chosen": -0.9094161987304688,
"logits/rejected": -1.0066895484924316,
"logps/chosen": -58.97810745239258,
"logps/rejected": -83.76020812988281,
"loss": 0.2996,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.2789512574672699,
"rewards/margins": 2.176806688308716,
"rewards/rejected": -1.897855520248413,
"step": 1910
},
{
"epoch": 0.9678638941398866,
"grad_norm": 16.75,
"learning_rate": 1.772082878953108e-08,
"logits/chosen": -0.8881312608718872,
"logits/rejected": -0.9979362487792969,
"logps/chosen": -58.24678421020508,
"logps/rejected": -84.76899719238281,
"loss": 0.2573,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.3678635060787201,
"rewards/margins": 2.299140453338623,
"rewards/rejected": -1.9312770366668701,
"step": 1920
},
{
"epoch": 0.9729048519218652,
"grad_norm": 10.75,
"learning_rate": 1.4994547437295527e-08,
"logits/chosen": -0.8843402862548828,
"logits/rejected": -0.9742966890335083,
"logps/chosen": -59.153114318847656,
"logps/rejected": -85.64107513427734,
"loss": 0.2745,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.2713843286037445,
"rewards/margins": 2.270181179046631,
"rewards/rejected": -1.998796820640564,
"step": 1930
},
{
"epoch": 0.9779458097038437,
"grad_norm": 25.375,
"learning_rate": 1.2268266085059978e-08,
"logits/chosen": -0.872350811958313,
"logits/rejected": -0.9762080907821655,
"logps/chosen": -58.68449783325195,
"logps/rejected": -84.45391845703125,
"loss": 0.2923,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.31912150979042053,
"rewards/margins": 2.221708059310913,
"rewards/rejected": -1.9025866985321045,
"step": 1940
},
{
"epoch": 0.9829867674858223,
"grad_norm": 17.875,
"learning_rate": 9.541984732824428e-09,
"logits/chosen": -0.8793309926986694,
"logits/rejected": -0.9880158305168152,
"logps/chosen": -58.13665771484375,
"logps/rejected": -85.4889144897461,
"loss": 0.2236,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.3792751729488373,
"rewards/margins": 2.364917039871216,
"rewards/rejected": -1.9856418371200562,
"step": 1950
},
{
"epoch": 0.9880277252678009,
"grad_norm": 14.8125,
"learning_rate": 6.815703380588876e-09,
"logits/chosen": -0.8793843984603882,
"logits/rejected": -0.9873378872871399,
"logps/chosen": -57.81184005737305,
"logps/rejected": -85.71204376220703,
"loss": 0.235,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.3983749747276306,
"rewards/margins": 2.4253978729248047,
"rewards/rejected": -2.0270228385925293,
"step": 1960
},
{
"epoch": 0.9930686830497795,
"grad_norm": 22.75,
"learning_rate": 4.089422028353326e-09,
"logits/chosen": -0.8643286824226379,
"logits/rejected": -0.9733026623725891,
"logps/chosen": -58.93427658081055,
"logps/rejected": -84.8313980102539,
"loss": 0.2905,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.29614943265914917,
"rewards/margins": 2.2761871814727783,
"rewards/rejected": -1.9800374507904053,
"step": 1970
},
{
"epoch": 0.998109640831758,
"grad_norm": 31.75,
"learning_rate": 1.3631406761177753e-09,
"logits/chosen": -0.8575819134712219,
"logits/rejected": -0.9438503980636597,
"logps/chosen": -60.40618896484375,
"logps/rejected": -82.56617736816406,
"loss": 0.3551,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.2122369259595871,
"rewards/margins": 1.9236053228378296,
"rewards/rejected": -1.7113683223724365,
"step": 1980
}
],
"logging_steps": 10,
"max_steps": 1984,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}