Files
avibe/trainer_state.json
2025-10-20 10:48:00 +00:00

13884 lines
414 KiB
JSON

{
"best_global_step": 850,
"best_metric": 0.31901347637176514,
"best_model_checkpoint": "/experiment_results/dpo/A-vibe_OPEN_SOURCE_checkpoint-1600_dpo_chosen_OUR_super_unsafe_from_PR_x15_NEW_CORRECT_04_10_25_v9/checkpoint-850",
"epoch": 1.0,
"eval_steps": 50,
"global_step": 904,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011061946902654867,
"grad_norm": 21.33904266357422,
"learning_rate": 0.0,
"logits/chosen": -1.55078125,
"logits/rejected": -1.46875,
"logps/chosen": -288.0,
"logps/rejected": -235.5,
"loss": 0.7017,
"rewards/accuracies": 0.078125,
"rewards/chosen": -0.007916450500488281,
"rewards/margins": -0.0164794921875,
"rewards/rejected": 0.00848388671875,
"step": 1
},
{
"epoch": 0.0022123893805309734,
"grad_norm": 19.44487762451172,
"learning_rate": 1.7857142857142856e-08,
"logits/chosen": -1.5,
"logits/rejected": -1.43359375,
"logps/chosen": -259.0,
"logps/rejected": -226.0,
"loss": 0.6987,
"rewards/accuracies": 0.171875,
"rewards/chosen": -0.006072998046875,
"rewards/margins": -0.007415771484375,
"rewards/rejected": 0.0013580322265625,
"step": 2
},
{
"epoch": 0.00331858407079646,
"grad_norm": 21.772796630859375,
"learning_rate": 3.571428571428571e-08,
"logits/chosen": -1.58984375,
"logits/rejected": -1.54296875,
"logps/chosen": -288.0,
"logps/rejected": -286.0,
"loss": 0.6943,
"rewards/accuracies": 0.296875,
"rewards/chosen": -0.0041046142578125,
"rewards/margins": 0.002166748046875,
"rewards/rejected": -0.0062713623046875,
"step": 3
},
{
"epoch": 0.004424778761061947,
"grad_norm": 20.7520751953125,
"learning_rate": 5.3571428571428564e-08,
"logits/chosen": -1.65625,
"logits/rejected": -1.6015625,
"logps/chosen": -257.5,
"logps/rejected": -243.0,
"loss": 0.6858,
"rewards/accuracies": 0.328125,
"rewards/chosen": 0.0084075927734375,
"rewards/margins": 0.0184326171875,
"rewards/rejected": -0.009979248046875,
"step": 4
},
{
"epoch": 0.0055309734513274336,
"grad_norm": 22.113121032714844,
"learning_rate": 7.142857142857142e-08,
"logits/chosen": -1.5234375,
"logits/rejected": -1.53515625,
"logps/chosen": -263.0,
"logps/rejected": -262.5,
"loss": 0.6965,
"rewards/accuracies": 0.28125,
"rewards/chosen": -0.008130073547363281,
"rewards/margins": -0.003143310546875,
"rewards/rejected": -0.0049991607666015625,
"step": 5
},
{
"epoch": 0.00663716814159292,
"grad_norm": 22.67697525024414,
"learning_rate": 8.928571428571429e-08,
"logits/chosen": -1.4609375,
"logits/rejected": -1.62109375,
"logps/chosen": -252.5,
"logps/rejected": -259.5,
"loss": 0.6851,
"rewards/accuracies": 0.3046875,
"rewards/chosen": 0.00469970703125,
"rewards/margins": 0.013885498046875,
"rewards/rejected": -0.009189605712890625,
"step": 6
},
{
"epoch": 0.007743362831858407,
"grad_norm": 23.316373825073242,
"learning_rate": 1.0714285714285713e-07,
"logits/chosen": -1.46484375,
"logits/rejected": -1.3984375,
"logps/chosen": -279.0,
"logps/rejected": -271.0,
"loss": 0.698,
"rewards/accuracies": 0.3125,
"rewards/chosen": 0.00156402587890625,
"rewards/margins": -0.0079498291015625,
"rewards/rejected": 0.00946044921875,
"step": 7
},
{
"epoch": 0.008849557522123894,
"grad_norm": 24.865726470947266,
"learning_rate": 1.25e-07,
"logits/chosen": -1.43359375,
"logits/rejected": -1.546875,
"logps/chosen": -275.0,
"logps/rejected": -292.0,
"loss": 0.7039,
"rewards/accuracies": 0.203125,
"rewards/chosen": -0.006072998046875,
"rewards/margins": -0.01904296875,
"rewards/rejected": 0.01300048828125,
"step": 8
},
{
"epoch": 0.00995575221238938,
"grad_norm": 20.924415588378906,
"learning_rate": 1.4285714285714285e-07,
"logits/chosen": -1.55859375,
"logits/rejected": -1.51953125,
"logps/chosen": -238.5,
"logps/rejected": -238.5,
"loss": 0.6892,
"rewards/accuracies": 0.296875,
"rewards/chosen": 0.0125274658203125,
"rewards/margins": 0.0072021484375,
"rewards/rejected": 0.0052642822265625,
"step": 9
},
{
"epoch": 0.011061946902654867,
"grad_norm": 19.864246368408203,
"learning_rate": 1.6071428571428573e-07,
"logits/chosen": -1.56640625,
"logits/rejected": -1.48046875,
"logps/chosen": -249.0,
"logps/rejected": -230.0,
"loss": 0.6956,
"rewards/accuracies": 0.3046875,
"rewards/chosen": 0.0086822509765625,
"rewards/margins": 0.00128173828125,
"rewards/rejected": 0.0074615478515625,
"step": 10
},
{
"epoch": 0.012168141592920354,
"grad_norm": 22.528316497802734,
"learning_rate": 1.7857142857142858e-07,
"logits/chosen": -1.59375,
"logits/rejected": -1.5,
"logps/chosen": -272.0,
"logps/rejected": -290.0,
"loss": 0.6936,
"rewards/accuracies": 0.3359375,
"rewards/chosen": -0.0045032501220703125,
"rewards/margins": 0.0057544708251953125,
"rewards/rejected": -0.01029062271118164,
"step": 11
},
{
"epoch": 0.01327433628318584,
"grad_norm": 21.385112762451172,
"learning_rate": 1.964285714285714e-07,
"logits/chosen": -1.43359375,
"logits/rejected": -1.38671875,
"logps/chosen": -270.0,
"logps/rejected": -281.0,
"loss": 0.6895,
"rewards/accuracies": 0.3359375,
"rewards/chosen": 0.001373291015625,
"rewards/margins": 0.0108795166015625,
"rewards/rejected": -0.009471893310546875,
"step": 12
},
{
"epoch": 0.014380530973451327,
"grad_norm": 21.703392028808594,
"learning_rate": 2.1428571428571426e-07,
"logits/chosen": -1.51953125,
"logits/rejected": -1.35546875,
"logps/chosen": -258.0,
"logps/rejected": -263.0,
"loss": 0.7104,
"rewards/accuracies": 0.2421875,
"rewards/chosen": -0.0135040283203125,
"rewards/margins": -0.0324249267578125,
"rewards/rejected": 0.01898193359375,
"step": 13
},
{
"epoch": 0.015486725663716814,
"grad_norm": 19.697071075439453,
"learning_rate": 2.3214285714285714e-07,
"logits/chosen": -1.4140625,
"logits/rejected": -1.56640625,
"logps/chosen": -248.0,
"logps/rejected": -233.5,
"loss": 0.6953,
"rewards/accuracies": 0.3125,
"rewards/chosen": 0.005401611328125,
"rewards/margins": -0.001678466796875,
"rewards/rejected": 0.007049560546875,
"step": 14
},
{
"epoch": 0.016592920353982302,
"grad_norm": 21.335206985473633,
"learning_rate": 2.5e-07,
"logits/chosen": -1.56640625,
"logits/rejected": -1.51953125,
"logps/chosen": -272.0,
"logps/rejected": -270.0,
"loss": 0.6838,
"rewards/accuracies": 0.34375,
"rewards/chosen": 0.0098724365234375,
"rewards/margins": 0.0190277099609375,
"rewards/rejected": -0.009204864501953125,
"step": 15
},
{
"epoch": 0.017699115044247787,
"grad_norm": 21.42949867248535,
"learning_rate": 2.6785714285714284e-07,
"logits/chosen": -1.515625,
"logits/rejected": -1.6328125,
"logps/chosen": -248.5,
"logps/rejected": -244.5,
"loss": 0.6785,
"rewards/accuracies": 0.34375,
"rewards/chosen": 0.00927734375,
"rewards/margins": 0.0289306640625,
"rewards/rejected": -0.0196533203125,
"step": 16
},
{
"epoch": 0.018805309734513276,
"grad_norm": 20.796878814697266,
"learning_rate": 2.857142857142857e-07,
"logits/chosen": -1.60546875,
"logits/rejected": -1.625,
"logps/chosen": -231.5,
"logps/rejected": -231.5,
"loss": 0.6899,
"rewards/accuracies": 0.34375,
"rewards/chosen": 0.00724029541015625,
"rewards/margins": 0.011138916015625,
"rewards/rejected": -0.00391387939453125,
"step": 17
},
{
"epoch": 0.01991150442477876,
"grad_norm": 20.082786560058594,
"learning_rate": 3.0357142857142855e-07,
"logits/chosen": -1.46875,
"logits/rejected": -1.40625,
"logps/chosen": -251.0,
"logps/rejected": -248.5,
"loss": 0.688,
"rewards/accuracies": 0.40625,
"rewards/chosen": 0.016357421875,
"rewards/margins": 0.0147705078125,
"rewards/rejected": 0.0015716552734375,
"step": 18
},
{
"epoch": 0.02101769911504425,
"grad_norm": 21.640682220458984,
"learning_rate": 3.2142857142857145e-07,
"logits/chosen": -1.59765625,
"logits/rejected": -1.3515625,
"logps/chosen": -264.0,
"logps/rejected": -262.0,
"loss": 0.6912,
"rewards/accuracies": 0.3359375,
"rewards/chosen": 0.01506805419921875,
"rewards/margins": 0.005462646484375,
"rewards/rejected": 0.00958251953125,
"step": 19
},
{
"epoch": 0.022123893805309734,
"grad_norm": 22.128896713256836,
"learning_rate": 3.392857142857143e-07,
"logits/chosen": -1.57421875,
"logits/rejected": -1.47265625,
"logps/chosen": -267.5,
"logps/rejected": -267.0,
"loss": 0.6917,
"rewards/accuracies": 0.3515625,
"rewards/chosen": 0.01458740234375,
"rewards/margins": 0.0075225830078125,
"rewards/rejected": 0.007049560546875,
"step": 20
},
{
"epoch": 0.023230088495575223,
"grad_norm": 20.139122009277344,
"learning_rate": 3.5714285714285716e-07,
"logits/chosen": -1.58203125,
"logits/rejected": -1.46484375,
"logps/chosen": -251.5,
"logps/rejected": -251.0,
"loss": 0.699,
"rewards/accuracies": 0.3203125,
"rewards/chosen": 0.003238677978515625,
"rewards/margins": -0.0052642822265625,
"rewards/rejected": 0.008502960205078125,
"step": 21
},
{
"epoch": 0.024336283185840708,
"grad_norm": 20.964323043823242,
"learning_rate": 3.75e-07,
"logits/chosen": -1.55859375,
"logits/rejected": -1.49609375,
"logps/chosen": -236.0,
"logps/rejected": -260.5,
"loss": 0.6882,
"rewards/accuracies": 0.3984375,
"rewards/chosen": 0.02685546875,
"rewards/margins": 0.013885498046875,
"rewards/rejected": 0.01297760009765625,
"step": 22
},
{
"epoch": 0.025442477876106196,
"grad_norm": 19.556018829345703,
"learning_rate": 3.928571428571428e-07,
"logits/chosen": -1.6015625,
"logits/rejected": -1.4296875,
"logps/chosen": -234.0,
"logps/rejected": -210.0,
"loss": 0.6941,
"rewards/accuracies": 0.359375,
"rewards/chosen": 0.01114654541015625,
"rewards/margins": 0.00146484375,
"rewards/rejected": 0.0096893310546875,
"step": 23
},
{
"epoch": 0.02654867256637168,
"grad_norm": 195.61749267578125,
"learning_rate": 4.1071428571428566e-07,
"logits/chosen": -1.59375,
"logits/rejected": -1.328125,
"logps/chosen": -264.0,
"logps/rejected": -329.5,
"loss": 0.676,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 0.0570068359375,
"rewards/margins": -0.006103515625,
"rewards/rejected": 0.0631866455078125,
"step": 24
},
{
"epoch": 0.02765486725663717,
"grad_norm": 21.722719192504883,
"learning_rate": 4.285714285714285e-07,
"logits/chosen": -1.4375,
"logits/rejected": -1.546875,
"logps/chosen": -259.0,
"logps/rejected": -269.0,
"loss": 0.6887,
"rewards/accuracies": 0.453125,
"rewards/chosen": 0.0538330078125,
"rewards/margins": 0.0157470703125,
"rewards/rejected": 0.03802490234375,
"step": 25
},
{
"epoch": 0.028761061946902654,
"grad_norm": 22.364490509033203,
"learning_rate": 4.464285714285714e-07,
"logits/chosen": -1.41015625,
"logits/rejected": -1.36328125,
"logps/chosen": -296.0,
"logps/rejected": -305.0,
"loss": 0.6882,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.0601806640625,
"rewards/margins": 0.01422119140625,
"rewards/rejected": 0.0460205078125,
"step": 26
},
{
"epoch": 0.029867256637168143,
"grad_norm": 20.38817024230957,
"learning_rate": 4.6428571428571427e-07,
"logits/chosen": -1.44140625,
"logits/rejected": -1.390625,
"logps/chosen": -280.0,
"logps/rejected": -265.0,
"loss": 0.6743,
"rewards/accuracies": 0.4609375,
"rewards/chosen": 0.0726318359375,
"rewards/margins": 0.0447998046875,
"rewards/rejected": 0.02783203125,
"step": 27
},
{
"epoch": 0.030973451327433628,
"grad_norm": 21.340524673461914,
"learning_rate": 4.821428571428571e-07,
"logits/chosen": -1.4609375,
"logits/rejected": -1.49609375,
"logps/chosen": -263.0,
"logps/rejected": -233.5,
"loss": 0.6704,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 0.0859375,
"rewards/margins": 0.05419921875,
"rewards/rejected": 0.03167724609375,
"step": 28
},
{
"epoch": 0.032079646017699116,
"grad_norm": 22.794097900390625,
"learning_rate": 5e-07,
"logits/chosen": -1.47265625,
"logits/rejected": -1.52734375,
"logps/chosen": -251.5,
"logps/rejected": -277.0,
"loss": 0.6665,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.100341796875,
"rewards/margins": 0.0615234375,
"rewards/rejected": 0.03863525390625,
"step": 29
},
{
"epoch": 0.033185840707964605,
"grad_norm": 21.916282653808594,
"learning_rate": 4.999983923145526e-07,
"logits/chosen": -1.45703125,
"logits/rejected": -1.44140625,
"logps/chosen": -268.0,
"logps/rejected": -271.0,
"loss": 0.6672,
"rewards/accuracies": 0.5078125,
"rewards/chosen": 0.087158203125,
"rewards/margins": 0.052978515625,
"rewards/rejected": 0.03411865234375,
"step": 30
},
{
"epoch": 0.034292035398230086,
"grad_norm": 20.50246810913086,
"learning_rate": 4.999935692788877e-07,
"logits/chosen": -1.44140625,
"logits/rejected": -1.42578125,
"logps/chosen": -263.0,
"logps/rejected": -280.0,
"loss": 0.6626,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 0.1103515625,
"rewards/margins": 0.071533203125,
"rewards/rejected": 0.03887939453125,
"step": 31
},
{
"epoch": 0.035398230088495575,
"grad_norm": 21.142545700073242,
"learning_rate": 4.999855309550366e-07,
"logits/chosen": -1.54296875,
"logits/rejected": -1.5859375,
"logps/chosen": -291.0,
"logps/rejected": -268.0,
"loss": 0.6704,
"rewards/accuracies": 0.5390625,
"rewards/chosen": 0.091552734375,
"rewards/margins": 0.0552978515625,
"rewards/rejected": 0.03607177734375,
"step": 32
},
{
"epoch": 0.03650442477876106,
"grad_norm": 20.50800895690918,
"learning_rate": 4.999742774463842e-07,
"logits/chosen": -1.4375,
"logits/rejected": -1.40234375,
"logps/chosen": -256.5,
"logps/rejected": -270.0,
"loss": 0.6494,
"rewards/accuracies": 0.6171875,
"rewards/chosen": 0.1484375,
"rewards/margins": 0.092041015625,
"rewards/rejected": 0.0565185546875,
"step": 33
},
{
"epoch": 0.03761061946902655,
"grad_norm": 19.532590866088867,
"learning_rate": 4.999598088976672e-07,
"logits/chosen": -1.49609375,
"logits/rejected": -1.4765625,
"logps/chosen": -250.0,
"logps/rejected": -260.0,
"loss": 0.6445,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.177734375,
"rewards/margins": 0.105224609375,
"rewards/rejected": 0.072509765625,
"step": 34
},
{
"epoch": 0.03871681415929203,
"grad_norm": 20.883621215820312,
"learning_rate": 4.999421254949727e-07,
"logits/chosen": -1.5390625,
"logits/rejected": -1.4140625,
"logps/chosen": -271.0,
"logps/rejected": -270.0,
"loss": 0.6501,
"rewards/accuracies": 0.5234375,
"rewards/chosen": 0.16943359375,
"rewards/margins": 0.103271484375,
"rewards/rejected": 0.0655517578125,
"step": 35
},
{
"epoch": 0.03982300884955752,
"grad_norm": 20.3232479095459,
"learning_rate": 4.999212274657353e-07,
"logits/chosen": -1.51953125,
"logits/rejected": -1.46484375,
"logps/chosen": -257.0,
"logps/rejected": -255.5,
"loss": 0.6428,
"rewards/accuracies": 0.6015625,
"rewards/chosen": 0.2080078125,
"rewards/margins": 0.114013671875,
"rewards/rejected": 0.09375,
"step": 36
},
{
"epoch": 0.04092920353982301,
"grad_norm": 21.007495880126953,
"learning_rate": 4.99897115078735e-07,
"logits/chosen": -1.4609375,
"logits/rejected": -1.58984375,
"logps/chosen": -259.5,
"logps/rejected": -253.0,
"loss": 0.636,
"rewards/accuracies": 0.5859375,
"rewards/chosen": 0.24609375,
"rewards/margins": 0.13525390625,
"rewards/rejected": 0.110595703125,
"step": 37
},
{
"epoch": 0.0420353982300885,
"grad_norm": 18.463993072509766,
"learning_rate": 4.998697886440926e-07,
"logits/chosen": -1.5078125,
"logits/rejected": -1.4375,
"logps/chosen": -242.0,
"logps/rejected": -246.0,
"loss": 0.6384,
"rewards/accuracies": 0.5234375,
"rewards/chosen": 0.23681640625,
"rewards/margins": 0.130859375,
"rewards/rejected": 0.106201171875,
"step": 38
},
{
"epoch": 0.04314159292035398,
"grad_norm": 20.67741584777832,
"learning_rate": 4.998392485132666e-07,
"logits/chosen": -1.49609375,
"logits/rejected": -1.375,
"logps/chosen": -267.0,
"logps/rejected": -275.0,
"loss": 0.6331,
"rewards/accuracies": 0.5546875,
"rewards/chosen": 0.275390625,
"rewards/margins": 0.14990234375,
"rewards/rejected": 0.12548828125,
"step": 39
},
{
"epoch": 0.04424778761061947,
"grad_norm": 20.392776489257812,
"learning_rate": 4.998054950790485e-07,
"logits/chosen": -1.359375,
"logits/rejected": -1.4609375,
"logps/chosen": -275.0,
"logps/rejected": -285.0,
"loss": 0.6218,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.2841796875,
"rewards/margins": 0.1650390625,
"rewards/rejected": 0.11865234375,
"step": 40
},
{
"epoch": 0.04535398230088496,
"grad_norm": 19.42493438720703,
"learning_rate": 4.997685287755575e-07,
"logits/chosen": -1.515625,
"logits/rejected": -1.3828125,
"logps/chosen": -271.0,
"logps/rejected": -262.5,
"loss": 0.6274,
"rewards/accuracies": 0.5703125,
"rewards/chosen": 0.2744140625,
"rewards/margins": 0.15234375,
"rewards/rejected": 0.12158203125,
"step": 41
},
{
"epoch": 0.046460176991150445,
"grad_norm": 19.07245635986328,
"learning_rate": 4.99728350078235e-07,
"logits/chosen": -1.53515625,
"logits/rejected": -1.4453125,
"logps/chosen": -274.0,
"logps/rejected": -251.5,
"loss": 0.6108,
"rewards/accuracies": 0.5859375,
"rewards/chosen": 0.3173828125,
"rewards/margins": 0.18896484375,
"rewards/rejected": 0.127685546875,
"step": 42
},
{
"epoch": 0.04756637168141593,
"grad_norm": 19.7177677154541,
"learning_rate": 4.996849595038388e-07,
"logits/chosen": -1.515625,
"logits/rejected": -1.49609375,
"logps/chosen": -273.5,
"logps/rejected": -281.0,
"loss": 0.6208,
"rewards/accuracies": 0.578125,
"rewards/chosen": 0.3310546875,
"rewards/margins": 0.17529296875,
"rewards/rejected": 0.15625,
"step": 43
},
{
"epoch": 0.048672566371681415,
"grad_norm": 19.820003509521484,
"learning_rate": 4.996383576104361e-07,
"logits/chosen": -1.5234375,
"logits/rejected": -1.421875,
"logps/chosen": -261.0,
"logps/rejected": -263.5,
"loss": 0.6196,
"rewards/accuracies": 0.5859375,
"rewards/chosen": 0.330078125,
"rewards/margins": 0.1796875,
"rewards/rejected": 0.15087890625,
"step": 44
},
{
"epoch": 0.049778761061946904,
"grad_norm": 20.092729568481445,
"learning_rate": 4.995885449973962e-07,
"logits/chosen": -1.36328125,
"logits/rejected": -1.39453125,
"logps/chosen": -293.0,
"logps/rejected": -295.0,
"loss": 0.6111,
"rewards/accuracies": 0.5859375,
"rewards/chosen": 0.3408203125,
"rewards/margins": 0.20068359375,
"rewards/rejected": 0.140380859375,
"step": 45
},
{
"epoch": 0.05088495575221239,
"grad_norm": 18.567899703979492,
"learning_rate": 4.995355223053834e-07,
"logits/chosen": -1.5,
"logits/rejected": -1.44921875,
"logps/chosen": -260.5,
"logps/rejected": -255.5,
"loss": 0.6146,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.3359375,
"rewards/margins": 0.1982421875,
"rewards/rejected": 0.13720703125,
"step": 46
},
{
"epoch": 0.051991150442477874,
"grad_norm": 20.356060028076172,
"learning_rate": 4.994792902163481e-07,
"logits/chosen": -1.45703125,
"logits/rejected": -1.29296875,
"logps/chosen": -280.0,
"logps/rejected": -260.0,
"loss": 0.627,
"rewards/accuracies": 0.5859375,
"rewards/chosen": 0.357421875,
"rewards/margins": 0.1787109375,
"rewards/rejected": 0.1787109375,
"step": 47
},
{
"epoch": 0.05309734513274336,
"grad_norm": 20.717748641967773,
"learning_rate": 4.994198494535182e-07,
"logits/chosen": -1.4765625,
"logits/rejected": -1.41796875,
"logps/chosen": -280.0,
"logps/rejected": -281.0,
"loss": 0.5881,
"rewards/accuracies": 0.6171875,
"rewards/chosen": 0.4091796875,
"rewards/margins": 0.25390625,
"rewards/rejected": 0.1552734375,
"step": 48
},
{
"epoch": 0.05420353982300885,
"grad_norm": 19.05792236328125,
"learning_rate": 4.993572007813904e-07,
"logits/chosen": -1.390625,
"logits/rejected": -1.35546875,
"logps/chosen": -251.5,
"logps/rejected": -277.0,
"loss": 0.5889,
"rewards/accuracies": 0.6015625,
"rewards/chosen": 0.41015625,
"rewards/margins": 0.263671875,
"rewards/rejected": 0.14599609375,
"step": 49
},
{
"epoch": 0.05530973451327434,
"grad_norm": 17.29762840270996,
"learning_rate": 4.992913450057195e-07,
"logits/chosen": -1.41796875,
"logits/rejected": -1.35546875,
"logps/chosen": -237.0,
"logps/rejected": -224.5,
"loss": 0.5867,
"rewards/accuracies": 0.6015625,
"rewards/chosen": 0.4501953125,
"rewards/margins": 0.26953125,
"rewards/rejected": 0.1796875,
"step": 50
},
{
"epoch": 0.05530973451327434,
"eval_logits/chosen": -1.4197372198104858,
"eval_logits/rejected": -1.4136348962783813,
"eval_logps/chosen": -255.96517944335938,
"eval_logps/rejected": -257.37811279296875,
"eval_loss": 0.5778365731239319,
"eval_rewards/accuracies": 0.6217424273490906,
"eval_rewards/chosen": 0.4856770932674408,
"eval_rewards/margins": 0.2985657751560211,
"eval_rewards/rejected": 0.1872473508119583,
"eval_runtime": 210.1095,
"eval_samples_per_second": 61.173,
"eval_steps_per_second": 0.957,
"step": 50
},
{
"epoch": 0.05641592920353982,
"grad_norm": 18.575069427490234,
"learning_rate": 4.992222829735082e-07,
"logits/chosen": -1.5078125,
"logits/rejected": -1.421875,
"logps/chosen": -260.0,
"logps/rejected": -259.0,
"loss": 0.5874,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.48046875,
"rewards/margins": 0.27734375,
"rewards/rejected": 0.203125,
"step": 51
},
{
"epoch": 0.05752212389380531,
"grad_norm": 17.976177215576172,
"learning_rate": 4.991500155729971e-07,
"logits/chosen": -1.42578125,
"logits/rejected": -1.43359375,
"logps/chosen": -252.5,
"logps/rejected": -256.0,
"loss": 0.575,
"rewards/accuracies": 0.6640625,
"rewards/chosen": 0.50390625,
"rewards/margins": 0.30078125,
"rewards/rejected": 0.20458984375,
"step": 52
},
{
"epoch": 0.0586283185840708,
"grad_norm": 17.979496002197266,
"learning_rate": 4.99074543733652e-07,
"logits/chosen": -1.4453125,
"logits/rejected": -1.46875,
"logps/chosen": -265.0,
"logps/rejected": -267.0,
"loss": 0.5444,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 0.55859375,
"rewards/margins": 0.3984375,
"rewards/rejected": 0.1611328125,
"step": 53
},
{
"epoch": 0.059734513274336286,
"grad_norm": 17.736249923706055,
"learning_rate": 4.989958684261526e-07,
"logits/chosen": -1.31640625,
"logits/rejected": -1.375,
"logps/chosen": -255.5,
"logps/rejected": -292.0,
"loss": 0.5529,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.568359375,
"rewards/margins": 0.39453125,
"rewards/rejected": 0.17333984375,
"step": 54
},
{
"epoch": 0.06084070796460177,
"grad_norm": 17.284420013427734,
"learning_rate": 4.989139906623802e-07,
"logits/chosen": -1.44140625,
"logits/rejected": -1.42578125,
"logps/chosen": -253.0,
"logps/rejected": -256.5,
"loss": 0.5522,
"rewards/accuracies": 0.6953125,
"rewards/chosen": 0.599609375,
"rewards/margins": 0.373046875,
"rewards/rejected": 0.2255859375,
"step": 55
},
{
"epoch": 0.061946902654867256,
"grad_norm": 17.704713821411133,
"learning_rate": 4.988289114954044e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.35546875,
"logps/chosen": -237.5,
"logps/rejected": -260.0,
"loss": 0.5504,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.654296875,
"rewards/margins": 0.3896484375,
"rewards/rejected": 0.263671875,
"step": 56
},
{
"epoch": 0.06305309734513274,
"grad_norm": 17.688888549804688,
"learning_rate": 4.987406320194694e-07,
"logits/chosen": -1.453125,
"logits/rejected": -1.3359375,
"logps/chosen": -242.5,
"logps/rejected": -247.5,
"loss": 0.5537,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.69140625,
"rewards/margins": 0.3857421875,
"rewards/rejected": 0.3046875,
"step": 57
},
{
"epoch": 0.06415929203539823,
"grad_norm": 17.646419525146484,
"learning_rate": 4.986491533699802e-07,
"logits/chosen": -1.41015625,
"logits/rejected": -1.37109375,
"logps/chosen": -256.5,
"logps/rejected": -281.0,
"loss": 0.5431,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.71484375,
"rewards/margins": 0.439453125,
"rewards/rejected": 0.27490234375,
"step": 58
},
{
"epoch": 0.06526548672566372,
"grad_norm": 17.19894027709961,
"learning_rate": 4.985544767234879e-07,
"logits/chosen": -1.41796875,
"logits/rejected": -1.4453125,
"logps/chosen": -245.5,
"logps/rejected": -250.5,
"loss": 0.5403,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.7421875,
"rewards/margins": 0.4541015625,
"rewards/rejected": 0.287109375,
"step": 59
},
{
"epoch": 0.06637168141592921,
"grad_norm": 16.630441665649414,
"learning_rate": 4.984566032976749e-07,
"logits/chosen": -1.390625,
"logits/rejected": -1.33984375,
"logps/chosen": -248.0,
"logps/rejected": -254.5,
"loss": 0.5386,
"rewards/accuracies": 0.6640625,
"rewards/chosen": 0.828125,
"rewards/margins": 0.466796875,
"rewards/rejected": 0.361328125,
"step": 60
},
{
"epoch": 0.06747787610619468,
"grad_norm": 17.252979278564453,
"learning_rate": 4.983555343513384e-07,
"logits/chosen": -1.41796875,
"logits/rejected": -1.4375,
"logps/chosen": -250.5,
"logps/rejected": -275.5,
"loss": 0.5028,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.8515625,
"rewards/margins": 0.578125,
"rewards/rejected": 0.271484375,
"step": 61
},
{
"epoch": 0.06858407079646017,
"grad_norm": 17.014602661132812,
"learning_rate": 4.982512711843752e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.30078125,
"logps/chosen": -242.0,
"logps/rejected": -243.5,
"loss": 0.5159,
"rewards/accuracies": 0.671875,
"rewards/chosen": 0.94140625,
"rewards/margins": 0.5458984375,
"rewards/rejected": 0.39453125,
"step": 62
},
{
"epoch": 0.06969026548672566,
"grad_norm": 16.877351760864258,
"learning_rate": 4.98143815137764e-07,
"logits/chosen": -1.40234375,
"logits/rejected": -1.36328125,
"logps/chosen": -264.0,
"logps/rejected": -285.0,
"loss": 0.5343,
"rewards/accuracies": 0.6328125,
"rewards/chosen": 0.857421875,
"rewards/margins": 0.4853515625,
"rewards/rejected": 0.373046875,
"step": 63
},
{
"epoch": 0.07079646017699115,
"grad_norm": 17.047161102294922,
"learning_rate": 4.980331675935493e-07,
"logits/chosen": -1.33203125,
"logits/rejected": -1.34375,
"logps/chosen": -254.5,
"logps/rejected": -291.0,
"loss": 0.5211,
"rewards/accuracies": 0.6484375,
"rewards/chosen": 0.962890625,
"rewards/margins": 0.533203125,
"rewards/rejected": 0.4296875,
"step": 64
},
{
"epoch": 0.07190265486725664,
"grad_norm": 17.61670684814453,
"learning_rate": 4.979193299748224e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.3828125,
"logps/chosen": -265.0,
"logps/rejected": -278.0,
"loss": 0.4878,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 0.96484375,
"rewards/margins": 0.6640625,
"rewards/rejected": 0.2998046875,
"step": 65
},
{
"epoch": 0.07300884955752213,
"grad_norm": 17.460668563842773,
"learning_rate": 4.978023037457043e-07,
"logits/chosen": -1.44921875,
"logits/rejected": -1.3515625,
"logps/chosen": -267.0,
"logps/rejected": -279.0,
"loss": 0.536,
"rewards/accuracies": 0.5703125,
"rewards/chosen": 0.95703125,
"rewards/margins": 0.5390625,
"rewards/rejected": 0.416015625,
"step": 66
},
{
"epoch": 0.07411504424778761,
"grad_norm": 181.168212890625,
"learning_rate": 4.976820904113256e-07,
"logits/chosen": -1.36328125,
"logits/rejected": -1.30859375,
"logps/chosen": -233.5,
"logps/rejected": -340.0,
"loss": 0.489,
"rewards/accuracies": 0.734375,
"rewards/chosen": 1.05859375,
"rewards/margins": 0.646484375,
"rewards/rejected": 0.41015625,
"step": 67
},
{
"epoch": 0.0752212389380531,
"grad_norm": 15.64547061920166,
"learning_rate": 4.975586915178084e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.40234375,
"logps/chosen": -241.5,
"logps/rejected": -256.0,
"loss": 0.4702,
"rewards/accuracies": 0.6953125,
"rewards/chosen": 1.041015625,
"rewards/margins": 0.716796875,
"rewards/rejected": 0.32421875,
"step": 68
},
{
"epoch": 0.07632743362831858,
"grad_norm": 16.069597244262695,
"learning_rate": 4.974321086522452e-07,
"logits/chosen": -1.3046875,
"logits/rejected": -1.234375,
"logps/chosen": -256.5,
"logps/rejected": -251.5,
"loss": 0.5188,
"rewards/accuracies": 0.6328125,
"rewards/chosen": 0.986328125,
"rewards/margins": 0.568359375,
"rewards/rejected": 0.4208984375,
"step": 69
},
{
"epoch": 0.07743362831858407,
"grad_norm": 149.1916046142578,
"learning_rate": 4.973023434426798e-07,
"logits/chosen": -1.4140625,
"logits/rejected": -1.421875,
"logps/chosen": -248.5,
"logps/rejected": -247.0,
"loss": 0.5642,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.96875,
"rewards/margins": 0.5107421875,
"rewards/rejected": 0.458984375,
"step": 70
},
{
"epoch": 0.07853982300884955,
"grad_norm": 16.17060089111328,
"learning_rate": 4.971693975580851e-07,
"logits/chosen": -1.390625,
"logits/rejected": -1.28515625,
"logps/chosen": -232.0,
"logps/rejected": -241.0,
"loss": 0.5079,
"rewards/accuracies": 0.6328125,
"rewards/chosen": 1.025390625,
"rewards/margins": 0.61328125,
"rewards/rejected": 0.4140625,
"step": 71
},
{
"epoch": 0.07964601769911504,
"grad_norm": 17.631471633911133,
"learning_rate": 4.970332727083425e-07,
"logits/chosen": -1.36328125,
"logits/rejected": -1.35546875,
"logps/chosen": -271.5,
"logps/rejected": -283.0,
"loss": 0.5212,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.1328125,
"rewards/margins": 0.619140625,
"rewards/rejected": 0.5166015625,
"step": 72
},
{
"epoch": 0.08075221238938053,
"grad_norm": 17.61063003540039,
"learning_rate": 4.968939706442195e-07,
"logits/chosen": -1.39453125,
"logits/rejected": -1.203125,
"logps/chosen": -275.0,
"logps/rejected": -255.5,
"loss": 0.5211,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.99609375,
"rewards/margins": 0.583984375,
"rewards/rejected": 0.4130859375,
"step": 73
},
{
"epoch": 0.08185840707964602,
"grad_norm": 16.28321075439453,
"learning_rate": 4.967514931573472e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.41015625,
"logps/chosen": -243.0,
"logps/rejected": -258.0,
"loss": 0.4977,
"rewards/accuracies": 0.6640625,
"rewards/chosen": 1.20703125,
"rewards/margins": 0.689453125,
"rewards/rejected": 0.5146484375,
"step": 74
},
{
"epoch": 0.08296460176991151,
"grad_norm": 16.079599380493164,
"learning_rate": 4.966058420801977e-07,
"logits/chosen": -1.34765625,
"logits/rejected": -1.35546875,
"logps/chosen": -259.5,
"logps/rejected": -244.0,
"loss": 0.4648,
"rewards/accuracies": 0.6796875,
"rewards/chosen": 1.13671875,
"rewards/margins": 0.779296875,
"rewards/rejected": 0.3583984375,
"step": 75
},
{
"epoch": 0.084070796460177,
"grad_norm": 18.725271224975586,
"learning_rate": 4.964570192860596e-07,
"logits/chosen": -1.359375,
"logits/rejected": -1.36328125,
"logps/chosen": -287.0,
"logps/rejected": -250.5,
"loss": 0.5436,
"rewards/accuracies": 0.5859375,
"rewards/chosen": 1.109375,
"rewards/margins": 0.5234375,
"rewards/rejected": 0.583984375,
"step": 76
},
{
"epoch": 0.08517699115044247,
"grad_norm": 15.678024291992188,
"learning_rate": 4.963050266890152e-07,
"logits/chosen": -1.328125,
"logits/rejected": -1.4609375,
"logps/chosen": -253.0,
"logps/rejected": -246.0,
"loss": 0.4833,
"rewards/accuracies": 0.703125,
"rewards/chosen": 1.15625,
"rewards/margins": 0.767578125,
"rewards/rejected": 0.390625,
"step": 77
},
{
"epoch": 0.08628318584070796,
"grad_norm": 15.54704761505127,
"learning_rate": 4.961498662439145e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.359375,
"logps/chosen": -230.5,
"logps/rejected": -249.0,
"loss": 0.4718,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 1.3046875,
"rewards/margins": 0.82421875,
"rewards/rejected": 0.48046875,
"step": 78
},
{
"epoch": 0.08738938053097345,
"grad_norm": 16.073156356811523,
"learning_rate": 4.959915399463512e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.27734375,
"logps/chosen": -246.0,
"logps/rejected": -259.5,
"loss": 0.4602,
"rewards/accuracies": 0.671875,
"rewards/chosen": 1.27734375,
"rewards/margins": 0.85546875,
"rewards/rejected": 0.423828125,
"step": 79
},
{
"epoch": 0.08849557522123894,
"grad_norm": 14.94100284576416,
"learning_rate": 4.958300498326362e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.40234375,
"logps/chosen": -231.0,
"logps/rejected": -264.5,
"loss": 0.4397,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.30078125,
"rewards/margins": 0.953125,
"rewards/rejected": 0.345703125,
"step": 80
},
{
"epoch": 0.08960176991150443,
"grad_norm": 17.28353500366211,
"learning_rate": 4.956653979797721e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.3828125,
"logps/chosen": -280.0,
"logps/rejected": -259.5,
"loss": 0.5153,
"rewards/accuracies": 0.609375,
"rewards/chosen": 1.18359375,
"rewards/margins": 0.71875,
"rewards/rejected": 0.4658203125,
"step": 81
},
{
"epoch": 0.09070796460176991,
"grad_norm": 15.823220252990723,
"learning_rate": 4.954975865054259e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.32421875,
"logps/chosen": -255.5,
"logps/rejected": -256.5,
"loss": 0.4614,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.34375,
"rewards/margins": 0.87109375,
"rewards/rejected": 0.474609375,
"step": 82
},
{
"epoch": 0.0918141592920354,
"grad_norm": 14.873113632202148,
"learning_rate": 4.953266175679023e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.3125,
"logps/chosen": -236.5,
"logps/rejected": -241.5,
"loss": 0.4586,
"rewards/accuracies": 0.703125,
"rewards/chosen": 1.33984375,
"rewards/margins": 0.869140625,
"rewards/rejected": 0.47265625,
"step": 83
},
{
"epoch": 0.09292035398230089,
"grad_norm": 16.7225341796875,
"learning_rate": 4.951524933661154e-07,
"logits/chosen": -1.328125,
"logits/rejected": -1.30078125,
"logps/chosen": -256.0,
"logps/rejected": -237.0,
"loss": 0.5129,
"rewards/accuracies": 0.640625,
"rewards/chosen": 1.1953125,
"rewards/margins": 0.6953125,
"rewards/rejected": 0.5009765625,
"step": 84
},
{
"epoch": 0.09402654867256637,
"grad_norm": 15.362020492553711,
"learning_rate": 4.949752161395605e-07,
"logits/chosen": -1.3046875,
"logits/rejected": -1.24609375,
"logps/chosen": -257.0,
"logps/rejected": -252.0,
"loss": 0.4339,
"rewards/accuracies": 0.703125,
"rewards/chosen": 1.2890625,
"rewards/margins": 0.990234375,
"rewards/rejected": 0.30078125,
"step": 85
},
{
"epoch": 0.09513274336283185,
"grad_norm": 15.91197681427002,
"learning_rate": 4.94794788168286e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.24609375,
"logps/chosen": -229.5,
"logps/rejected": -255.5,
"loss": 0.4675,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.3203125,
"rewards/margins": 0.85546875,
"rewards/rejected": 0.4658203125,
"step": 86
},
{
"epoch": 0.09623893805309734,
"grad_norm": 15.647570610046387,
"learning_rate": 4.946112117728634e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.3359375,
"logps/chosen": -243.0,
"logps/rejected": -235.5,
"loss": 0.4574,
"rewards/accuracies": 0.671875,
"rewards/chosen": 1.28515625,
"rewards/margins": 0.865234375,
"rewards/rejected": 0.41796875,
"step": 87
},
{
"epoch": 0.09734513274336283,
"grad_norm": 17.21525001525879,
"learning_rate": 4.944244893143572e-07,
"logits/chosen": -1.3515625,
"logits/rejected": -1.26953125,
"logps/chosen": -268.0,
"logps/rejected": -264.0,
"loss": 0.4832,
"rewards/accuracies": 0.671875,
"rewards/chosen": 1.28515625,
"rewards/margins": 0.826171875,
"rewards/rejected": 0.4580078125,
"step": 88
},
{
"epoch": 0.09845132743362832,
"grad_norm": 16.55433464050293,
"learning_rate": 4.942346231942955e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.3828125,
"logps/chosen": -255.0,
"logps/rejected": -258.5,
"loss": 0.4967,
"rewards/accuracies": 0.609375,
"rewards/chosen": 1.3359375,
"rewards/margins": 0.80078125,
"rewards/rejected": 0.53515625,
"step": 89
},
{
"epoch": 0.09955752212389381,
"grad_norm": 15.434545516967773,
"learning_rate": 4.94041615854638e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.28515625,
"logps/chosen": -265.0,
"logps/rejected": -262.5,
"loss": 0.4258,
"rewards/accuracies": 0.6640625,
"rewards/chosen": 1.42578125,
"rewards/margins": 1.0390625,
"rewards/rejected": 0.3876953125,
"step": 90
},
{
"epoch": 0.1006637168141593,
"grad_norm": 16.600032806396484,
"learning_rate": 4.938454697777457e-07,
"logits/chosen": -1.234375,
"logits/rejected": -1.1953125,
"logps/chosen": -279.0,
"logps/rejected": -278.0,
"loss": 0.473,
"rewards/accuracies": 0.671875,
"rewards/chosen": 1.265625,
"rewards/margins": 0.8828125,
"rewards/rejected": 0.3818359375,
"step": 91
},
{
"epoch": 0.10176991150442478,
"grad_norm": 17.01357078552246,
"learning_rate": 4.936461874863479e-07,
"logits/chosen": -1.3828125,
"logits/rejected": -1.3828125,
"logps/chosen": -255.5,
"logps/rejected": -286.0,
"loss": 0.495,
"rewards/accuracies": 0.640625,
"rewards/chosen": 1.3125,
"rewards/margins": 0.833984375,
"rewards/rejected": 0.4765625,
"step": 92
},
{
"epoch": 0.10287610619469026,
"grad_norm": 15.263469696044922,
"learning_rate": 4.934437715435107e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.33203125,
"logps/chosen": -244.5,
"logps/rejected": -245.0,
"loss": 0.4545,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.328125,
"rewards/margins": 0.955078125,
"rewards/rejected": 0.3759765625,
"step": 93
},
{
"epoch": 0.10398230088495575,
"grad_norm": 15.016999244689941,
"learning_rate": 4.932382245526034e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.21875,
"logps/chosen": -245.0,
"logps/rejected": -256.5,
"loss": 0.4381,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 1.34375,
"rewards/margins": 0.970703125,
"rewards/rejected": 0.3740234375,
"step": 94
},
{
"epoch": 0.10508849557522124,
"grad_norm": 16.34784507751465,
"learning_rate": 4.930295491572653e-07,
"logits/chosen": -1.37109375,
"logits/rejected": -1.3203125,
"logps/chosen": -247.0,
"logps/rejected": -260.0,
"loss": 0.4766,
"rewards/accuracies": 0.640625,
"rewards/chosen": 1.34765625,
"rewards/margins": 0.87890625,
"rewards/rejected": 0.46875,
"step": 95
},
{
"epoch": 0.10619469026548672,
"grad_norm": 15.943212509155273,
"learning_rate": 4.928177480413714e-07,
"logits/chosen": -1.3046875,
"logits/rejected": -1.28125,
"logps/chosen": -259.0,
"logps/rejected": -270.5,
"loss": 0.4727,
"rewards/accuracies": 0.6484375,
"rewards/chosen": 1.3515625,
"rewards/margins": 0.955078125,
"rewards/rejected": 0.3955078125,
"step": 96
},
{
"epoch": 0.10730088495575221,
"grad_norm": 16.164276123046875,
"learning_rate": 4.926028239289984e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.38671875,
"logps/chosen": -273.5,
"logps/rejected": -268.5,
"loss": 0.4556,
"rewards/accuracies": 0.6640625,
"rewards/chosen": 1.37890625,
"rewards/margins": 0.97265625,
"rewards/rejected": 0.4072265625,
"step": 97
},
{
"epoch": 0.1084070796460177,
"grad_norm": 16.16509437561035,
"learning_rate": 4.923847795843893e-07,
"logits/chosen": -1.359375,
"logits/rejected": -1.2265625,
"logps/chosen": -270.0,
"logps/rejected": -277.0,
"loss": 0.4657,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.265625,
"rewards/margins": 0.947265625,
"rewards/rejected": 0.3173828125,
"step": 98
},
{
"epoch": 0.10951327433628319,
"grad_norm": 16.42505645751953,
"learning_rate": 4.921636178119177e-07,
"logits/chosen": -1.47265625,
"logits/rejected": -1.16796875,
"logps/chosen": -251.0,
"logps/rejected": -232.5,
"loss": 0.4747,
"rewards/accuracies": 0.6328125,
"rewards/chosen": 1.296875,
"rewards/margins": 0.8671875,
"rewards/rejected": 0.4287109375,
"step": 99
},
{
"epoch": 0.11061946902654868,
"grad_norm": 23.872831344604492,
"learning_rate": 4.919393414560522e-07,
"logits/chosen": -1.39453125,
"logits/rejected": -1.3125,
"logps/chosen": -246.0,
"logps/rejected": -249.5,
"loss": 0.4521,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.26171875,
"rewards/margins": 0.93359375,
"rewards/rejected": 0.328125,
"step": 100
},
{
"epoch": 0.11061946902654868,
"eval_logits/chosen": -1.329796314239502,
"eval_logits/rejected": -1.3044931888580322,
"eval_logps/chosen": -247.93531799316406,
"eval_logps/rejected": -256.3333435058594,
"eval_loss": 0.4438014328479767,
"eval_rewards/accuracies": 0.7061508893966675,
"eval_rewards/chosen": 1.2971081733703613,
"eval_rewards/margins": 1.0102806091308594,
"eval_rewards/rejected": 0.2868901491165161,
"eval_runtime": 193.1281,
"eval_samples_per_second": 66.552,
"eval_steps_per_second": 1.041,
"step": 100
},
{
"epoch": 0.11172566371681415,
"grad_norm": 13.894512176513672,
"learning_rate": 4.917119534013193e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.234375,
"logps/chosen": -233.0,
"logps/rejected": -233.0,
"loss": 0.418,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.25390625,
"rewards/margins": 1.0546875,
"rewards/rejected": 0.1982421875,
"step": 101
},
{
"epoch": 0.11283185840707964,
"grad_norm": 15.946537017822266,
"learning_rate": 4.91481456572267e-07,
"logits/chosen": -1.359375,
"logits/rejected": -1.23828125,
"logps/chosen": -251.5,
"logps/rejected": -249.5,
"loss": 0.4642,
"rewards/accuracies": 0.6953125,
"rewards/chosen": 1.22265625,
"rewards/margins": 0.91796875,
"rewards/rejected": 0.30517578125,
"step": 102
},
{
"epoch": 0.11393805309734513,
"grad_norm": 13.790292739868164,
"learning_rate": 4.912478539334264e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.35546875,
"logps/chosen": -223.5,
"logps/rejected": -241.0,
"loss": 0.3972,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.40625,
"rewards/margins": 1.14453125,
"rewards/rejected": 0.25634765625,
"step": 103
},
{
"epoch": 0.11504424778761062,
"grad_norm": 14.399200439453125,
"learning_rate": 4.910111484892739e-07,
"logits/chosen": -1.296875,
"logits/rejected": -1.26171875,
"logps/chosen": -240.5,
"logps/rejected": -260.5,
"loss": 0.3929,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.36328125,
"rewards/margins": 1.23046875,
"rewards/rejected": 0.1336669921875,
"step": 104
},
{
"epoch": 0.1161504424778761,
"grad_norm": 16.46123504638672,
"learning_rate": 4.907713432841928e-07,
"logits/chosen": -1.24609375,
"logits/rejected": -1.14453125,
"logps/chosen": -255.0,
"logps/rejected": -229.0,
"loss": 0.5001,
"rewards/accuracies": 0.6171875,
"rewards/chosen": 1.0625,
"rewards/margins": 0.828125,
"rewards/rejected": 0.23486328125,
"step": 105
},
{
"epoch": 0.1172566371681416,
"grad_norm": 16.047285079956055,
"learning_rate": 4.905284414024337e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.40234375,
"logps/chosen": -242.5,
"logps/rejected": -284.0,
"loss": 0.4525,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 1.22265625,
"rewards/margins": 0.986328125,
"rewards/rejected": 0.23388671875,
"step": 106
},
{
"epoch": 0.11836283185840708,
"grad_norm": 16.925933837890625,
"learning_rate": 4.902824459680752e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.3125,
"logps/chosen": -265.0,
"logps/rejected": -266.0,
"loss": 0.46,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.19140625,
"rewards/margins": 0.978515625,
"rewards/rejected": 0.209228515625,
"step": 107
},
{
"epoch": 0.11946902654867257,
"grad_norm": 15.083377838134766,
"learning_rate": 4.900333601449835e-07,
"logits/chosen": -1.265625,
"logits/rejected": -1.33203125,
"logps/chosen": -266.0,
"logps/rejected": -265.0,
"loss": 0.4376,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.21484375,
"rewards/margins": 1.05078125,
"rewards/rejected": 0.162109375,
"step": 108
},
{
"epoch": 0.12057522123893805,
"grad_norm": 17.161651611328125,
"learning_rate": 4.89781187136772e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.27734375,
"logps/chosen": -254.0,
"logps/rejected": -276.0,
"loss": 0.4388,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.26171875,
"rewards/margins": 1.09375,
"rewards/rejected": 0.16796875,
"step": 109
},
{
"epoch": 0.12168141592920353,
"grad_norm": 15.041230201721191,
"learning_rate": 4.895259301867595e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.328125,
"logps/chosen": -246.0,
"logps/rejected": -280.0,
"loss": 0.4269,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 1.3046875,
"rewards/margins": 1.1171875,
"rewards/rejected": 0.18603515625,
"step": 110
},
{
"epoch": 0.12278761061946902,
"grad_norm": 13.183340072631836,
"learning_rate": 4.892675925779292e-07,
"logits/chosen": -1.3359375,
"logits/rejected": -1.36328125,
"logps/chosen": -207.5,
"logps/rejected": -250.0,
"loss": 0.4122,
"rewards/accuracies": 0.734375,
"rewards/chosen": 1.328125,
"rewards/margins": 1.26171875,
"rewards/rejected": 0.0699310302734375,
"step": 111
},
{
"epoch": 0.12389380530973451,
"grad_norm": 15.673736572265625,
"learning_rate": 4.89006177632886e-07,
"logits/chosen": -1.375,
"logits/rejected": -1.37109375,
"logps/chosen": -263.0,
"logps/rejected": -272.0,
"loss": 0.4412,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.24609375,
"rewards/margins": 1.08203125,
"rewards/rejected": 0.1640625,
"step": 112
},
{
"epoch": 0.125,
"grad_norm": 14.536067008972168,
"learning_rate": 4.887416887138138e-07,
"logits/chosen": -1.1875,
"logits/rejected": -1.15625,
"logps/chosen": -257.5,
"logps/rejected": -270.0,
"loss": 0.4604,
"rewards/accuracies": 0.671875,
"rewards/chosen": 1.1953125,
"rewards/margins": 1.09375,
"rewards/rejected": 0.10595703125,
"step": 113
},
{
"epoch": 0.1261061946902655,
"grad_norm": 14.573882102966309,
"learning_rate": 4.884741292224326e-07,
"logits/chosen": -1.296875,
"logits/rejected": -1.34765625,
"logps/chosen": -240.0,
"logps/rejected": -268.5,
"loss": 0.4091,
"rewards/accuracies": 0.734375,
"rewards/chosen": 1.265625,
"rewards/margins": 1.16796875,
"rewards/rejected": 0.09814453125,
"step": 114
},
{
"epoch": 0.12721238938053098,
"grad_norm": 15.746649742126465,
"learning_rate": 4.882035025999544e-07,
"logits/chosen": -1.22265625,
"logits/rejected": -1.1875,
"logps/chosen": -273.5,
"logps/rejected": -270.5,
"loss": 0.4313,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.15234375,
"rewards/margins": 1.1640625,
"rewards/rejected": -0.0108642578125,
"step": 115
},
{
"epoch": 0.12831858407079647,
"grad_norm": 15.159698486328125,
"learning_rate": 4.879298123270391e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.36328125,
"logps/chosen": -244.5,
"logps/rejected": -256.0,
"loss": 0.4331,
"rewards/accuracies": 0.6796875,
"rewards/chosen": 1.23046875,
"rewards/margins": 1.171875,
"rewards/rejected": 0.0592041015625,
"step": 116
},
{
"epoch": 0.12942477876106195,
"grad_norm": 14.491214752197266,
"learning_rate": 4.876530619237495e-07,
"logits/chosen": -1.20703125,
"logits/rejected": -1.2578125,
"logps/chosen": -235.5,
"logps/rejected": -233.5,
"loss": 0.4267,
"rewards/accuracies": 0.6953125,
"rewards/chosen": 1.19140625,
"rewards/margins": 1.20703125,
"rewards/rejected": -0.018310546875,
"step": 117
},
{
"epoch": 0.13053097345132744,
"grad_norm": 15.388319969177246,
"learning_rate": 4.873732549495065e-07,
"logits/chosen": -1.3203125,
"logits/rejected": -1.25,
"logps/chosen": -263.0,
"logps/rejected": -254.5,
"loss": 0.4351,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 1.16796875,
"rewards/margins": 1.1171875,
"rewards/rejected": 0.05218505859375,
"step": 118
},
{
"epoch": 0.13163716814159293,
"grad_norm": 14.733474731445312,
"learning_rate": 4.870903950030428e-07,
"logits/chosen": -1.35546875,
"logits/rejected": -1.31640625,
"logps/chosen": -241.5,
"logps/rejected": -274.0,
"loss": 0.376,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.390625,
"rewards/margins": 1.30859375,
"rewards/rejected": 0.086669921875,
"step": 119
},
{
"epoch": 0.13274336283185842,
"grad_norm": 14.26261043548584,
"learning_rate": 4.868044857223571e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.29296875,
"logps/chosen": -248.5,
"logps/rejected": -281.0,
"loss": 0.3815,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.296875,
"rewards/margins": 1.3046875,
"rewards/rejected": -0.008544921875,
"step": 120
},
{
"epoch": 0.1338495575221239,
"grad_norm": 15.668647766113281,
"learning_rate": 4.865155307846669e-07,
"logits/chosen": -1.3828125,
"logits/rejected": -1.4375,
"logps/chosen": -232.0,
"logps/rejected": -240.0,
"loss": 0.4114,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 1.2734375,
"rewards/margins": 1.2421875,
"rewards/rejected": 0.0323486328125,
"step": 121
},
{
"epoch": 0.13495575221238937,
"grad_norm": 16.117341995239258,
"learning_rate": 4.862235339063613e-07,
"logits/chosen": -1.35546875,
"logits/rejected": -1.26171875,
"logps/chosen": -252.5,
"logps/rejected": -268.0,
"loss": 0.4789,
"rewards/accuracies": 0.6640625,
"rewards/chosen": 1.10546875,
"rewards/margins": 0.9375,
"rewards/rejected": 0.169189453125,
"step": 122
},
{
"epoch": 0.13606194690265486,
"grad_norm": 15.315237998962402,
"learning_rate": 4.859284988429533e-07,
"logits/chosen": -1.390625,
"logits/rejected": -1.37109375,
"logps/chosen": -264.0,
"logps/rejected": -302.0,
"loss": 0.4574,
"rewards/accuracies": 0.6796875,
"rewards/chosen": 1.09375,
"rewards/margins": 1.0234375,
"rewards/rejected": 0.0693359375,
"step": 123
},
{
"epoch": 0.13716814159292035,
"grad_norm": 13.38134765625,
"learning_rate": 4.856304293890317e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.18359375,
"logps/chosen": -255.0,
"logps/rejected": -253.5,
"loss": 0.3681,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.3984375,
"rewards/margins": 1.484375,
"rewards/rejected": -0.08575439453125,
"step": 124
},
{
"epoch": 0.13827433628318583,
"grad_norm": 17.225801467895508,
"learning_rate": 4.853293293782118e-07,
"logits/chosen": -1.39453125,
"logits/rejected": -1.4140625,
"logps/chosen": -276.0,
"logps/rejected": -280.0,
"loss": 0.458,
"rewards/accuracies": 0.6796875,
"rewards/chosen": 1.1796875,
"rewards/margins": 1.08203125,
"rewards/rejected": 0.09991455078125,
"step": 125
},
{
"epoch": 0.13938053097345132,
"grad_norm": 14.186132431030273,
"learning_rate": 4.850252026830863e-07,
"logits/chosen": -1.359375,
"logits/rejected": -1.26953125,
"logps/chosen": -234.5,
"logps/rejected": -252.5,
"loss": 0.4436,
"rewards/accuracies": 0.734375,
"rewards/chosen": 1.25390625,
"rewards/margins": 1.123046875,
"rewards/rejected": 0.1328125,
"step": 126
},
{
"epoch": 0.1404867256637168,
"grad_norm": 14.477481842041016,
"learning_rate": 4.84718053215176e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.23046875,
"logps/chosen": -249.5,
"logps/rejected": -256.0,
"loss": 0.4314,
"rewards/accuracies": 0.703125,
"rewards/chosen": 1.296875,
"rewards/margins": 1.140625,
"rewards/rejected": 0.1552734375,
"step": 127
},
{
"epoch": 0.1415929203539823,
"grad_norm": 15.153040885925293,
"learning_rate": 4.844078849248785e-07,
"logits/chosen": -1.30859375,
"logits/rejected": -1.3125,
"logps/chosen": -260.0,
"logps/rejected": -292.0,
"loss": 0.3964,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.421875,
"rewards/margins": 1.37890625,
"rewards/rejected": 0.0396728515625,
"step": 128
},
{
"epoch": 0.1426991150442478,
"grad_norm": 14.35177230834961,
"learning_rate": 4.840947018014182e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.19140625,
"logps/chosen": -256.5,
"logps/rejected": -251.5,
"loss": 0.4107,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 1.33203125,
"rewards/margins": 1.2734375,
"rewards/rejected": 0.060028076171875,
"step": 129
},
{
"epoch": 0.14380530973451328,
"grad_norm": 14.168734550476074,
"learning_rate": 4.837785078727948e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.19140625,
"logps/chosen": -248.0,
"logps/rejected": -284.0,
"loss": 0.3812,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 1.3984375,
"rewards/margins": 1.390625,
"rewards/rejected": 0.00927734375,
"step": 130
},
{
"epoch": 0.14491150442477876,
"grad_norm": 15.743026733398438,
"learning_rate": 4.834593072057313e-07,
"logits/chosen": -1.28125,
"logits/rejected": -1.30078125,
"logps/chosen": -246.0,
"logps/rejected": -265.0,
"loss": 0.4586,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.31640625,
"rewards/margins": 1.08984375,
"rewards/rejected": 0.2255859375,
"step": 131
},
{
"epoch": 0.14601769911504425,
"grad_norm": 16.969074249267578,
"learning_rate": 4.831371039056217e-07,
"logits/chosen": -1.20703125,
"logits/rejected": -1.1484375,
"logps/chosen": -275.0,
"logps/rejected": -296.0,
"loss": 0.4373,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 1.2109375,
"rewards/margins": 1.19921875,
"rewards/rejected": 0.0108642578125,
"step": 132
},
{
"epoch": 0.14712389380530974,
"grad_norm": 14.101773262023926,
"learning_rate": 4.828119021164786e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.296875,
"logps/chosen": -246.5,
"logps/rejected": -277.0,
"loss": 0.3919,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 1.359375,
"rewards/margins": 1.43359375,
"rewards/rejected": -0.07568359375,
"step": 133
},
{
"epoch": 0.14823008849557523,
"grad_norm": 15.83488941192627,
"learning_rate": 4.824837060208795e-07,
"logits/chosen": -1.3046875,
"logits/rejected": -1.265625,
"logps/chosen": -275.0,
"logps/rejected": -268.5,
"loss": 0.4578,
"rewards/accuracies": 0.703125,
"rewards/chosen": 1.1953125,
"rewards/margins": 0.998046875,
"rewards/rejected": 0.193359375,
"step": 134
},
{
"epoch": 0.14933628318584072,
"grad_norm": 13.669934272766113,
"learning_rate": 4.82152519839913e-07,
"logits/chosen": -1.390625,
"logits/rejected": -1.2578125,
"logps/chosen": -241.5,
"logps/rejected": -243.5,
"loss": 0.3765,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.4375,
"rewards/margins": 1.40625,
"rewards/rejected": 0.0296630859375,
"step": 135
},
{
"epoch": 0.1504424778761062,
"grad_norm": 16.85657501220703,
"learning_rate": 4.818183478331247e-07,
"logits/chosen": -1.13671875,
"logits/rejected": -1.25390625,
"logps/chosen": -257.5,
"logps/rejected": -277.5,
"loss": 0.4258,
"rewards/accuracies": 0.6953125,
"rewards/chosen": 1.390625,
"rewards/margins": 1.3125,
"rewards/rejected": 0.0772705078125,
"step": 136
},
{
"epoch": 0.1515486725663717,
"grad_norm": 15.21373462677002,
"learning_rate": 4.814811942984625e-07,
"logits/chosen": -1.29296875,
"logits/rejected": -1.1953125,
"logps/chosen": -256.5,
"logps/rejected": -240.0,
"loss": 0.4232,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.19921875,
"rewards/margins": 1.12890625,
"rewards/rejected": 0.0693359375,
"step": 137
},
{
"epoch": 0.15265486725663716,
"grad_norm": 13.69796085357666,
"learning_rate": 4.811410635722209e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.21875,
"logps/chosen": -236.5,
"logps/rejected": -257.0,
"loss": 0.3722,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.4453125,
"rewards/margins": 1.55078125,
"rewards/rejected": -0.10595703125,
"step": 138
},
{
"epoch": 0.15376106194690264,
"grad_norm": 15.000753402709961,
"learning_rate": 4.807979600289857e-07,
"logits/chosen": -1.21875,
"logits/rejected": -1.27734375,
"logps/chosen": -274.0,
"logps/rejected": -297.0,
"loss": 0.3709,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.30078125,
"rewards/margins": 1.515625,
"rewards/rejected": -0.21240234375,
"step": 139
},
{
"epoch": 0.15486725663716813,
"grad_norm": 13.44487476348877,
"learning_rate": 4.804518880815776e-07,
"logits/chosen": -1.15625,
"logits/rejected": -1.27734375,
"logps/chosen": -248.5,
"logps/rejected": -267.5,
"loss": 0.3818,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 1.37109375,
"rewards/margins": 1.515625,
"rewards/rejected": -0.144775390625,
"step": 140
},
{
"epoch": 0.15597345132743362,
"grad_norm": 15.1209135055542,
"learning_rate": 4.801028521809951e-07,
"logits/chosen": -1.21875,
"logits/rejected": -1.16796875,
"logps/chosen": -273.0,
"logps/rejected": -271.5,
"loss": 0.4027,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 1.19140625,
"rewards/margins": 1.3125,
"rewards/rejected": -0.122802734375,
"step": 141
},
{
"epoch": 0.1570796460176991,
"grad_norm": 16.363567352294922,
"learning_rate": 4.797508568163578e-07,
"logits/chosen": -1.33203125,
"logits/rejected": -1.2109375,
"logps/chosen": -262.0,
"logps/rejected": -269.0,
"loss": 0.4581,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 1.072265625,
"rewards/margins": 1.169921875,
"rewards/rejected": -0.097412109375,
"step": 142
},
{
"epoch": 0.1581858407079646,
"grad_norm": 13.670063972473145,
"learning_rate": 4.793959065148484e-07,
"logits/chosen": -1.2734375,
"logits/rejected": -1.2421875,
"logps/chosen": -240.0,
"logps/rejected": -254.5,
"loss": 0.3719,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.296875,
"rewards/margins": 1.4765625,
"rewards/rejected": -0.179443359375,
"step": 143
},
{
"epoch": 0.1592920353982301,
"grad_norm": 14.17078971862793,
"learning_rate": 4.790380058416542e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.23046875,
"logps/chosen": -240.0,
"logps/rejected": -259.5,
"loss": 0.3726,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.35546875,
"rewards/margins": 1.625,
"rewards/rejected": -0.2666015625,
"step": 144
},
{
"epoch": 0.16039823008849557,
"grad_norm": 13.858586311340332,
"learning_rate": 4.786771593999089e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.25,
"logps/chosen": -242.5,
"logps/rejected": -251.5,
"loss": 0.377,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.3515625,
"rewards/margins": 1.5078125,
"rewards/rejected": -0.154296875,
"step": 145
},
{
"epoch": 0.16150442477876106,
"grad_norm": 15.108954429626465,
"learning_rate": 4.783133718306331e-07,
"logits/chosen": -1.21875,
"logits/rejected": -1.28125,
"logps/chosen": -266.0,
"logps/rejected": -305.0,
"loss": 0.4185,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.21875,
"rewards/margins": 1.37109375,
"rewards/rejected": -0.15087890625,
"step": 146
},
{
"epoch": 0.16261061946902655,
"grad_norm": 14.861040115356445,
"learning_rate": 4.779466478126746e-07,
"logits/chosen": -1.3828125,
"logits/rejected": -1.30078125,
"logps/chosen": -242.0,
"logps/rejected": -239.5,
"loss": 0.3849,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.140625,
"rewards/margins": 1.4296875,
"rewards/rejected": -0.2919921875,
"step": 147
},
{
"epoch": 0.16371681415929204,
"grad_norm": 14.671175956726074,
"learning_rate": 4.775769920626483e-07,
"logits/chosen": -1.37890625,
"logits/rejected": -1.27734375,
"logps/chosen": -238.5,
"logps/rejected": -251.0,
"loss": 0.4109,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.1484375,
"rewards/margins": 1.27734375,
"rewards/rejected": -0.129150390625,
"step": 148
},
{
"epoch": 0.16482300884955753,
"grad_norm": 13.885614395141602,
"learning_rate": 4.772044093348757e-07,
"logits/chosen": -1.24609375,
"logits/rejected": -1.23828125,
"logps/chosen": -245.5,
"logps/rejected": -247.0,
"loss": 0.4042,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.162109375,
"rewards/margins": 1.361328125,
"rewards/rejected": -0.19720458984375,
"step": 149
},
{
"epoch": 0.16592920353982302,
"grad_norm": 15.551752090454102,
"learning_rate": 4.7682890442132336e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.2265625,
"logps/chosen": -255.0,
"logps/rejected": -252.0,
"loss": 0.415,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.095703125,
"rewards/margins": 1.37890625,
"rewards/rejected": -0.28369140625,
"step": 150
},
{
"epoch": 0.16592920353982302,
"eval_logits/chosen": -1.2870413064956665,
"eval_logits/rejected": -1.2431591749191284,
"eval_logps/chosen": -248.97512817382812,
"eval_logps/rejected": -261.86566162109375,
"eval_loss": 0.39721065759658813,
"eval_rewards/accuracies": 0.7473672032356262,
"eval_rewards/chosen": 1.184818148612976,
"eval_rewards/margins": 1.4427666664123535,
"eval_rewards/rejected": -0.25743111968040466,
"eval_runtime": 193.0648,
"eval_samples_per_second": 66.573,
"eval_steps_per_second": 1.041,
"step": 150
},
{
"epoch": 0.1670353982300885,
"grad_norm": 15.598936080932617,
"learning_rate": 4.7645048215154156e-07,
"logits/chosen": -1.3046875,
"logits/rejected": -1.2890625,
"logps/chosen": -242.0,
"logps/rejected": -260.0,
"loss": 0.4404,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.146484375,
"rewards/margins": 1.39453125,
"rewards/rejected": -0.24755859375,
"step": 151
},
{
"epoch": 0.168141592920354,
"grad_norm": 13.759398460388184,
"learning_rate": 4.760691473926021e-07,
"logits/chosen": -1.24609375,
"logits/rejected": -1.234375,
"logps/chosen": -248.5,
"logps/rejected": -269.0,
"loss": 0.3753,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.2890625,
"rewards/margins": 1.55859375,
"rewards/rejected": -0.265625,
"step": 152
},
{
"epoch": 0.16924778761061948,
"grad_norm": 17.32530975341797,
"learning_rate": 4.756849050490357e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.1640625,
"logps/chosen": -287.0,
"logps/rejected": -302.0,
"loss": 0.4487,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 1.01953125,
"rewards/margins": 1.23828125,
"rewards/rejected": -0.21826171875,
"step": 153
},
{
"epoch": 0.17035398230088494,
"grad_norm": 16.289810180664062,
"learning_rate": 4.75297760062769e-07,
"logits/chosen": -1.36328125,
"logits/rejected": -1.296875,
"logps/chosen": -271.0,
"logps/rejected": -266.5,
"loss": 0.4189,
"rewards/accuracies": 0.703125,
"rewards/chosen": 1.0703125,
"rewards/margins": 1.375,
"rewards/rejected": -0.30419921875,
"step": 154
},
{
"epoch": 0.17146017699115043,
"grad_norm": 15.245888710021973,
"learning_rate": 4.749077174130608e-07,
"logits/chosen": -1.3203125,
"logits/rejected": -1.23828125,
"logps/chosen": -264.0,
"logps/rejected": -282.0,
"loss": 0.4183,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.16015625,
"rewards/margins": 1.40234375,
"rewards/rejected": -0.240234375,
"step": 155
},
{
"epoch": 0.17256637168141592,
"grad_norm": 14.452110290527344,
"learning_rate": 4.7451478211643835e-07,
"logits/chosen": -1.39453125,
"logits/rejected": -1.30859375,
"logps/chosen": -253.0,
"logps/rejected": -256.0,
"loss": 0.3993,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.2421875,
"rewards/margins": 1.44921875,
"rewards/rejected": -0.20654296875,
"step": 156
},
{
"epoch": 0.1736725663716814,
"grad_norm": 14.378584861755371,
"learning_rate": 4.741189592266325e-07,
"logits/chosen": -1.29296875,
"logits/rejected": -1.26171875,
"logps/chosen": -231.5,
"logps/rejected": -273.5,
"loss": 0.3664,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.3125,
"rewards/margins": 1.70703125,
"rewards/rejected": -0.39453125,
"step": 157
},
{
"epoch": 0.1747787610619469,
"grad_norm": 13.193842887878418,
"learning_rate": 4.7372025383451274e-07,
"logits/chosen": -1.12109375,
"logits/rejected": -1.203125,
"logps/chosen": -240.0,
"logps/rejected": -260.0,
"loss": 0.3485,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.12109375,
"rewards/margins": 1.6171875,
"rewards/rejected": -0.4912109375,
"step": 158
},
{
"epoch": 0.17588495575221239,
"grad_norm": 13.745351791381836,
"learning_rate": 4.7331867106802204e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.1875,
"logps/chosen": -258.5,
"logps/rejected": -265.0,
"loss": 0.3891,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 1.23046875,
"rewards/margins": 1.5859375,
"rewards/rejected": -0.35546875,
"step": 159
},
{
"epoch": 0.17699115044247787,
"grad_norm": 14.0711669921875,
"learning_rate": 4.7291421609211045e-07,
"logits/chosen": -1.31640625,
"logits/rejected": -1.24609375,
"logps/chosen": -251.5,
"logps/rejected": -282.0,
"loss": 0.3999,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.1875,
"rewards/margins": 1.3984375,
"rewards/rejected": -0.20654296875,
"step": 160
},
{
"epoch": 0.17809734513274336,
"grad_norm": 13.304108619689941,
"learning_rate": 4.725068941086692e-07,
"logits/chosen": -1.37109375,
"logits/rejected": -1.1953125,
"logps/chosen": -255.5,
"logps/rejected": -262.0,
"loss": 0.3558,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.2265625,
"rewards/margins": 1.63671875,
"rewards/rejected": -0.4140625,
"step": 161
},
{
"epoch": 0.17920353982300885,
"grad_norm": 13.896252632141113,
"learning_rate": 4.7209671035646304e-07,
"logits/chosen": -1.3125,
"logits/rejected": -1.2265625,
"logps/chosen": -248.5,
"logps/rejected": -264.0,
"loss": 0.3942,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.15234375,
"rewards/margins": 1.42578125,
"rewards/rejected": -0.27294921875,
"step": 162
},
{
"epoch": 0.18030973451327434,
"grad_norm": 14.796649932861328,
"learning_rate": 4.7168367011106367e-07,
"logits/chosen": -1.30859375,
"logits/rejected": -1.30078125,
"logps/chosen": -245.0,
"logps/rejected": -262.5,
"loss": 0.3799,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 1.13671875,
"rewards/margins": 1.5703125,
"rewards/rejected": -0.431640625,
"step": 163
},
{
"epoch": 0.18141592920353983,
"grad_norm": 16.078460693359375,
"learning_rate": 4.712677786847814e-07,
"logits/chosen": -1.44140625,
"logits/rejected": -1.1875,
"logps/chosen": -243.5,
"logps/rejected": -250.0,
"loss": 0.4507,
"rewards/accuracies": 0.703125,
"rewards/chosen": 1.044921875,
"rewards/margins": 1.2265625,
"rewards/rejected": -0.18359375,
"step": 164
},
{
"epoch": 0.18252212389380532,
"grad_norm": 13.583531379699707,
"learning_rate": 4.708490414265971e-07,
"logits/chosen": -1.375,
"logits/rejected": -1.1796875,
"logps/chosen": -262.0,
"logps/rejected": -272.5,
"loss": 0.3486,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.18359375,
"rewards/margins": 1.63671875,
"rewards/rejected": -0.4521484375,
"step": 165
},
{
"epoch": 0.1836283185840708,
"grad_norm": 14.29465389251709,
"learning_rate": 4.7042746372209296e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.32421875,
"logps/chosen": -249.5,
"logps/rejected": -278.0,
"loss": 0.357,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.3046875,
"rewards/margins": 1.71875,
"rewards/rejected": -0.416015625,
"step": 166
},
{
"epoch": 0.1847345132743363,
"grad_norm": 14.11926555633545,
"learning_rate": 4.700030509933839e-07,
"logits/chosen": -1.12890625,
"logits/rejected": -1.1484375,
"logps/chosen": -235.5,
"logps/rejected": -273.0,
"loss": 0.3775,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.33984375,
"rewards/margins": 1.60546875,
"rewards/rejected": -0.2646484375,
"step": 167
},
{
"epoch": 0.18584070796460178,
"grad_norm": 13.987667083740234,
"learning_rate": 4.6957580869904707e-07,
"logits/chosen": -1.234375,
"logits/rejected": -1.140625,
"logps/chosen": -266.0,
"logps/rejected": -280.0,
"loss": 0.3593,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.15625,
"rewards/margins": 1.55078125,
"rewards/rejected": -0.3935546875,
"step": 168
},
{
"epoch": 0.18694690265486727,
"grad_norm": 14.725763320922852,
"learning_rate": 4.691457423340524e-07,
"logits/chosen": -1.2265625,
"logits/rejected": -1.076171875,
"logps/chosen": -261.0,
"logps/rejected": -248.5,
"loss": 0.3935,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 0.953125,
"rewards/margins": 1.39453125,
"rewards/rejected": -0.439453125,
"step": 169
},
{
"epoch": 0.18805309734513273,
"grad_norm": 15.593293190002441,
"learning_rate": 4.6871285742969114e-07,
"logits/chosen": -1.171875,
"logits/rejected": -1.21875,
"logps/chosen": -267.0,
"logps/rejected": -278.0,
"loss": 0.4233,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.23828125,
"rewards/margins": 1.5234375,
"rewards/rejected": -0.279296875,
"step": 170
},
{
"epoch": 0.18915929203539822,
"grad_norm": 13.978684425354004,
"learning_rate": 4.682771595535056e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.3046875,
"logps/chosen": -244.5,
"logps/rejected": -274.0,
"loss": 0.3605,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.41796875,
"rewards/margins": 1.6171875,
"rewards/rejected": -0.19677734375,
"step": 171
},
{
"epoch": 0.1902654867256637,
"grad_norm": 12.64192008972168,
"learning_rate": 4.678386543092168e-07,
"logits/chosen": -1.1953125,
"logits/rejected": -1.16015625,
"logps/chosen": -243.5,
"logps/rejected": -267.0,
"loss": 0.35,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.32421875,
"rewards/margins": 1.8515625,
"rewards/rejected": -0.52734375,
"step": 172
},
{
"epoch": 0.1913716814159292,
"grad_norm": 15.251437187194824,
"learning_rate": 4.673973473366527e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.2578125,
"logps/chosen": -252.5,
"logps/rejected": -270.5,
"loss": 0.386,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.33984375,
"rewards/margins": 1.71875,
"rewards/rejected": -0.3818359375,
"step": 173
},
{
"epoch": 0.19247787610619468,
"grad_norm": 11.346704483032227,
"learning_rate": 4.669532443116757e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.20703125,
"logps/chosen": -227.0,
"logps/rejected": -244.5,
"loss": 0.2852,
"rewards/accuracies": 0.84375,
"rewards/chosen": 1.46484375,
"rewards/margins": 2.0625,
"rewards/rejected": -0.59765625,
"step": 174
},
{
"epoch": 0.19358407079646017,
"grad_norm": 17.457523345947266,
"learning_rate": 4.6650635094610966e-07,
"logits/chosen": -1.34375,
"logits/rejected": -1.12109375,
"logps/chosen": -280.0,
"logps/rejected": -277.0,
"loss": 0.4692,
"rewards/accuracies": 0.6953125,
"rewards/chosen": 1.05859375,
"rewards/margins": 1.22265625,
"rewards/rejected": -0.166748046875,
"step": 175
},
{
"epoch": 0.19469026548672566,
"grad_norm": 14.530098915100098,
"learning_rate": 4.6605667298766607e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.21875,
"logps/chosen": -241.5,
"logps/rejected": -260.0,
"loss": 0.3907,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.17578125,
"rewards/margins": 1.67578125,
"rewards/rejected": -0.501953125,
"step": 176
},
{
"epoch": 0.19579646017699115,
"grad_norm": 15.266855239868164,
"learning_rate": 4.656042162198708e-07,
"logits/chosen": -1.43359375,
"logits/rejected": -1.3046875,
"logps/chosen": -235.0,
"logps/rejected": -265.0,
"loss": 0.4364,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.0703125,
"rewards/margins": 1.4296875,
"rewards/rejected": -0.357421875,
"step": 177
},
{
"epoch": 0.19690265486725664,
"grad_norm": 12.054651260375977,
"learning_rate": 4.6514898646198896e-07,
"logits/chosen": -1.3203125,
"logits/rejected": -1.26171875,
"logps/chosen": -257.0,
"logps/rejected": -271.0,
"loss": 0.3194,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.2734375,
"rewards/margins": 1.91015625,
"rewards/rejected": -0.6328125,
"step": 178
},
{
"epoch": 0.19800884955752213,
"grad_norm": 15.28715705871582,
"learning_rate": 4.6469098956895076e-07,
"logits/chosen": -1.3125,
"logits/rejected": -1.22265625,
"logps/chosen": -265.5,
"logps/rejected": -277.0,
"loss": 0.3848,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.28125,
"rewards/margins": 1.65234375,
"rewards/rejected": -0.373046875,
"step": 179
},
{
"epoch": 0.19911504424778761,
"grad_norm": 14.788736343383789,
"learning_rate": 4.6423023143127557e-07,
"logits/chosen": -1.2265625,
"logits/rejected": -1.3984375,
"logps/chosen": -252.0,
"logps/rejected": -272.0,
"loss": 0.3994,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.0859375,
"rewards/margins": 1.5,
"rewards/rejected": -0.4150390625,
"step": 180
},
{
"epoch": 0.2002212389380531,
"grad_norm": 14.42548942565918,
"learning_rate": 4.637667179749968e-07,
"logits/chosen": -1.23046875,
"logits/rejected": -1.21484375,
"logps/chosen": -272.5,
"logps/rejected": -274.5,
"loss": 0.3871,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.0390625,
"rewards/margins": 1.515625,
"rewards/rejected": -0.48046875,
"step": 181
},
{
"epoch": 0.2013274336283186,
"grad_norm": 13.830480575561523,
"learning_rate": 4.63300455161585e-07,
"logits/chosen": -1.29296875,
"logits/rejected": -1.19140625,
"logps/chosen": -250.0,
"logps/rejected": -248.0,
"loss": 0.3167,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.34375,
"rewards/margins": 1.92578125,
"rewards/rejected": -0.5859375,
"step": 182
},
{
"epoch": 0.20243362831858408,
"grad_norm": 14.639776229858398,
"learning_rate": 4.6283144898787174e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.20703125,
"logps/chosen": -247.5,
"logps/rejected": -279.0,
"loss": 0.3672,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.32421875,
"rewards/margins": 1.79296875,
"rewards/rejected": -0.46484375,
"step": 183
},
{
"epoch": 0.20353982300884957,
"grad_norm": 13.662202835083008,
"learning_rate": 4.6235970548597224e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.234375,
"logps/chosen": -231.0,
"logps/rejected": -240.0,
"loss": 0.3531,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.28125,
"rewards/margins": 1.78515625,
"rewards/rejected": -0.505859375,
"step": 184
},
{
"epoch": 0.20464601769911506,
"grad_norm": 13.101706504821777,
"learning_rate": 4.6188523072320777e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.14453125,
"logps/chosen": -253.0,
"logps/rejected": -273.0,
"loss": 0.3276,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.2421875,
"rewards/margins": 1.82421875,
"rewards/rejected": -0.5830078125,
"step": 185
},
{
"epoch": 0.20575221238938052,
"grad_norm": 16.33759307861328,
"learning_rate": 4.614080308020277e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.2265625,
"logps/chosen": -258.0,
"logps/rejected": -290.0,
"loss": 0.3694,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.203125,
"rewards/margins": 1.69140625,
"rewards/rejected": -0.48828125,
"step": 186
},
{
"epoch": 0.206858407079646,
"grad_norm": 13.627776145935059,
"learning_rate": 4.609281118599311e-07,
"logits/chosen": -1.3046875,
"logits/rejected": -1.21875,
"logps/chosen": -238.5,
"logps/rejected": -239.0,
"loss": 0.4007,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 1.056640625,
"rewards/margins": 1.55078125,
"rewards/rejected": -0.4931640625,
"step": 187
},
{
"epoch": 0.2079646017699115,
"grad_norm": 13.673922538757324,
"learning_rate": 4.6044548006938734e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.1796875,
"logps/chosen": -247.5,
"logps/rejected": -254.5,
"loss": 0.3592,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.29296875,
"rewards/margins": 1.7265625,
"rewards/rejected": -0.4375,
"step": 188
},
{
"epoch": 0.20907079646017698,
"grad_norm": 14.20157527923584,
"learning_rate": 4.5996014163775745e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.26953125,
"logps/chosen": -268.5,
"logps/rejected": -272.0,
"loss": 0.3429,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.32421875,
"rewards/margins": 1.84765625,
"rewards/rejected": -0.5234375,
"step": 189
},
{
"epoch": 0.21017699115044247,
"grad_norm": 14.90439510345459,
"learning_rate": 4.5947210280721353e-07,
"logits/chosen": -1.34375,
"logits/rejected": -1.2421875,
"logps/chosen": -248.0,
"logps/rejected": -285.0,
"loss": 0.373,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 1.203125,
"rewards/margins": 1.765625,
"rewards/rejected": -0.5625,
"step": 190
},
{
"epoch": 0.21128318584070796,
"grad_norm": 14.063448905944824,
"learning_rate": 4.589813698546592e-07,
"logits/chosen": -1.34375,
"logits/rejected": -1.1328125,
"logps/chosen": -256.0,
"logps/rejected": -274.0,
"loss": 0.3471,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.21875,
"rewards/margins": 1.828125,
"rewards/rejected": -0.607421875,
"step": 191
},
{
"epoch": 0.21238938053097345,
"grad_norm": 13.391234397888184,
"learning_rate": 4.584879490916481e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.234375,
"logps/chosen": -247.5,
"logps/rejected": -241.5,
"loss": 0.356,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.0390625,
"rewards/margins": 1.9296875,
"rewards/rejected": -0.892578125,
"step": 192
},
{
"epoch": 0.21349557522123894,
"grad_norm": 13.415105819702148,
"learning_rate": 4.5799184686430343e-07,
"logits/chosen": -1.23046875,
"logits/rejected": -1.09375,
"logps/chosen": -251.0,
"logps/rejected": -257.5,
"loss": 0.34,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.1875,
"rewards/margins": 1.87109375,
"rewards/rejected": -0.689453125,
"step": 193
},
{
"epoch": 0.21460176991150443,
"grad_norm": 13.00170612335205,
"learning_rate": 4.574930695532356e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.32421875,
"logps/chosen": -257.0,
"logps/rejected": -273.0,
"loss": 0.3455,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.203125,
"rewards/margins": 1.82421875,
"rewards/rejected": -0.623046875,
"step": 194
},
{
"epoch": 0.2157079646017699,
"grad_norm": 13.366878509521484,
"learning_rate": 4.569916235734611e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.19921875,
"logps/chosen": -240.5,
"logps/rejected": -272.0,
"loss": 0.3792,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 1.2109375,
"rewards/margins": 1.7578125,
"rewards/rejected": -0.544921875,
"step": 195
},
{
"epoch": 0.2168141592920354,
"grad_norm": 14.402266502380371,
"learning_rate": 4.5648751537431897e-07,
"logits/chosen": -1.21484375,
"logits/rejected": -1.171875,
"logps/chosen": -250.5,
"logps/rejected": -286.0,
"loss": 0.428,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.9453125,
"rewards/margins": 1.3984375,
"rewards/rejected": -0.4560546875,
"step": 196
},
{
"epoch": 0.2179203539823009,
"grad_norm": 15.003867149353027,
"learning_rate": 4.559807514393885e-07,
"logits/chosen": -1.30859375,
"logits/rejected": -1.2421875,
"logps/chosen": -276.5,
"logps/rejected": -286.0,
"loss": 0.35,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.296875,
"rewards/margins": 1.87109375,
"rewards/rejected": -0.5810546875,
"step": 197
},
{
"epoch": 0.21902654867256638,
"grad_norm": 14.217790603637695,
"learning_rate": 4.5547133828640595e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.24609375,
"logps/chosen": -267.0,
"logps/rejected": -266.0,
"loss": 0.3393,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.27734375,
"rewards/margins": 1.83203125,
"rewards/rejected": -0.5556640625,
"step": 198
},
{
"epoch": 0.22013274336283187,
"grad_norm": 13.773700714111328,
"learning_rate": 4.5495928246717995e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.28125,
"logps/chosen": -265.0,
"logps/rejected": -292.0,
"loss": 0.3351,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.375,
"rewards/margins": 2.0546875,
"rewards/rejected": -0.67578125,
"step": 199
},
{
"epoch": 0.22123893805309736,
"grad_norm": 14.733463287353516,
"learning_rate": 4.544445905675081e-07,
"logits/chosen": -1.265625,
"logits/rejected": -1.18359375,
"logps/chosen": -266.0,
"logps/rejected": -281.5,
"loss": 0.3673,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.0859375,
"rewards/margins": 1.7265625,
"rewards/rejected": -0.642578125,
"step": 200
},
{
"epoch": 0.22123893805309736,
"eval_logits/chosen": -1.276119351387024,
"eval_logits/rejected": -1.2190414667129517,
"eval_logps/chosen": -249.2039794921875,
"eval_logps/rejected": -265.0248718261719,
"eval_loss": 0.37490636110305786,
"eval_rewards/accuracies": 0.7651365399360657,
"eval_rewards/chosen": 1.1634211540222168,
"eval_rewards/margins": 1.7447527647018433,
"eval_rewards/rejected": -0.5807631611824036,
"eval_runtime": 192.9266,
"eval_samples_per_second": 66.621,
"eval_steps_per_second": 1.042,
"step": 200
},
{
"epoch": 0.22234513274336284,
"grad_norm": 14.183818817138672,
"learning_rate": 4.539272692070919e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.203125,
"logps/chosen": -270.0,
"logps/rejected": -238.5,
"loss": 0.3398,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.2421875,
"rewards/margins": 1.84375,
"rewards/rejected": -0.6015625,
"step": 201
},
{
"epoch": 0.2234513274336283,
"grad_norm": 14.671875953674316,
"learning_rate": 4.534073250394515e-07,
"logits/chosen": -1.40625,
"logits/rejected": -1.25390625,
"logps/chosen": -245.5,
"logps/rejected": -261.5,
"loss": 0.4247,
"rewards/accuracies": 0.6796875,
"rewards/chosen": 0.904296875,
"rewards/margins": 1.375,
"rewards/rejected": -0.47265625,
"step": 202
},
{
"epoch": 0.2245575221238938,
"grad_norm": 14.409346580505371,
"learning_rate": 4.5288476475184025e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.15234375,
"logps/chosen": -251.5,
"logps/rejected": -259.0,
"loss": 0.3738,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 1.10546875,
"rewards/margins": 1.73046875,
"rewards/rejected": -0.623046875,
"step": 203
},
{
"epoch": 0.22566371681415928,
"grad_norm": 16.879392623901367,
"learning_rate": 4.523595950651587e-07,
"logits/chosen": -1.21484375,
"logits/rejected": -1.25390625,
"logps/chosen": -272.0,
"logps/rejected": -281.0,
"loss": 0.4152,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.0546875,
"rewards/margins": 1.640625,
"rewards/rejected": -0.58203125,
"step": 204
},
{
"epoch": 0.22676991150442477,
"grad_norm": 13.093546867370605,
"learning_rate": 4.518318227338681e-07,
"logits/chosen": -1.20703125,
"logits/rejected": -1.1953125,
"logps/chosen": -272.0,
"logps/rejected": -275.0,
"loss": 0.3398,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.07421875,
"rewards/margins": 1.71484375,
"rewards/rejected": -0.640625,
"step": 205
},
{
"epoch": 0.22787610619469026,
"grad_norm": 14.883780479431152,
"learning_rate": 4.5130145454590374e-07,
"logits/chosen": -1.28515625,
"logits/rejected": -1.140625,
"logps/chosen": -247.0,
"logps/rejected": -279.0,
"loss": 0.3714,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.12890625,
"rewards/margins": 1.84375,
"rewards/rejected": -0.71875,
"step": 206
},
{
"epoch": 0.22898230088495575,
"grad_norm": 13.462334632873535,
"learning_rate": 4.5076849732258737e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.1953125,
"logps/chosen": -233.0,
"logps/rejected": -231.5,
"loss": 0.3624,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.2578125,
"rewards/margins": 1.75,
"rewards/rejected": -0.4931640625,
"step": 207
},
{
"epoch": 0.23008849557522124,
"grad_norm": 13.485892295837402,
"learning_rate": 4.5023295791853937e-07,
"logits/chosen": -1.30859375,
"logits/rejected": -1.25390625,
"logps/chosen": -243.0,
"logps/rejected": -284.0,
"loss": 0.3465,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.36328125,
"rewards/margins": 1.86328125,
"rewards/rejected": -0.4951171875,
"step": 208
},
{
"epoch": 0.23119469026548672,
"grad_norm": 13.468306541442871,
"learning_rate": 4.496948432215912e-07,
"logits/chosen": -1.2734375,
"logits/rejected": -1.14453125,
"logps/chosen": -239.0,
"logps/rejected": -231.5,
"loss": 0.3881,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.07421875,
"rewards/margins": 1.6484375,
"rewards/rejected": -0.576171875,
"step": 209
},
{
"epoch": 0.2323008849557522,
"grad_norm": 14.274983406066895,
"learning_rate": 4.4915416015269614e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.265625,
"logps/chosen": -271.0,
"logps/rejected": -279.5,
"loss": 0.3449,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.24609375,
"rewards/margins": 1.875,
"rewards/rejected": -0.6298828125,
"step": 210
},
{
"epoch": 0.2334070796460177,
"grad_norm": 14.726081848144531,
"learning_rate": 4.486109156658405e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.30078125,
"logps/chosen": -223.0,
"logps/rejected": -258.0,
"loss": 0.3548,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.26953125,
"rewards/margins": 1.83203125,
"rewards/rejected": -0.5556640625,
"step": 211
},
{
"epoch": 0.2345132743362832,
"grad_norm": 14.424053192138672,
"learning_rate": 4.480651167479544e-07,
"logits/chosen": -1.328125,
"logits/rejected": -1.18359375,
"logps/chosen": -235.5,
"logps/rejected": -251.0,
"loss": 0.3725,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 1.3125,
"rewards/margins": 1.7109375,
"rewards/rejected": -0.3974609375,
"step": 212
},
{
"epoch": 0.23561946902654868,
"grad_norm": 16.50137710571289,
"learning_rate": 4.475167704188218e-07,
"logits/chosen": -1.28515625,
"logits/rejected": -1.33203125,
"logps/chosen": -261.0,
"logps/rejected": -274.0,
"loss": 0.4309,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.0625,
"rewards/margins": 1.57421875,
"rewards/rejected": -0.513671875,
"step": 213
},
{
"epoch": 0.23672566371681417,
"grad_norm": 13.223847389221191,
"learning_rate": 4.4696588373098973e-07,
"logits/chosen": -1.203125,
"logits/rejected": -1.26953125,
"logps/chosen": -246.0,
"logps/rejected": -262.5,
"loss": 0.3152,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.40234375,
"rewards/margins": 2.12109375,
"rewards/rejected": -0.7177734375,
"step": 214
},
{
"epoch": 0.23783185840707965,
"grad_norm": 15.553281784057617,
"learning_rate": 4.4641246376967854e-07,
"logits/chosen": -1.19140625,
"logits/rejected": -1.1640625,
"logps/chosen": -256.5,
"logps/rejected": -271.0,
"loss": 0.3849,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.203125,
"rewards/margins": 1.73046875,
"rewards/rejected": -0.52734375,
"step": 215
},
{
"epoch": 0.23893805309734514,
"grad_norm": 14.652776718139648,
"learning_rate": 4.4585651765268983e-07,
"logits/chosen": -1.21484375,
"logits/rejected": -1.16796875,
"logps/chosen": -249.0,
"logps/rejected": -240.0,
"loss": 0.394,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.15625,
"rewards/margins": 1.73828125,
"rewards/rejected": -0.5859375,
"step": 216
},
{
"epoch": 0.24004424778761063,
"grad_norm": 15.165270805358887,
"learning_rate": 4.452980525303155e-07,
"logits/chosen": -1.29296875,
"logits/rejected": -1.22265625,
"logps/chosen": -272.5,
"logps/rejected": -269.0,
"loss": 0.3583,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.25,
"rewards/margins": 1.81640625,
"rewards/rejected": -0.56640625,
"step": 217
},
{
"epoch": 0.2411504424778761,
"grad_norm": 13.010436058044434,
"learning_rate": 4.4473707558524553e-07,
"logits/chosen": -1.41796875,
"logits/rejected": -1.1640625,
"logps/chosen": -248.5,
"logps/rejected": -276.0,
"loss": 0.3244,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.30859375,
"rewards/margins": 2.05859375,
"rewards/rejected": -0.751953125,
"step": 218
},
{
"epoch": 0.24225663716814158,
"grad_norm": 14.902968406677246,
"learning_rate": 4.4417359403247567e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.10546875,
"logps/chosen": -255.0,
"logps/rejected": -276.0,
"loss": 0.3569,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.453125,
"rewards/margins": 2.0234375,
"rewards/rejected": -0.568359375,
"step": 219
},
{
"epoch": 0.24336283185840707,
"grad_norm": 13.878840446472168,
"learning_rate": 4.436076151192146e-07,
"logits/chosen": -1.33203125,
"logits/rejected": -1.26953125,
"logps/chosen": -218.0,
"logps/rejected": -246.5,
"loss": 0.3976,
"rewards/accuracies": 0.703125,
"rewards/chosen": 1.1484375,
"rewards/margins": 1.8046875,
"rewards/rejected": -0.654296875,
"step": 220
},
{
"epoch": 0.24446902654867256,
"grad_norm": 13.981918334960938,
"learning_rate": 4.4303914612479104e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.25,
"logps/chosen": -237.0,
"logps/rejected": -273.0,
"loss": 0.3427,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.36328125,
"rewards/margins": 2.0390625,
"rewards/rejected": -0.673828125,
"step": 221
},
{
"epoch": 0.24557522123893805,
"grad_norm": 12.754227638244629,
"learning_rate": 4.4246819436055946e-07,
"logits/chosen": -1.234375,
"logits/rejected": -1.140625,
"logps/chosen": -248.0,
"logps/rejected": -250.0,
"loss": 0.3383,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.109375,
"rewards/margins": 1.87890625,
"rewards/rejected": -0.76171875,
"step": 222
},
{
"epoch": 0.24668141592920353,
"grad_norm": 14.764009475708008,
"learning_rate": 4.418947671698066e-07,
"logits/chosen": -1.34375,
"logits/rejected": -1.234375,
"logps/chosen": -250.5,
"logps/rejected": -266.0,
"loss": 0.3845,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.060546875,
"rewards/margins": 1.7265625,
"rewards/rejected": -0.66015625,
"step": 223
},
{
"epoch": 0.24778761061946902,
"grad_norm": 15.158235549926758,
"learning_rate": 4.4131887192765684e-07,
"logits/chosen": -1.28515625,
"logits/rejected": -1.26953125,
"logps/chosen": -244.0,
"logps/rejected": -265.0,
"loss": 0.3368,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.19140625,
"rewards/margins": 2.09375,
"rewards/rejected": -0.904296875,
"step": 224
},
{
"epoch": 0.2488938053097345,
"grad_norm": 13.44605827331543,
"learning_rate": 4.4074051604097753e-07,
"logits/chosen": -1.265625,
"logits/rejected": -1.26171875,
"logps/chosen": -248.0,
"logps/rejected": -269.0,
"loss": 0.3464,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.28515625,
"rewards/margins": 2.03125,
"rewards/rejected": -0.744140625,
"step": 225
},
{
"epoch": 0.25,
"grad_norm": 15.778076171875,
"learning_rate": 4.401597069482832e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.26171875,
"logps/chosen": -248.5,
"logps/rejected": -265.0,
"loss": 0.4139,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 0.962890625,
"rewards/margins": 1.66015625,
"rewards/rejected": -0.697265625,
"step": 226
},
{
"epoch": 0.25110619469026546,
"grad_norm": 13.870752334594727,
"learning_rate": 4.395764521196406e-07,
"logits/chosen": -1.26171875,
"logits/rejected": -1.2265625,
"logps/chosen": -234.5,
"logps/rejected": -281.0,
"loss": 0.3158,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.3984375,
"rewards/margins": 2.2734375,
"rewards/rejected": -0.87109375,
"step": 227
},
{
"epoch": 0.252212389380531,
"grad_norm": 13.615601539611816,
"learning_rate": 4.389907590565721e-07,
"logits/chosen": -1.1796875,
"logits/rejected": -1.15234375,
"logps/chosen": -268.0,
"logps/rejected": -290.0,
"loss": 0.3724,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.087890625,
"rewards/margins": 1.91796875,
"rewards/rejected": -0.830078125,
"step": 228
},
{
"epoch": 0.25331858407079644,
"grad_norm": 14.186120986938477,
"learning_rate": 4.3840263529195943e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.2109375,
"logps/chosen": -248.5,
"logps/rejected": -262.0,
"loss": 0.3415,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.072265625,
"rewards/margins": 1.9375,
"rewards/rejected": -0.865234375,
"step": 229
},
{
"epoch": 0.25442477876106195,
"grad_norm": 12.267884254455566,
"learning_rate": 4.3781208838994663e-07,
"logits/chosen": -1.296875,
"logits/rejected": -1.23828125,
"logps/chosen": -246.0,
"logps/rejected": -257.5,
"loss": 0.3271,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.15234375,
"rewards/margins": 1.97265625,
"rewards/rejected": -0.8203125,
"step": 230
},
{
"epoch": 0.2555309734513274,
"grad_norm": 14.3861722946167,
"learning_rate": 4.372191259458432e-07,
"logits/chosen": -1.3515625,
"logits/rejected": -1.21875,
"logps/chosen": -234.5,
"logps/rejected": -251.0,
"loss": 0.3735,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.111328125,
"rewards/margins": 1.92578125,
"rewards/rejected": -0.81640625,
"step": 231
},
{
"epoch": 0.25663716814159293,
"grad_norm": 13.046867370605469,
"learning_rate": 4.366237555860256e-07,
"logits/chosen": -1.35546875,
"logits/rejected": -1.2109375,
"logps/chosen": -246.0,
"logps/rejected": -270.0,
"loss": 0.3317,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.16015625,
"rewards/margins": 2.109375,
"rewards/rejected": -0.9453125,
"step": 232
},
{
"epoch": 0.2577433628318584,
"grad_norm": 15.247108459472656,
"learning_rate": 4.3602598496784013e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.140625,
"logps/chosen": -272.0,
"logps/rejected": -268.0,
"loss": 0.3798,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.9765625,
"rewards/margins": 1.875,
"rewards/rejected": -0.896484375,
"step": 233
},
{
"epoch": 0.2588495575221239,
"grad_norm": 13.2136812210083,
"learning_rate": 4.3542582177950373e-07,
"logits/chosen": -1.21484375,
"logits/rejected": -1.1484375,
"logps/chosen": -227.5,
"logps/rejected": -262.5,
"loss": 0.3171,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.26953125,
"rewards/margins": 2.1015625,
"rewards/rejected": -0.822265625,
"step": 234
},
{
"epoch": 0.25995575221238937,
"grad_norm": 13.574021339416504,
"learning_rate": 4.348232737400054e-07,
"logits/chosen": -1.13671875,
"logits/rejected": -1.171875,
"logps/chosen": -239.0,
"logps/rejected": -267.0,
"loss": 0.3749,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.05859375,
"rewards/margins": 1.78125,
"rewards/rejected": -0.720703125,
"step": 235
},
{
"epoch": 0.2610619469026549,
"grad_norm": 13.393758773803711,
"learning_rate": 4.3421834859900685e-07,
"logits/chosen": -1.265625,
"logits/rejected": -1.140625,
"logps/chosen": -236.5,
"logps/rejected": -255.0,
"loss": 0.3454,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.1796875,
"rewards/margins": 2.140625,
"rewards/rejected": -0.962890625,
"step": 236
},
{
"epoch": 0.26216814159292035,
"grad_norm": 17.910938262939453,
"learning_rate": 4.336110541367428e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.19921875,
"logps/chosen": -245.5,
"logps/rejected": -272.0,
"loss": 0.4424,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.943359375,
"rewards/margins": 1.58984375,
"rewards/rejected": -0.646484375,
"step": 237
},
{
"epoch": 0.26327433628318586,
"grad_norm": 14.35909366607666,
"learning_rate": 4.33001398163921e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.2265625,
"logps/chosen": -243.5,
"logps/rejected": -260.5,
"loss": 0.3525,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.30078125,
"rewards/margins": 2.2109375,
"rewards/rejected": -0.91015625,
"step": 238
},
{
"epoch": 0.2643805309734513,
"grad_norm": 15.5848970413208,
"learning_rate": 4.3238938852162187e-07,
"logits/chosen": -1.1875,
"logits/rejected": -1.3046875,
"logps/chosen": -250.5,
"logps/rejected": -273.0,
"loss": 0.3839,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 1.1640625,
"rewards/margins": 2.03125,
"rewards/rejected": -0.861328125,
"step": 239
},
{
"epoch": 0.26548672566371684,
"grad_norm": 13.962175369262695,
"learning_rate": 4.317750330811972e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.328125,
"logps/chosen": -250.5,
"logps/rejected": -275.0,
"loss": 0.3394,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.1875,
"rewards/margins": 1.91015625,
"rewards/rejected": -0.72265625,
"step": 240
},
{
"epoch": 0.2665929203539823,
"grad_norm": 13.130892753601074,
"learning_rate": 4.311583397441696e-07,
"logits/chosen": -1.18359375,
"logits/rejected": -1.26171875,
"logps/chosen": -240.0,
"logps/rejected": -255.5,
"loss": 0.3364,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.25,
"rewards/margins": 2.203125,
"rewards/rejected": -0.9453125,
"step": 241
},
{
"epoch": 0.2676991150442478,
"grad_norm": 15.227952003479004,
"learning_rate": 4.3053931644213e-07,
"logits/chosen": -1.23046875,
"logits/rejected": -1.1640625,
"logps/chosen": -261.0,
"logps/rejected": -269.5,
"loss": 0.4343,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.15234375,
"rewards/margins": 1.65234375,
"rewards/rejected": -0.4970703125,
"step": 242
},
{
"epoch": 0.2688053097345133,
"grad_norm": 11.86292552947998,
"learning_rate": 4.2991797113663676e-07,
"logits/chosen": -1.24609375,
"logits/rejected": -1.1875,
"logps/chosen": -239.5,
"logps/rejected": -268.0,
"loss": 0.2865,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 1.3125,
"rewards/margins": 2.296875,
"rewards/rejected": -0.98046875,
"step": 243
},
{
"epoch": 0.26991150442477874,
"grad_norm": 12.915170669555664,
"learning_rate": 4.292943118191121e-07,
"logits/chosen": -1.19140625,
"logits/rejected": -1.23828125,
"logps/chosen": -243.0,
"logps/rejected": -257.0,
"loss": 0.3192,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.26953125,
"rewards/margins": 2.1328125,
"rewards/rejected": -0.869140625,
"step": 244
},
{
"epoch": 0.27101769911504425,
"grad_norm": 16.35938262939453,
"learning_rate": 4.2866834651074024e-07,
"logits/chosen": -1.16015625,
"logits/rejected": -1.12890625,
"logps/chosen": -283.0,
"logps/rejected": -308.0,
"loss": 0.3896,
"rewards/accuracies": 0.734375,
"rewards/chosen": 1.19921875,
"rewards/margins": 1.8046875,
"rewards/rejected": -0.607421875,
"step": 245
},
{
"epoch": 0.2721238938053097,
"grad_norm": 14.654645919799805,
"learning_rate": 4.280400832623636e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.10546875,
"logps/chosen": -269.5,
"logps/rejected": -273.0,
"loss": 0.3785,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 1.234375,
"rewards/margins": 1.95703125,
"rewards/rejected": -0.71484375,
"step": 246
},
{
"epoch": 0.27323008849557523,
"grad_norm": 12.577658653259277,
"learning_rate": 4.274095301543796e-07,
"logits/chosen": -1.4140625,
"logits/rejected": -1.234375,
"logps/chosen": -222.5,
"logps/rejected": -252.0,
"loss": 0.3402,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.33984375,
"rewards/margins": 2.06640625,
"rewards/rejected": -0.73046875,
"step": 247
},
{
"epoch": 0.2743362831858407,
"grad_norm": 13.634322166442871,
"learning_rate": 4.2677669529663686e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.1640625,
"logps/chosen": -266.0,
"logps/rejected": -267.5,
"loss": 0.3221,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.41015625,
"rewards/margins": 2.2734375,
"rewards/rejected": -0.87109375,
"step": 248
},
{
"epoch": 0.2754424778761062,
"grad_norm": 14.120111465454102,
"learning_rate": 4.2614158682833037e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.1328125,
"logps/chosen": -251.0,
"logps/rejected": -281.0,
"loss": 0.3739,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.32421875,
"rewards/margins": 1.90625,
"rewards/rejected": -0.5791015625,
"step": 249
},
{
"epoch": 0.27654867256637167,
"grad_norm": 14.189047813415527,
"learning_rate": 4.255042129178973e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.21484375,
"logps/chosen": -237.0,
"logps/rejected": -268.0,
"loss": 0.3868,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.14453125,
"rewards/margins": 1.9375,
"rewards/rejected": -0.794921875,
"step": 250
},
{
"epoch": 0.27654867256637167,
"eval_logits/chosen": -1.2774020433425903,
"eval_logits/rejected": -1.207769751548767,
"eval_logps/chosen": -248.95523071289062,
"eval_logps/rejected": -267.1990051269531,
"eval_loss": 0.36062541604042053,
"eval_rewards/accuracies": 0.7771241068840027,
"eval_rewards/chosen": 1.1926889419555664,
"eval_rewards/margins": 1.988767147064209,
"eval_rewards/rejected": -0.7957963943481445,
"eval_runtime": 193.0793,
"eval_samples_per_second": 66.568,
"eval_steps_per_second": 1.041,
"step": 250
},
{
"epoch": 0.2776548672566372,
"grad_norm": 14.157464981079102,
"learning_rate": 4.248645817629117e-07,
"logits/chosen": -1.40625,
"logits/rejected": -1.26171875,
"logps/chosen": -262.0,
"logps/rejected": -279.0,
"loss": 0.3588,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.033203125,
"rewards/margins": 1.91796875,
"rewards/rejected": -0.88671875,
"step": 251
},
{
"epoch": 0.27876106194690264,
"grad_norm": 12.822221755981445,
"learning_rate": 4.242227015899793e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.19140625,
"logps/chosen": -245.5,
"logps/rejected": -273.0,
"loss": 0.3323,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.41796875,
"rewards/margins": 2.2734375,
"rewards/rejected": -0.853515625,
"step": 252
},
{
"epoch": 0.27986725663716816,
"grad_norm": 15.107699394226074,
"learning_rate": 4.2357858065463124e-07,
"logits/chosen": -1.3046875,
"logits/rejected": -1.13671875,
"logps/chosen": -243.5,
"logps/rejected": -275.0,
"loss": 0.4063,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.2421875,
"rewards/margins": 1.88671875,
"rewards/rejected": -0.642578125,
"step": 253
},
{
"epoch": 0.2809734513274336,
"grad_norm": 14.644704818725586,
"learning_rate": 4.229322272412185e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.203125,
"logps/chosen": -274.0,
"logps/rejected": -289.0,
"loss": 0.3511,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.009765625,
"rewards/margins": 1.98046875,
"rewards/rejected": -0.970703125,
"step": 254
},
{
"epoch": 0.28207964601769914,
"grad_norm": 14.453044891357422,
"learning_rate": 4.222836496628047e-07,
"logits/chosen": -1.31640625,
"logits/rejected": -1.21875,
"logps/chosen": -264.0,
"logps/rejected": -286.0,
"loss": 0.3342,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.21484375,
"rewards/margins": 1.9375,
"rewards/rejected": -0.72265625,
"step": 255
},
{
"epoch": 0.2831858407079646,
"grad_norm": 12.731569290161133,
"learning_rate": 4.216328562610599e-07,
"logits/chosen": -1.33203125,
"logits/rejected": -1.2109375,
"logps/chosen": -231.5,
"logps/rejected": -262.5,
"loss": 0.3542,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.06640625,
"rewards/margins": 2.12890625,
"rewards/rejected": -1.064453125,
"step": 256
},
{
"epoch": 0.2842920353982301,
"grad_norm": 14.310720443725586,
"learning_rate": 4.209798554061527e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.21484375,
"logps/chosen": -258.0,
"logps/rejected": -282.0,
"loss": 0.3884,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 0.966796875,
"rewards/margins": 1.828125,
"rewards/rejected": -0.861328125,
"step": 257
},
{
"epoch": 0.2853982300884956,
"grad_norm": 14.716500282287598,
"learning_rate": 4.203246554966428e-07,
"logits/chosen": -1.1796875,
"logits/rejected": -1.3046875,
"logps/chosen": -243.0,
"logps/rejected": -253.0,
"loss": 0.4139,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.962890625,
"rewards/margins": 1.71484375,
"rewards/rejected": -0.75,
"step": 258
},
{
"epoch": 0.28650442477876104,
"grad_norm": 14.436864852905273,
"learning_rate": 4.1966726495937305e-07,
"logits/chosen": -1.42578125,
"logits/rejected": -1.20703125,
"logps/chosen": -252.5,
"logps/rejected": -273.0,
"loss": 0.3439,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.08984375,
"rewards/margins": 1.95703125,
"rewards/rejected": -0.86328125,
"step": 259
},
{
"epoch": 0.28761061946902655,
"grad_norm": 15.182847023010254,
"learning_rate": 4.1900769224936124e-07,
"logits/chosen": -1.1953125,
"logits/rejected": -1.19140625,
"logps/chosen": -286.0,
"logps/rejected": -310.0,
"loss": 0.3774,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 0.96875,
"rewards/margins": 1.99609375,
"rewards/rejected": -1.03125,
"step": 260
},
{
"epoch": 0.288716814159292,
"grad_norm": 13.360356330871582,
"learning_rate": 4.1834594584969077e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.2109375,
"logps/chosen": -248.5,
"logps/rejected": -266.0,
"loss": 0.3638,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.115234375,
"rewards/margins": 1.9140625,
"rewards/rejected": -0.802734375,
"step": 261
},
{
"epoch": 0.28982300884955753,
"grad_norm": 13.982027053833008,
"learning_rate": 4.176820342714022e-07,
"logits/chosen": -1.39453125,
"logits/rejected": -1.26171875,
"logps/chosen": -259.0,
"logps/rejected": -281.0,
"loss": 0.3449,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 1.068359375,
"rewards/margins": 1.99609375,
"rewards/rejected": -0.921875,
"step": 262
},
{
"epoch": 0.290929203539823,
"grad_norm": 13.159867286682129,
"learning_rate": 4.1701596605338334e-07,
"logits/chosen": -1.40234375,
"logits/rejected": -1.234375,
"logps/chosen": -242.5,
"logps/rejected": -271.0,
"loss": 0.3395,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.25,
"rewards/margins": 1.99609375,
"rewards/rejected": -0.751953125,
"step": 263
},
{
"epoch": 0.2920353982300885,
"grad_norm": 12.9893798828125,
"learning_rate": 4.1634774976225965e-07,
"logits/chosen": -1.35546875,
"logits/rejected": -1.2109375,
"logps/chosen": -234.5,
"logps/rejected": -277.0,
"loss": 0.3156,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.171875,
"rewards/margins": 2.2421875,
"rewards/rejected": -1.0703125,
"step": 264
},
{
"epoch": 0.29314159292035397,
"grad_norm": 13.78116226196289,
"learning_rate": 4.15677393992284e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.2578125,
"logps/chosen": -253.5,
"logps/rejected": -279.0,
"loss": 0.3418,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.20703125,
"rewards/margins": 2.171875,
"rewards/rejected": -0.966796875,
"step": 265
},
{
"epoch": 0.2942477876106195,
"grad_norm": 14.935332298278809,
"learning_rate": 4.150049073652261e-07,
"logits/chosen": -1.31640625,
"logits/rejected": -1.15625,
"logps/chosen": -265.0,
"logps/rejected": -291.0,
"loss": 0.3503,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.21875,
"rewards/margins": 2.1640625,
"rewards/rejected": -0.94140625,
"step": 266
},
{
"epoch": 0.29535398230088494,
"grad_norm": 15.937322616577148,
"learning_rate": 4.1433029853026163e-07,
"logits/chosen": -1.31640625,
"logits/rejected": -1.203125,
"logps/chosen": -245.5,
"logps/rejected": -294.0,
"loss": 0.3923,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.0625,
"rewards/margins": 1.91796875,
"rewards/rejected": -0.85546875,
"step": 267
},
{
"epoch": 0.29646017699115046,
"grad_norm": 14.759867668151855,
"learning_rate": 4.136535761638611e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.09375,
"logps/chosen": -276.0,
"logps/rejected": -295.0,
"loss": 0.356,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 0.98828125,
"rewards/margins": 1.98828125,
"rewards/rejected": -1.001953125,
"step": 268
},
{
"epoch": 0.2975663716814159,
"grad_norm": 13.723134994506836,
"learning_rate": 4.129747489696781e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.12890625,
"logps/chosen": -252.5,
"logps/rejected": -246.5,
"loss": 0.3215,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.06640625,
"rewards/margins": 2.3671875,
"rewards/rejected": -1.29296875,
"step": 269
},
{
"epoch": 0.29867256637168144,
"grad_norm": 12.263731002807617,
"learning_rate": 4.122938256784374e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.30859375,
"logps/chosen": -216.5,
"logps/rejected": -275.0,
"loss": 0.3189,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.16796875,
"rewards/margins": 2.265625,
"rewards/rejected": -1.09375,
"step": 270
},
{
"epoch": 0.2997787610619469,
"grad_norm": 15.063496589660645,
"learning_rate": 4.116108150478228e-07,
"logits/chosen": -1.1875,
"logits/rejected": -1.125,
"logps/chosen": -255.5,
"logps/rejected": -256.5,
"loss": 0.3799,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 0.818359375,
"rewards/margins": 1.7890625,
"rewards/rejected": -0.974609375,
"step": 271
},
{
"epoch": 0.3008849557522124,
"grad_norm": 15.24313735961914,
"learning_rate": 4.109257258623643e-07,
"logits/chosen": -1.2265625,
"logits/rejected": -1.09765625,
"logps/chosen": -238.5,
"logps/rejected": -274.0,
"loss": 0.3779,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 1.15625,
"rewards/margins": 2.1875,
"rewards/rejected": -1.03125,
"step": 272
},
{
"epoch": 0.3019911504424779,
"grad_norm": 14.098630905151367,
"learning_rate": 4.1023856693332516e-07,
"logits/chosen": -1.3203125,
"logits/rejected": -1.17578125,
"logps/chosen": -248.0,
"logps/rejected": -272.0,
"loss": 0.3197,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.982421875,
"rewards/margins": 2.25,
"rewards/rejected": -1.26953125,
"step": 273
},
{
"epoch": 0.3030973451327434,
"grad_norm": 13.230525970458984,
"learning_rate": 4.0954934709858857e-07,
"logits/chosen": -1.234375,
"logits/rejected": -1.16796875,
"logps/chosen": -268.0,
"logps/rejected": -287.0,
"loss": 0.3215,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.15234375,
"rewards/margins": 2.125,
"rewards/rejected": -0.97265625,
"step": 274
},
{
"epoch": 0.30420353982300885,
"grad_norm": 12.722634315490723,
"learning_rate": 4.0885807522254433e-07,
"logits/chosen": -1.375,
"logits/rejected": -1.25390625,
"logps/chosen": -256.5,
"logps/rejected": -319.0,
"loss": 0.3175,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.37890625,
"rewards/margins": 2.3515625,
"rewards/rejected": -0.96875,
"step": 275
},
{
"epoch": 0.3053097345132743,
"grad_norm": 12.688482284545898,
"learning_rate": 4.0816476019597423e-07,
"logits/chosen": -1.41015625,
"logits/rejected": -1.2890625,
"logps/chosen": -235.5,
"logps/rejected": -256.0,
"loss": 0.3222,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.28515625,
"rewards/margins": 2.20703125,
"rewards/rejected": -0.92578125,
"step": 276
},
{
"epoch": 0.3064159292035398,
"grad_norm": 14.044715881347656,
"learning_rate": 4.0746941093593807e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.27734375,
"logps/chosen": -249.0,
"logps/rejected": -295.0,
"loss": 0.2954,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 1.359375,
"rewards/margins": 2.3671875,
"rewards/rejected": -1.009765625,
"step": 277
},
{
"epoch": 0.3075221238938053,
"grad_norm": 15.867609024047852,
"learning_rate": 4.0677203638565893e-07,
"logits/chosen": -1.36328125,
"logits/rejected": -1.25390625,
"logps/chosen": -260.0,
"logps/rejected": -275.0,
"loss": 0.3278,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.19921875,
"rewards/margins": 2.296875,
"rewards/rejected": -1.10546875,
"step": 278
},
{
"epoch": 0.3086283185840708,
"grad_norm": 16.124387741088867,
"learning_rate": 4.060726455144082e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.16015625,
"logps/chosen": -240.5,
"logps/rejected": -281.0,
"loss": 0.3936,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.025390625,
"rewards/margins": 1.96484375,
"rewards/rejected": -0.9375,
"step": 279
},
{
"epoch": 0.30973451327433627,
"grad_norm": 14.164496421813965,
"learning_rate": 4.0537124731739003e-07,
"logits/chosen": -1.34765625,
"logits/rejected": -1.1953125,
"logps/chosen": -250.0,
"logps/rejected": -270.0,
"loss": 0.3594,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.005859375,
"rewards/margins": 2.02734375,
"rewards/rejected": -1.02734375,
"step": 280
},
{
"epoch": 0.3108407079646018,
"grad_norm": 14.754056930541992,
"learning_rate": 4.0466785081562583e-07,
"logits/chosen": -1.234375,
"logits/rejected": -1.2265625,
"logps/chosen": -258.5,
"logps/rejected": -247.0,
"loss": 0.3625,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.095703125,
"rewards/margins": 2.06640625,
"rewards/rejected": -0.970703125,
"step": 281
},
{
"epoch": 0.31194690265486724,
"grad_norm": 14.682291030883789,
"learning_rate": 4.039624650558382e-07,
"logits/chosen": -1.13671875,
"logits/rejected": -1.21484375,
"logps/chosen": -239.0,
"logps/rejected": -265.5,
"loss": 0.3439,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.28125,
"rewards/margins": 2.2890625,
"rewards/rejected": -1.009765625,
"step": 282
},
{
"epoch": 0.31305309734513276,
"grad_norm": 13.215510368347168,
"learning_rate": 4.032550991103344e-07,
"logits/chosen": -1.3984375,
"logits/rejected": -1.30859375,
"logps/chosen": -218.5,
"logps/rejected": -263.5,
"loss": 0.3302,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.0078125,
"rewards/margins": 2.03515625,
"rewards/rejected": -1.029296875,
"step": 283
},
{
"epoch": 0.3141592920353982,
"grad_norm": 14.175792694091797,
"learning_rate": 4.0254576207689004e-07,
"logits/chosen": -1.23046875,
"logits/rejected": -1.21484375,
"logps/chosen": -268.0,
"logps/rejected": -307.0,
"loss": 0.3466,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.083984375,
"rewards/margins": 2.0703125,
"rewards/rejected": -0.986328125,
"step": 284
},
{
"epoch": 0.31526548672566373,
"grad_norm": 15.70940113067627,
"learning_rate": 4.0183446307863174e-07,
"logits/chosen": -1.359375,
"logits/rejected": -1.20703125,
"logps/chosen": -249.0,
"logps/rejected": -281.0,
"loss": 0.3759,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.927734375,
"rewards/margins": 1.94921875,
"rewards/rejected": -1.015625,
"step": 285
},
{
"epoch": 0.3163716814159292,
"grad_norm": 14.439340591430664,
"learning_rate": 4.0112121126391967e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.26171875,
"logps/chosen": -278.0,
"logps/rejected": -298.0,
"loss": 0.3487,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.2421875,
"rewards/margins": 2.3515625,
"rewards/rejected": -1.109375,
"step": 286
},
{
"epoch": 0.3174778761061947,
"grad_norm": 13.696681022644043,
"learning_rate": 4.0040601580623054e-07,
"logits/chosen": -1.3125,
"logits/rejected": -1.3515625,
"logps/chosen": -236.0,
"logps/rejected": -246.0,
"loss": 0.3344,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.181640625,
"rewards/margins": 2.3515625,
"rewards/rejected": -1.16796875,
"step": 287
},
{
"epoch": 0.3185840707964602,
"grad_norm": 13.659770011901855,
"learning_rate": 3.9968888590403904e-07,
"logits/chosen": -1.28125,
"logits/rejected": -1.3828125,
"logps/chosen": -248.5,
"logps/rejected": -280.0,
"loss": 0.3278,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.33203125,
"rewards/margins": 2.375,
"rewards/rejected": -1.046875,
"step": 288
},
{
"epoch": 0.3196902654867257,
"grad_norm": 12.041626930236816,
"learning_rate": 3.9896983078069947e-07,
"logits/chosen": -1.3203125,
"logits/rejected": -1.16796875,
"logps/chosen": -245.0,
"logps/rejected": -273.5,
"loss": 0.3141,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.22265625,
"rewards/margins": 2.296875,
"rewards/rejected": -1.078125,
"step": 289
},
{
"epoch": 0.32079646017699115,
"grad_norm": 14.61534595489502,
"learning_rate": 3.9824885968432755e-07,
"logits/chosen": -1.31640625,
"logits/rejected": -1.1875,
"logps/chosen": -241.5,
"logps/rejected": -251.0,
"loss": 0.3742,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.05859375,
"rewards/margins": 2.0859375,
"rewards/rejected": -1.025390625,
"step": 290
},
{
"epoch": 0.3219026548672566,
"grad_norm": 13.926555633544922,
"learning_rate": 3.975259818876811e-07,
"logits/chosen": -1.31640625,
"logits/rejected": -1.2578125,
"logps/chosen": -262.0,
"logps/rejected": -259.0,
"loss": 0.298,
"rewards/accuracies": 0.84375,
"rewards/chosen": 1.2265625,
"rewards/margins": 2.328125,
"rewards/rejected": -1.1015625,
"step": 291
},
{
"epoch": 0.3230088495575221,
"grad_norm": 12.315802574157715,
"learning_rate": 3.968012066880412e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.2265625,
"logps/chosen": -259.0,
"logps/rejected": -267.0,
"loss": 0.3022,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.2109375,
"rewards/margins": 2.515625,
"rewards/rejected": -1.30078125,
"step": 292
},
{
"epoch": 0.3241150442477876,
"grad_norm": 12.437846183776855,
"learning_rate": 3.960745434070921e-07,
"logits/chosen": -1.19921875,
"logits/rejected": -1.06640625,
"logps/chosen": -256.5,
"logps/rejected": -281.0,
"loss": 0.3422,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.744140625,
"rewards/margins": 1.89453125,
"rewards/rejected": -1.1484375,
"step": 293
},
{
"epoch": 0.3252212389380531,
"grad_norm": 13.978434562683105,
"learning_rate": 3.9534600139080163e-07,
"logits/chosen": -1.2109375,
"logits/rejected": -1.140625,
"logps/chosen": -237.0,
"logps/rejected": -274.0,
"loss": 0.366,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.888671875,
"rewards/margins": 2.2109375,
"rewards/rejected": -1.32421875,
"step": 294
},
{
"epoch": 0.32632743362831856,
"grad_norm": 17.65538787841797,
"learning_rate": 3.94615590009301e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.25390625,
"logps/chosen": -264.0,
"logps/rejected": -285.0,
"loss": 0.4392,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 0.931640625,
"rewards/margins": 1.875,
"rewards/rejected": -0.947265625,
"step": 295
},
{
"epoch": 0.3274336283185841,
"grad_norm": 12.776206016540527,
"learning_rate": 3.9388331865676425e-07,
"logits/chosen": -1.1796875,
"logits/rejected": -1.234375,
"logps/chosen": -246.5,
"logps/rejected": -260.0,
"loss": 0.2823,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.0234375,
"rewards/margins": 2.453125,
"rewards/rejected": -1.42578125,
"step": 296
},
{
"epoch": 0.32853982300884954,
"grad_norm": 15.579447746276855,
"learning_rate": 3.931491967512872e-07,
"logits/chosen": -1.3984375,
"logits/rejected": -1.2734375,
"logps/chosen": -252.5,
"logps/rejected": -283.0,
"loss": 0.3896,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.94140625,
"rewards/margins": 1.859375,
"rewards/rejected": -0.91796875,
"step": 297
},
{
"epoch": 0.32964601769911506,
"grad_norm": 13.556859970092773,
"learning_rate": 3.9241323373476686e-07,
"logits/chosen": -1.16015625,
"logits/rejected": -1.125,
"logps/chosen": -258.0,
"logps/rejected": -265.0,
"loss": 0.3322,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.1015625,
"rewards/margins": 2.296875,
"rewards/rejected": -1.1953125,
"step": 298
},
{
"epoch": 0.3307522123893805,
"grad_norm": 12.573343276977539,
"learning_rate": 3.916754390727794e-07,
"logits/chosen": -1.203125,
"logits/rejected": -1.15234375,
"logps/chosen": -251.0,
"logps/rejected": -285.0,
"loss": 0.2524,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.375,
"rewards/margins": 2.71875,
"rewards/rejected": -1.34765625,
"step": 299
},
{
"epoch": 0.33185840707964603,
"grad_norm": 13.832542419433594,
"learning_rate": 3.9093582225445877e-07,
"logits/chosen": -1.28125,
"logits/rejected": -1.1953125,
"logps/chosen": -263.0,
"logps/rejected": -283.5,
"loss": 0.3695,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.689453125,
"rewards/margins": 1.85546875,
"rewards/rejected": -1.1640625,
"step": 300
},
{
"epoch": 0.33185840707964603,
"eval_logits/chosen": -1.2725435495376587,
"eval_logits/rejected": -1.1930581331253052,
"eval_logps/chosen": -251.15921020507812,
"eval_logps/rejected": -271.43780517578125,
"eval_loss": 0.35059425234794617,
"eval_rewards/accuracies": 0.7869349718093872,
"eval_rewards/chosen": 0.9655628204345703,
"eval_rewards/margins": 2.1816697120666504,
"eval_rewards/rejected": -1.2169232368469238,
"eval_runtime": 193.0334,
"eval_samples_per_second": 66.584,
"eval_steps_per_second": 1.041,
"step": 300
},
{
"epoch": 0.3329646017699115,
"grad_norm": 13.506171226501465,
"learning_rate": 3.901943927923744e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.15234375,
"logps/chosen": -258.5,
"logps/rejected": -279.0,
"loss": 0.3567,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.908203125,
"rewards/margins": 2.03125,
"rewards/rejected": -1.119140625,
"step": 301
},
{
"epoch": 0.334070796460177,
"grad_norm": 14.112154960632324,
"learning_rate": 3.8945116022240937e-07,
"logits/chosen": -1.18359375,
"logits/rejected": -1.08203125,
"logps/chosen": -268.0,
"logps/rejected": -313.0,
"loss": 0.3424,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.9765625,
"rewards/margins": 2.2109375,
"rewards/rejected": -1.23046875,
"step": 302
},
{
"epoch": 0.33517699115044247,
"grad_norm": 13.437678337097168,
"learning_rate": 3.8870613410363707e-07,
"logits/chosen": -1.26171875,
"logits/rejected": -1.18359375,
"logps/chosen": -269.0,
"logps/rejected": -273.5,
"loss": 0.361,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.830078125,
"rewards/margins": 1.97265625,
"rewards/rejected": -1.14453125,
"step": 303
},
{
"epoch": 0.336283185840708,
"grad_norm": 14.396577835083008,
"learning_rate": 3.8795932401819863e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.1171875,
"logps/chosen": -272.0,
"logps/rejected": -286.0,
"loss": 0.3308,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.861328125,
"rewards/margins": 2.1796875,
"rewards/rejected": -1.31640625,
"step": 304
},
{
"epoch": 0.33738938053097345,
"grad_norm": 15.585335731506348,
"learning_rate": 3.872107395711798e-07,
"logits/chosen": -1.30859375,
"logits/rejected": -1.08984375,
"logps/chosen": -280.0,
"logps/rejected": -337.0,
"loss": 0.369,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.9921875,
"rewards/margins": 1.98046875,
"rewards/rejected": -0.984375,
"step": 305
},
{
"epoch": 0.33849557522123896,
"grad_norm": 16.997081756591797,
"learning_rate": 3.864603903904871e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.171875,
"logps/chosen": -283.0,
"logps/rejected": -291.0,
"loss": 0.3989,
"rewards/accuracies": 0.734375,
"rewards/chosen": 0.982421875,
"rewards/margins": 2.2421875,
"rewards/rejected": -1.2578125,
"step": 306
},
{
"epoch": 0.3396017699115044,
"grad_norm": 16.18097496032715,
"learning_rate": 3.857082861267242e-07,
"logits/chosen": -1.265625,
"logits/rejected": -1.23828125,
"logps/chosen": -249.0,
"logps/rejected": -266.0,
"loss": 0.402,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.83984375,
"rewards/margins": 1.9921875,
"rewards/rejected": -1.15234375,
"step": 307
},
{
"epoch": 0.3407079646017699,
"grad_norm": 13.513619422912598,
"learning_rate": 3.849544364530677e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.2421875,
"logps/chosen": -264.5,
"logps/rejected": -273.0,
"loss": 0.2981,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.98046875,
"rewards/margins": 2.4921875,
"rewards/rejected": -1.515625,
"step": 308
},
{
"epoch": 0.3418141592920354,
"grad_norm": 15.130499839782715,
"learning_rate": 3.8419885106514295e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.18359375,
"logps/chosen": -271.5,
"logps/rejected": -284.0,
"loss": 0.3542,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.962890625,
"rewards/margins": 2.1484375,
"rewards/rejected": -1.1875,
"step": 309
},
{
"epoch": 0.34292035398230086,
"grad_norm": 18.18540382385254,
"learning_rate": 3.834415396808988e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.1640625,
"logps/chosen": -251.0,
"logps/rejected": -289.0,
"loss": 0.3976,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.78515625,
"rewards/margins": 2.03515625,
"rewards/rejected": -1.24609375,
"step": 310
},
{
"epoch": 0.3440265486725664,
"grad_norm": 13.235279083251953,
"learning_rate": 3.826825120404833e-07,
"logits/chosen": -1.14453125,
"logits/rejected": -1.18359375,
"logps/chosen": -265.0,
"logps/rejected": -271.0,
"loss": 0.3018,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.990234375,
"rewards/margins": 2.40625,
"rewards/rejected": -1.4140625,
"step": 311
},
{
"epoch": 0.34513274336283184,
"grad_norm": 14.219352722167969,
"learning_rate": 3.81921777906118e-07,
"logits/chosen": -1.34375,
"logits/rejected": -1.16796875,
"logps/chosen": -243.0,
"logps/rejected": -265.0,
"loss": 0.3433,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.10546875,
"rewards/margins": 2.3203125,
"rewards/rejected": -1.2109375,
"step": 312
},
{
"epoch": 0.34623893805309736,
"grad_norm": 14.085458755493164,
"learning_rate": 3.8115934706197244e-07,
"logits/chosen": -1.3046875,
"logits/rejected": -1.234375,
"logps/chosen": -263.0,
"logps/rejected": -260.0,
"loss": 0.3526,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.12890625,
"rewards/margins": 2.17578125,
"rewards/rejected": -1.046875,
"step": 313
},
{
"epoch": 0.3473451327433628,
"grad_norm": 13.884740829467773,
"learning_rate": 3.8039522931403847e-07,
"logits/chosen": -1.4375,
"logits/rejected": -1.22265625,
"logps/chosen": -257.0,
"logps/rejected": -274.5,
"loss": 0.3197,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.1328125,
"rewards/margins": 2.29296875,
"rewards/rejected": -1.15625,
"step": 314
},
{
"epoch": 0.34845132743362833,
"grad_norm": 15.969679832458496,
"learning_rate": 3.7962943449000377e-07,
"logits/chosen": -1.1640625,
"logits/rejected": -1.1484375,
"logps/chosen": -260.0,
"logps/rejected": -283.0,
"loss": 0.4191,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.90625,
"rewards/margins": 1.7890625,
"rewards/rejected": -0.880859375,
"step": 315
},
{
"epoch": 0.3495575221238938,
"grad_norm": 14.207815170288086,
"learning_rate": 3.7886197243912607e-07,
"logits/chosen": -1.22265625,
"logits/rejected": -1.1015625,
"logps/chosen": -256.0,
"logps/rejected": -279.0,
"loss": 0.3409,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.19140625,
"rewards/margins": 2.34375,
"rewards/rejected": -1.15625,
"step": 316
},
{
"epoch": 0.3506637168141593,
"grad_norm": 14.867213249206543,
"learning_rate": 3.7809285303210593e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.09375,
"logps/chosen": -248.5,
"logps/rejected": -246.0,
"loss": 0.3668,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.06640625,
"rewards/margins": 2.0,
"rewards/rejected": -0.93359375,
"step": 317
},
{
"epoch": 0.35176991150442477,
"grad_norm": 13.388772964477539,
"learning_rate": 3.7732208616095986e-07,
"logits/chosen": -1.12890625,
"logits/rejected": -1.1484375,
"logps/chosen": -249.0,
"logps/rejected": -276.0,
"loss": 0.3055,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.44140625,
"rewards/margins": 2.578125,
"rewards/rejected": -1.13671875,
"step": 318
},
{
"epoch": 0.3528761061946903,
"grad_norm": 13.385259628295898,
"learning_rate": 3.7654968173889334e-07,
"logits/chosen": -1.34375,
"logits/rejected": -1.14453125,
"logps/chosen": -240.5,
"logps/rejected": -279.0,
"loss": 0.3375,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.28125,
"rewards/margins": 2.4765625,
"rewards/rejected": -1.19140625,
"step": 319
},
{
"epoch": 0.35398230088495575,
"grad_norm": 13.495279312133789,
"learning_rate": 3.7577564970017336e-07,
"logits/chosen": -1.3203125,
"logits/rejected": -1.12890625,
"logps/chosen": -237.5,
"logps/rejected": -251.0,
"loss": 0.3125,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.16796875,
"rewards/margins": 2.546875,
"rewards/rejected": -1.37890625,
"step": 320
},
{
"epoch": 0.35508849557522126,
"grad_norm": 14.151552200317383,
"learning_rate": 3.75e-07,
"logits/chosen": -1.28515625,
"logits/rejected": -1.1171875,
"logps/chosen": -251.0,
"logps/rejected": -281.0,
"loss": 0.3316,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.12109375,
"rewards/margins": 2.3125,
"rewards/rejected": -1.19140625,
"step": 321
},
{
"epoch": 0.3561946902654867,
"grad_norm": 13.408705711364746,
"learning_rate": 3.742227426143793e-07,
"logits/chosen": -1.23046875,
"logits/rejected": -1.12890625,
"logps/chosen": -229.5,
"logps/rejected": -235.0,
"loss": 0.3559,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 0.94140625,
"rewards/margins": 2.0625,
"rewards/rejected": -1.1171875,
"step": 322
},
{
"epoch": 0.3573008849557522,
"grad_norm": 14.06219482421875,
"learning_rate": 3.734438875399943e-07,
"logits/chosen": -1.29296875,
"logits/rejected": -1.22265625,
"logps/chosen": -270.0,
"logps/rejected": -296.0,
"loss": 0.3082,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.3046875,
"rewards/margins": 2.5,
"rewards/rejected": -1.19921875,
"step": 323
},
{
"epoch": 0.3584070796460177,
"grad_norm": 13.949469566345215,
"learning_rate": 3.726634447940768e-07,
"logits/chosen": -1.3359375,
"logits/rejected": -1.17578125,
"logps/chosen": -277.0,
"logps/rejected": -297.0,
"loss": 0.3666,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.04296875,
"rewards/margins": 1.87109375,
"rewards/rejected": -0.83203125,
"step": 324
},
{
"epoch": 0.35951327433628316,
"grad_norm": 14.00977897644043,
"learning_rate": 3.7188142441427836e-07,
"logits/chosen": -1.1484375,
"logits/rejected": -1.1640625,
"logps/chosen": -237.5,
"logps/rejected": -263.0,
"loss": 0.3086,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.328125,
"rewards/margins": 2.53125,
"rewards/rejected": -1.203125,
"step": 325
},
{
"epoch": 0.3606194690265487,
"grad_norm": 17.24125099182129,
"learning_rate": 3.710978364585411e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.23046875,
"logps/chosen": -265.5,
"logps/rejected": -274.0,
"loss": 0.4063,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.005859375,
"rewards/margins": 2.12890625,
"rewards/rejected": -1.126953125,
"step": 326
},
{
"epoch": 0.36172566371681414,
"grad_norm": 13.335806846618652,
"learning_rate": 3.7031269100496897e-07,
"logits/chosen": -1.3203125,
"logits/rejected": -1.1328125,
"logps/chosen": -246.0,
"logps/rejected": -255.5,
"loss": 0.3012,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.142578125,
"rewards/margins": 2.578125,
"rewards/rejected": -1.42578125,
"step": 327
},
{
"epoch": 0.36283185840707965,
"grad_norm": 14.329955101013184,
"learning_rate": 3.69525998151697e-07,
"logits/chosen": -1.20703125,
"logits/rejected": -1.140625,
"logps/chosen": -260.0,
"logps/rejected": -284.0,
"loss": 0.3322,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.23046875,
"rewards/margins": 2.453125,
"rewards/rejected": -1.22265625,
"step": 328
},
{
"epoch": 0.3639380530973451,
"grad_norm": 14.133960723876953,
"learning_rate": 3.687377680167626e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.125,
"logps/chosen": -253.0,
"logps/rejected": -271.0,
"loss": 0.3381,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.23046875,
"rewards/margins": 2.5390625,
"rewards/rejected": -1.3125,
"step": 329
},
{
"epoch": 0.36504424778761063,
"grad_norm": 15.067541122436523,
"learning_rate": 3.6794801073797453e-07,
"logits/chosen": -1.2265625,
"logits/rejected": -1.15234375,
"logps/chosen": -262.0,
"logps/rejected": -284.0,
"loss": 0.3784,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.06640625,
"rewards/margins": 2.1640625,
"rewards/rejected": -1.09375,
"step": 330
},
{
"epoch": 0.3661504424778761,
"grad_norm": 14.064321517944336,
"learning_rate": 3.671567364727833e-07,
"logits/chosen": -1.29296875,
"logits/rejected": -1.140625,
"logps/chosen": -234.5,
"logps/rejected": -260.0,
"loss": 0.3866,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.0234375,
"rewards/margins": 2.15625,
"rewards/rejected": -1.12890625,
"step": 331
},
{
"epoch": 0.3672566371681416,
"grad_norm": 14.258491516113281,
"learning_rate": 3.663639553981497e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.2421875,
"logps/chosen": -236.0,
"logps/rejected": -253.5,
"loss": 0.3042,
"rewards/accuracies": 0.84375,
"rewards/chosen": 1.4140625,
"rewards/margins": 2.6640625,
"rewards/rejected": -1.25,
"step": 332
},
{
"epoch": 0.36836283185840707,
"grad_norm": 14.841512680053711,
"learning_rate": 3.655696777104146e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.21875,
"logps/chosen": -251.5,
"logps/rejected": -276.0,
"loss": 0.338,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.28515625,
"rewards/margins": 2.53125,
"rewards/rejected": -1.24609375,
"step": 333
},
{
"epoch": 0.3694690265486726,
"grad_norm": 13.853322982788086,
"learning_rate": 3.647739136251673e-07,
"logits/chosen": -1.1953125,
"logits/rejected": -1.20703125,
"logps/chosen": -265.0,
"logps/rejected": -308.0,
"loss": 0.3455,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.1796875,
"rewards/margins": 2.234375,
"rewards/rejected": -1.0546875,
"step": 334
},
{
"epoch": 0.37057522123893805,
"grad_norm": 15.147529602050781,
"learning_rate": 3.639766733771147e-07,
"logits/chosen": -1.34375,
"logits/rejected": -1.25,
"logps/chosen": -250.5,
"logps/rejected": -287.0,
"loss": 0.3692,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.10546875,
"rewards/margins": 2.26171875,
"rewards/rejected": -1.15625,
"step": 335
},
{
"epoch": 0.37168141592920356,
"grad_norm": 13.71894359588623,
"learning_rate": 3.6317796721994903e-07,
"logits/chosen": -1.30859375,
"logits/rejected": -1.1953125,
"logps/chosen": -272.0,
"logps/rejected": -268.0,
"loss": 0.3311,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.083984375,
"rewards/margins": 2.33984375,
"rewards/rejected": -1.25390625,
"step": 336
},
{
"epoch": 0.372787610619469,
"grad_norm": 12.683527946472168,
"learning_rate": 3.623778054262164e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.23046875,
"logps/chosen": -268.0,
"logps/rejected": -275.0,
"loss": 0.302,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.26171875,
"rewards/margins": 2.3515625,
"rewards/rejected": -1.09375,
"step": 337
},
{
"epoch": 0.37389380530973454,
"grad_norm": 12.87300968170166,
"learning_rate": 3.6157619828718473e-07,
"logits/chosen": -1.21875,
"logits/rejected": -1.140625,
"logps/chosen": -248.0,
"logps/rejected": -249.5,
"loss": 0.3173,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.939453125,
"rewards/margins": 2.3046875,
"rewards/rejected": -1.36328125,
"step": 338
},
{
"epoch": 0.375,
"grad_norm": 13.38857650756836,
"learning_rate": 3.6077315611271095e-07,
"logits/chosen": -1.359375,
"logits/rejected": -1.28125,
"logps/chosen": -247.5,
"logps/rejected": -256.0,
"loss": 0.3028,
"rewards/accuracies": 0.84375,
"rewards/chosen": 1.03125,
"rewards/margins": 2.3671875,
"rewards/rejected": -1.33984375,
"step": 339
},
{
"epoch": 0.37610619469026546,
"grad_norm": 13.413825988769531,
"learning_rate": 3.5996868923110883e-07,
"logits/chosen": -1.30859375,
"logits/rejected": -1.1796875,
"logps/chosen": -229.0,
"logps/rejected": -270.5,
"loss": 0.3433,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.015625,
"rewards/margins": 2.2421875,
"rewards/rejected": -1.2265625,
"step": 340
},
{
"epoch": 0.377212389380531,
"grad_norm": 14.234886169433594,
"learning_rate": 3.59162807989016e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.13671875,
"logps/chosen": -248.5,
"logps/rejected": -233.5,
"loss": 0.3125,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.125,
"rewards/margins": 2.6796875,
"rewards/rejected": -1.55078125,
"step": 341
},
{
"epoch": 0.37831858407079644,
"grad_norm": 14.096612930297852,
"learning_rate": 3.583555227512607e-07,
"logits/chosen": -1.3515625,
"logits/rejected": -1.12109375,
"logps/chosen": -238.5,
"logps/rejected": -265.0,
"loss": 0.356,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.1875,
"rewards/margins": 2.65625,
"rewards/rejected": -1.47265625,
"step": 342
},
{
"epoch": 0.37942477876106195,
"grad_norm": 13.053836822509766,
"learning_rate": 3.5754684390072886e-07,
"logits/chosen": -1.16796875,
"logits/rejected": -1.0625,
"logps/chosen": -241.0,
"logps/rejected": -280.0,
"loss": 0.3579,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 1.033203125,
"rewards/margins": 2.10546875,
"rewards/rejected": -1.07421875,
"step": 343
},
{
"epoch": 0.3805309734513274,
"grad_norm": 13.854188919067383,
"learning_rate": 3.5673678183823024e-07,
"logits/chosen": -1.13671875,
"logits/rejected": -1.0546875,
"logps/chosen": -284.0,
"logps/rejected": -302.0,
"loss": 0.3128,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.25390625,
"rewards/margins": 2.5078125,
"rewards/rejected": -1.2578125,
"step": 344
},
{
"epoch": 0.38163716814159293,
"grad_norm": 15.331599235534668,
"learning_rate": 3.559253469823647e-07,
"logits/chosen": -1.09375,
"logits/rejected": -1.048828125,
"logps/chosen": -250.5,
"logps/rejected": -278.0,
"loss": 0.3974,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 1.05078125,
"rewards/margins": 2.3203125,
"rewards/rejected": -1.2734375,
"step": 345
},
{
"epoch": 0.3827433628318584,
"grad_norm": 13.891538619995117,
"learning_rate": 3.5511254976938834e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.1484375,
"logps/chosen": -269.0,
"logps/rejected": -274.0,
"loss": 0.3552,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.0703125,
"rewards/margins": 2.125,
"rewards/rejected": -1.05078125,
"step": 346
},
{
"epoch": 0.3838495575221239,
"grad_norm": 14.387117385864258,
"learning_rate": 3.542984006530792e-07,
"logits/chosen": -1.36328125,
"logits/rejected": -1.16015625,
"logps/chosen": -237.0,
"logps/rejected": -272.0,
"loss": 0.3257,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.044921875,
"rewards/margins": 2.546875,
"rewards/rejected": -1.5078125,
"step": 347
},
{
"epoch": 0.38495575221238937,
"grad_norm": 14.693575859069824,
"learning_rate": 3.534829101046027e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.19921875,
"logps/chosen": -253.0,
"logps/rejected": -271.5,
"loss": 0.4006,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 0.859375,
"rewards/margins": 1.90234375,
"rewards/rejected": -1.046875,
"step": 348
},
{
"epoch": 0.3860619469026549,
"grad_norm": 14.893670082092285,
"learning_rate": 3.5266608861237723e-07,
"logits/chosen": -1.30859375,
"logits/rejected": -1.16015625,
"logps/chosen": -257.0,
"logps/rejected": -273.0,
"loss": 0.3469,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.845703125,
"rewards/margins": 2.171875,
"rewards/rejected": -1.32421875,
"step": 349
},
{
"epoch": 0.38716814159292035,
"grad_norm": 13.093351364135742,
"learning_rate": 3.518479466819389e-07,
"logits/chosen": -1.1484375,
"logits/rejected": -1.20703125,
"logps/chosen": -251.0,
"logps/rejected": -290.0,
"loss": 0.3118,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.056640625,
"rewards/margins": 2.5546875,
"rewards/rejected": -1.5,
"step": 350
},
{
"epoch": 0.38716814159292035,
"eval_logits/chosen": -1.260883092880249,
"eval_logits/rejected": -1.1730799674987793,
"eval_logps/chosen": -250.82586669921875,
"eval_logps/rejected": -272.52239990234375,
"eval_loss": 0.3436649739742279,
"eval_rewards/accuracies": 0.7924543023109436,
"eval_rewards/chosen": 1.0035176277160645,
"eval_rewards/margins": 2.3350045680999756,
"eval_rewards/rejected": -1.3310012817382812,
"eval_runtime": 192.8803,
"eval_samples_per_second": 66.637,
"eval_steps_per_second": 1.042,
"step": 350
},
{
"epoch": 0.38827433628318586,
"grad_norm": 17.082809448242188,
"learning_rate": 3.510284948358068e-07,
"logits/chosen": -1.2734375,
"logits/rejected": -1.1328125,
"logps/chosen": -264.0,
"logps/rejected": -286.0,
"loss": 0.4283,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.78125,
"rewards/margins": 2.02734375,
"rewards/rejected": -1.25,
"step": 351
},
{
"epoch": 0.3893805309734513,
"grad_norm": 14.743417739868164,
"learning_rate": 3.5020774361334744e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.28125,
"logps/chosen": -235.0,
"logps/rejected": -291.0,
"loss": 0.3538,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.1015625,
"rewards/margins": 2.375,
"rewards/rejected": -1.26953125,
"step": 352
},
{
"epoch": 0.39048672566371684,
"grad_norm": 12.676244735717773,
"learning_rate": 3.49385703570639e-07,
"logits/chosen": -1.35546875,
"logits/rejected": -1.19921875,
"logps/chosen": -243.5,
"logps/rejected": -254.0,
"loss": 0.2715,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 1.2578125,
"rewards/margins": 2.7578125,
"rewards/rejected": -1.5,
"step": 353
},
{
"epoch": 0.3915929203539823,
"grad_norm": 13.607057571411133,
"learning_rate": 3.485623852803361e-07,
"logits/chosen": -1.1953125,
"logits/rejected": -1.12890625,
"logps/chosen": -248.0,
"logps/rejected": -263.5,
"loss": 0.3456,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 0.962890625,
"rewards/margins": 2.46875,
"rewards/rejected": -1.5,
"step": 354
},
{
"epoch": 0.3926991150442478,
"grad_norm": 12.650440216064453,
"learning_rate": 3.4773779933153343e-07,
"logits/chosen": -1.296875,
"logits/rejected": -1.2578125,
"logps/chosen": -222.5,
"logps/rejected": -240.0,
"loss": 0.3298,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.037109375,
"rewards/margins": 2.265625,
"rewards/rejected": -1.2265625,
"step": 355
},
{
"epoch": 0.3938053097345133,
"grad_norm": 13.600220680236816,
"learning_rate": 3.4691195632962957e-07,
"logits/chosen": -1.41015625,
"logits/rejected": -1.1796875,
"logps/chosen": -225.0,
"logps/rejected": -251.0,
"loss": 0.3439,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.978515625,
"rewards/margins": 2.3515625,
"rewards/rejected": -1.375,
"step": 356
},
{
"epoch": 0.39491150442477874,
"grad_norm": 14.466880798339844,
"learning_rate": 3.4608486689619083e-07,
"logits/chosen": -1.1640625,
"logits/rejected": -1.171875,
"logps/chosen": -252.0,
"logps/rejected": -254.0,
"loss": 0.3437,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.904296875,
"rewards/margins": 2.4375,
"rewards/rejected": -1.53125,
"step": 357
},
{
"epoch": 0.39601769911504425,
"grad_norm": 13.527942657470703,
"learning_rate": 3.4525654166881426e-07,
"logits/chosen": -1.34375,
"logits/rejected": -1.19140625,
"logps/chosen": -256.5,
"logps/rejected": -281.0,
"loss": 0.3267,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.962890625,
"rewards/margins": 2.40625,
"rewards/rejected": -1.44140625,
"step": 358
},
{
"epoch": 0.3971238938053097,
"grad_norm": 13.268882751464844,
"learning_rate": 3.4442699130099116e-07,
"logits/chosen": -1.26171875,
"logits/rejected": -1.1328125,
"logps/chosen": -268.0,
"logps/rejected": -297.0,
"loss": 0.293,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.990234375,
"rewards/margins": 2.6171875,
"rewards/rejected": -1.6328125,
"step": 359
},
{
"epoch": 0.39823008849557523,
"grad_norm": 14.783308982849121,
"learning_rate": 3.435962264619702e-07,
"logits/chosen": -1.1796875,
"logits/rejected": -1.12109375,
"logps/chosen": -239.5,
"logps/rejected": -278.0,
"loss": 0.3438,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.765625,
"rewards/margins": 2.25390625,
"rewards/rejected": -1.484375,
"step": 360
},
{
"epoch": 0.3993362831858407,
"grad_norm": 14.32043743133545,
"learning_rate": 3.427642578366194e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.19140625,
"logps/chosen": -256.5,
"logps/rejected": -278.0,
"loss": 0.3622,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.9375,
"rewards/margins": 2.359375,
"rewards/rejected": -1.421875,
"step": 361
},
{
"epoch": 0.4004424778761062,
"grad_norm": 175.1194305419922,
"learning_rate": 3.419310961252897e-07,
"logits/chosen": -1.2109375,
"logits/rejected": -1.10546875,
"logps/chosen": -234.5,
"logps/rejected": -354.0,
"loss": 0.3211,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.06640625,
"rewards/margins": 2.6015625,
"rewards/rejected": -1.53515625,
"step": 362
},
{
"epoch": 0.40154867256637167,
"grad_norm": 13.051432609558105,
"learning_rate": 3.4109675204367686e-07,
"logits/chosen": -1.1953125,
"logits/rejected": -1.06640625,
"logps/chosen": -269.0,
"logps/rejected": -313.0,
"loss": 0.3161,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.140625,
"rewards/margins": 2.5703125,
"rewards/rejected": -1.4296875,
"step": 363
},
{
"epoch": 0.4026548672566372,
"grad_norm": 12.166739463806152,
"learning_rate": 3.4026123632268354e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.09375,
"logps/chosen": -235.0,
"logps/rejected": -244.0,
"loss": 0.3185,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.6875,
"rewards/margins": 2.1953125,
"rewards/rejected": -1.50390625,
"step": 364
},
{
"epoch": 0.40376106194690264,
"grad_norm": 13.57703971862793,
"learning_rate": 3.3942455970828146e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.11328125,
"logps/chosen": -249.5,
"logps/rejected": -265.0,
"loss": 0.3227,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 1.013671875,
"rewards/margins": 2.546875,
"rewards/rejected": -1.52734375,
"step": 365
},
{
"epoch": 0.40486725663716816,
"grad_norm": 12.446300506591797,
"learning_rate": 3.38586732961373e-07,
"logits/chosen": -1.3828125,
"logits/rejected": -1.21875,
"logps/chosen": -232.5,
"logps/rejected": -237.5,
"loss": 0.3575,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 0.560546875,
"rewards/margins": 2.234375,
"rewards/rejected": -1.6796875,
"step": 366
},
{
"epoch": 0.4059734513274336,
"grad_norm": 13.510902404785156,
"learning_rate": 3.3774776685765327e-07,
"logits/chosen": -1.3125,
"logits/rejected": -1.265625,
"logps/chosen": -242.5,
"logps/rejected": -264.5,
"loss": 0.3312,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.79296875,
"rewards/margins": 2.3671875,
"rewards/rejected": -1.57421875,
"step": 367
},
{
"epoch": 0.40707964601769914,
"grad_norm": 12.729259490966797,
"learning_rate": 3.3690767218747104e-07,
"logits/chosen": -1.1015625,
"logits/rejected": -1.1328125,
"logps/chosen": -243.5,
"logps/rejected": -272.0,
"loss": 0.2871,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.12109375,
"rewards/margins": 2.7734375,
"rewards/rejected": -1.6484375,
"step": 368
},
{
"epoch": 0.4081858407079646,
"grad_norm": 14.687115669250488,
"learning_rate": 3.3606645975569e-07,
"logits/chosen": -1.34765625,
"logits/rejected": -1.078125,
"logps/chosen": -248.5,
"logps/rejected": -254.5,
"loss": 0.3694,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.65625,
"rewards/margins": 2.125,
"rewards/rejected": -1.47265625,
"step": 369
},
{
"epoch": 0.4092920353982301,
"grad_norm": 11.254127502441406,
"learning_rate": 3.3522414038155016e-07,
"logits/chosen": -1.22265625,
"logits/rejected": -1.12109375,
"logps/chosen": -225.0,
"logps/rejected": -259.5,
"loss": 0.2835,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.970703125,
"rewards/margins": 2.65625,
"rewards/rejected": -1.68359375,
"step": 370
},
{
"epoch": 0.4103982300884956,
"grad_norm": 15.060591697692871,
"learning_rate": 3.343807248985283e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.16015625,
"logps/chosen": -240.0,
"logps/rejected": -265.5,
"loss": 0.3759,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.689453125,
"rewards/margins": 2.2109375,
"rewards/rejected": -1.5234375,
"step": 371
},
{
"epoch": 0.41150442477876104,
"grad_norm": 11.981417655944824,
"learning_rate": 3.335362241541988e-07,
"logits/chosen": -1.171875,
"logits/rejected": -1.09375,
"logps/chosen": -260.0,
"logps/rejected": -280.0,
"loss": 0.3012,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.732421875,
"rewards/margins": 2.3828125,
"rewards/rejected": -1.64453125,
"step": 372
},
{
"epoch": 0.41261061946902655,
"grad_norm": 13.114727020263672,
"learning_rate": 3.32690649010094e-07,
"logits/chosen": -1.265625,
"logits/rejected": -1.16796875,
"logps/chosen": -244.5,
"logps/rejected": -267.5,
"loss": 0.2875,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 0.927734375,
"rewards/margins": 2.8125,
"rewards/rejected": -1.8828125,
"step": 373
},
{
"epoch": 0.413716814159292,
"grad_norm": 13.435688972473145,
"learning_rate": 3.3184401034156484e-07,
"logits/chosen": -1.16796875,
"logits/rejected": -1.015625,
"logps/chosen": -263.0,
"logps/rejected": -271.0,
"loss": 0.344,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.658203125,
"rewards/margins": 2.2890625,
"rewards/rejected": -1.625,
"step": 374
},
{
"epoch": 0.41482300884955753,
"grad_norm": 16.25585174560547,
"learning_rate": 3.3099631903764064e-07,
"logits/chosen": -1.26171875,
"logits/rejected": -1.125,
"logps/chosen": -270.0,
"logps/rejected": -291.0,
"loss": 0.4301,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.572265625,
"rewards/margins": 1.765625,
"rewards/rejected": -1.1953125,
"step": 375
},
{
"epoch": 0.415929203539823,
"grad_norm": 13.53650188446045,
"learning_rate": 3.3014758600088923e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.2265625,
"logps/chosen": -232.0,
"logps/rejected": -266.5,
"loss": 0.3326,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.755859375,
"rewards/margins": 2.234375,
"rewards/rejected": -1.48046875,
"step": 376
},
{
"epoch": 0.4170353982300885,
"grad_norm": 12.601165771484375,
"learning_rate": 3.2929782214727653e-07,
"logits/chosen": -1.29296875,
"logits/rejected": -1.18359375,
"logps/chosen": -246.5,
"logps/rejected": -263.5,
"loss": 0.3436,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.771484375,
"rewards/margins": 2.515625,
"rewards/rejected": -1.7421875,
"step": 377
},
{
"epoch": 0.41814159292035397,
"grad_norm": 13.9395112991333,
"learning_rate": 3.2844703840602636e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.30859375,
"logps/chosen": -243.5,
"logps/rejected": -262.5,
"loss": 0.3515,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.92578125,
"rewards/margins": 2.390625,
"rewards/rejected": -1.46484375,
"step": 378
},
{
"epoch": 0.4192477876106195,
"grad_norm": 13.737992286682129,
"learning_rate": 3.2759524571948e-07,
"logits/chosen": -1.3125,
"logits/rejected": -1.11328125,
"logps/chosen": -258.0,
"logps/rejected": -293.0,
"loss": 0.2964,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.970703125,
"rewards/margins": 2.5859375,
"rewards/rejected": -1.61328125,
"step": 379
},
{
"epoch": 0.42035398230088494,
"grad_norm": 14.736825942993164,
"learning_rate": 3.26742455042955e-07,
"logits/chosen": -1.3515625,
"logits/rejected": -1.2109375,
"logps/chosen": -247.5,
"logps/rejected": -238.0,
"loss": 0.3616,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.634765625,
"rewards/margins": 2.1875,
"rewards/rejected": -1.55078125,
"step": 380
},
{
"epoch": 0.42146017699115046,
"grad_norm": 14.469903945922852,
"learning_rate": 3.2588867734460464e-07,
"logits/chosen": -1.1796875,
"logits/rejected": -1.15234375,
"logps/chosen": -252.0,
"logps/rejected": -260.5,
"loss": 0.355,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.9296875,
"rewards/margins": 2.5546875,
"rewards/rejected": -1.62109375,
"step": 381
},
{
"epoch": 0.4225663716814159,
"grad_norm": 14.506092071533203,
"learning_rate": 3.250339236052767e-07,
"logits/chosen": -1.140625,
"logits/rejected": -1.18359375,
"logps/chosen": -261.0,
"logps/rejected": -289.0,
"loss": 0.3673,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.951171875,
"rewards/margins": 2.3359375,
"rewards/rejected": -1.37890625,
"step": 382
},
{
"epoch": 0.42367256637168144,
"grad_norm": 13.806063652038574,
"learning_rate": 3.2417820481837256e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.16015625,
"logps/chosen": -254.0,
"logps/rejected": -274.0,
"loss": 0.3272,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.712890625,
"rewards/margins": 2.40625,
"rewards/rejected": -1.69140625,
"step": 383
},
{
"epoch": 0.4247787610619469,
"grad_norm": 13.990365982055664,
"learning_rate": 3.2332153198970517e-07,
"logits/chosen": -1.28125,
"logits/rejected": -1.2109375,
"logps/chosen": -250.5,
"logps/rejected": -279.0,
"loss": 0.334,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.66796875,
"rewards/margins": 2.1640625,
"rewards/rejected": -1.49609375,
"step": 384
},
{
"epoch": 0.4258849557522124,
"grad_norm": 15.526028633117676,
"learning_rate": 3.2246391613735815e-07,
"logits/chosen": -1.390625,
"logits/rejected": -1.1953125,
"logps/chosen": -253.0,
"logps/rejected": -258.5,
"loss": 0.3283,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.92578125,
"rewards/margins": 2.421875,
"rewards/rejected": -1.49609375,
"step": 385
},
{
"epoch": 0.4269911504424779,
"grad_norm": 13.068387985229492,
"learning_rate": 3.2160536829154356e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.23046875,
"logps/chosen": -248.5,
"logps/rejected": -286.0,
"loss": 0.2813,
"rewards/accuracies": 0.859375,
"rewards/chosen": 1.46875,
"rewards/margins": 3.0078125,
"rewards/rejected": -1.54296875,
"step": 386
},
{
"epoch": 0.4280973451327434,
"grad_norm": 14.58014965057373,
"learning_rate": 3.207458994944606e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.16796875,
"logps/chosen": -261.0,
"logps/rejected": -269.0,
"loss": 0.3732,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 0.6640625,
"rewards/margins": 2.0546875,
"rewards/rejected": -1.390625,
"step": 387
},
{
"epoch": 0.42920353982300885,
"grad_norm": 13.451835632324219,
"learning_rate": 3.1988552080015294e-07,
"logits/chosen": -1.33984375,
"logits/rejected": -1.15234375,
"logps/chosen": -256.5,
"logps/rejected": -264.0,
"loss": 0.3112,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.11328125,
"rewards/margins": 2.515625,
"rewards/rejected": -1.40234375,
"step": 388
},
{
"epoch": 0.4303097345132743,
"grad_norm": 15.350495338439941,
"learning_rate": 3.1902424327436725e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.0625,
"logps/chosen": -273.0,
"logps/rejected": -264.0,
"loss": 0.3406,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.712890625,
"rewards/margins": 2.1484375,
"rewards/rejected": -1.44140625,
"step": 389
},
{
"epoch": 0.4314159292035398,
"grad_norm": 17.061744689941406,
"learning_rate": 3.1816207799440996e-07,
"logits/chosen": -1.265625,
"logits/rejected": -1.12890625,
"logps/chosen": -278.5,
"logps/rejected": -318.0,
"loss": 0.3654,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.16015625,
"rewards/margins": 2.4375,
"rewards/rejected": -1.28125,
"step": 390
},
{
"epoch": 0.4325221238938053,
"grad_norm": 16.2224063873291,
"learning_rate": 3.1729903604900595e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.08203125,
"logps/chosen": -244.0,
"logps/rejected": -281.0,
"loss": 0.3328,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.923828125,
"rewards/margins": 2.5234375,
"rewards/rejected": -1.59375,
"step": 391
},
{
"epoch": 0.4336283185840708,
"grad_norm": 13.922826766967773,
"learning_rate": 3.1643512853815487e-07,
"logits/chosen": -1.26171875,
"logits/rejected": -1.25390625,
"logps/chosen": -249.0,
"logps/rejected": -280.0,
"loss": 0.3626,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.89453125,
"rewards/margins": 2.12890625,
"rewards/rejected": -1.2421875,
"step": 392
},
{
"epoch": 0.43473451327433627,
"grad_norm": 14.233848571777344,
"learning_rate": 3.15570366572989e-07,
"logits/chosen": -1.3203125,
"logits/rejected": -1.18359375,
"logps/chosen": -246.5,
"logps/rejected": -255.5,
"loss": 0.33,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.875,
"rewards/margins": 2.2421875,
"rewards/rejected": -1.3671875,
"step": 393
},
{
"epoch": 0.4358407079646018,
"grad_norm": 15.168607711791992,
"learning_rate": 3.147047612756302e-07,
"logits/chosen": -1.21875,
"logits/rejected": -1.17578125,
"logps/chosen": -281.0,
"logps/rejected": -290.0,
"loss": 0.345,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.892578125,
"rewards/margins": 2.3125,
"rewards/rejected": -1.421875,
"step": 394
},
{
"epoch": 0.43694690265486724,
"grad_norm": 14.825115203857422,
"learning_rate": 3.138383237790467e-07,
"logits/chosen": -1.203125,
"logits/rejected": -1.1484375,
"logps/chosen": -250.5,
"logps/rejected": -272.0,
"loss": 0.3428,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.98828125,
"rewards/margins": 2.4765625,
"rewards/rejected": -1.484375,
"step": 395
},
{
"epoch": 0.43805309734513276,
"grad_norm": 12.729918479919434,
"learning_rate": 3.129710652269103e-07,
"logits/chosen": -1.30859375,
"logits/rejected": -1.1171875,
"logps/chosen": -230.0,
"logps/rejected": -255.5,
"loss": 0.2864,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 1.20703125,
"rewards/margins": 2.875,
"rewards/rejected": -1.66796875,
"step": 396
},
{
"epoch": 0.4391592920353982,
"grad_norm": 12.986177444458008,
"learning_rate": 3.1210299677345253e-07,
"logits/chosen": -1.17578125,
"logits/rejected": -1.140625,
"logps/chosen": -257.0,
"logps/rejected": -279.0,
"loss": 0.3394,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.013671875,
"rewards/margins": 2.46875,
"rewards/rejected": -1.45703125,
"step": 397
},
{
"epoch": 0.44026548672566373,
"grad_norm": 15.79924488067627,
"learning_rate": 3.1123412958332153e-07,
"logits/chosen": -1.30859375,
"logits/rejected": -1.15234375,
"logps/chosen": -248.0,
"logps/rejected": -275.0,
"loss": 0.3804,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.84375,
"rewards/margins": 2.34375,
"rewards/rejected": -1.5,
"step": 398
},
{
"epoch": 0.4413716814159292,
"grad_norm": 14.248608589172363,
"learning_rate": 3.1036447483143834e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.22265625,
"logps/chosen": -261.5,
"logps/rejected": -275.0,
"loss": 0.3299,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.99609375,
"rewards/margins": 2.6015625,
"rewards/rejected": -1.609375,
"step": 399
},
{
"epoch": 0.4424778761061947,
"grad_norm": 14.430908203125,
"learning_rate": 3.094940437028535e-07,
"logits/chosen": -1.1171875,
"logits/rejected": -1.125,
"logps/chosen": -250.5,
"logps/rejected": -251.5,
"loss": 0.3726,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.953125,
"rewards/margins": 2.26953125,
"rewards/rejected": -1.3203125,
"step": 400
},
{
"epoch": 0.4424778761061947,
"eval_logits/chosen": -1.2572294473648071,
"eval_logits/rejected": -1.1652674674987793,
"eval_logps/chosen": -250.63681030273438,
"eval_logps/rejected": -273.5472717285156,
"eval_loss": 0.3369702994823456,
"eval_rewards/accuracies": 0.7978180646896362,
"eval_rewards/chosen": 1.024176001548767,
"eval_rewards/margins": 2.449626922607422,
"eval_rewards/rejected": -1.4251010417938232,
"eval_runtime": 193.115,
"eval_samples_per_second": 66.556,
"eval_steps_per_second": 1.041,
"step": 400
},
{
"epoch": 0.4435840707964602,
"grad_norm": 14.084267616271973,
"learning_rate": 3.086228473926024e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.26171875,
"logps/chosen": -242.5,
"logps/rejected": -257.0,
"loss": 0.3172,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.12109375,
"rewards/margins": 2.5390625,
"rewards/rejected": -1.4140625,
"step": 401
},
{
"epoch": 0.4446902654867257,
"grad_norm": 13.272000312805176,
"learning_rate": 3.077508971055623e-07,
"logits/chosen": -1.1015625,
"logits/rejected": -1.171875,
"logps/chosen": -246.5,
"logps/rejected": -295.0,
"loss": 0.2771,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.2265625,
"rewards/margins": 2.90625,
"rewards/rejected": -1.6796875,
"step": 402
},
{
"epoch": 0.44579646017699115,
"grad_norm": 13.017451286315918,
"learning_rate": 3.0687820405630736e-07,
"logits/chosen": -1.31640625,
"logits/rejected": -1.2265625,
"logps/chosen": -258.5,
"logps/rejected": -286.0,
"loss": 0.2997,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.3125,
"rewards/margins": 2.8125,
"rewards/rejected": -1.5,
"step": 403
},
{
"epoch": 0.4469026548672566,
"grad_norm": 11.719470024108887,
"learning_rate": 3.060047794689649e-07,
"logits/chosen": -1.24609375,
"logits/rejected": -1.171875,
"logps/chosen": -246.0,
"logps/rejected": -252.0,
"loss": 0.273,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.154296875,
"rewards/margins": 2.7109375,
"rewards/rejected": -1.55078125,
"step": 404
},
{
"epoch": 0.4480088495575221,
"grad_norm": 12.74482250213623,
"learning_rate": 3.0513063457707106e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.17578125,
"logps/chosen": -238.5,
"logps/rejected": -227.0,
"loss": 0.3567,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.908203125,
"rewards/margins": 2.234375,
"rewards/rejected": -1.32421875,
"step": 405
},
{
"epoch": 0.4491150442477876,
"grad_norm": 14.130414962768555,
"learning_rate": 3.0425578062342577e-07,
"logits/chosen": -1.1484375,
"logits/rejected": -1.18359375,
"logps/chosen": -241.5,
"logps/rejected": -268.0,
"loss": 0.3743,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.017578125,
"rewards/margins": 2.203125,
"rewards/rejected": -1.18359375,
"step": 406
},
{
"epoch": 0.4502212389380531,
"grad_norm": 15.725412368774414,
"learning_rate": 3.03380228859949e-07,
"logits/chosen": -1.29296875,
"logits/rejected": -1.13671875,
"logps/chosen": -271.5,
"logps/rejected": -291.0,
"loss": 0.3421,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.140625,
"rewards/margins": 2.4140625,
"rewards/rejected": -1.26953125,
"step": 407
},
{
"epoch": 0.45132743362831856,
"grad_norm": 13.073963165283203,
"learning_rate": 3.0250399054753526e-07,
"logits/chosen": -1.19921875,
"logits/rejected": -1.1328125,
"logps/chosen": -271.0,
"logps/rejected": -265.0,
"loss": 0.3024,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.078125,
"rewards/margins": 2.515625,
"rewards/rejected": -1.4375,
"step": 408
},
{
"epoch": 0.4524336283185841,
"grad_norm": 12.850948333740234,
"learning_rate": 3.016270769559093e-07,
"logits/chosen": -1.203125,
"logits/rejected": -1.06640625,
"logps/chosen": -258.0,
"logps/rejected": -275.0,
"loss": 0.3189,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.041015625,
"rewards/margins": 2.234375,
"rewards/rejected": -1.1953125,
"step": 409
},
{
"epoch": 0.45353982300884954,
"grad_norm": 13.47533130645752,
"learning_rate": 3.007494993634808e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.16015625,
"logps/chosen": -259.0,
"logps/rejected": -269.0,
"loss": 0.3222,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.125,
"rewards/margins": 2.53125,
"rewards/rejected": -1.41015625,
"step": 410
},
{
"epoch": 0.45464601769911506,
"grad_norm": 13.043135643005371,
"learning_rate": 2.9987126905719965e-07,
"logits/chosen": -1.20703125,
"logits/rejected": -1.1484375,
"logps/chosen": -265.5,
"logps/rejected": -272.5,
"loss": 0.3374,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.9140625,
"rewards/margins": 2.3125,
"rewards/rejected": -1.40234375,
"step": 411
},
{
"epoch": 0.4557522123893805,
"grad_norm": 14.49729061126709,
"learning_rate": 2.989923973324105e-07,
"logits/chosen": -1.20703125,
"logits/rejected": -1.08984375,
"logps/chosen": -252.5,
"logps/rejected": -281.0,
"loss": 0.3668,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.09765625,
"rewards/margins": 2.265625,
"rewards/rejected": -1.1640625,
"step": 412
},
{
"epoch": 0.45685840707964603,
"grad_norm": 15.509222030639648,
"learning_rate": 2.9811289549270745e-07,
"logits/chosen": -1.33203125,
"logits/rejected": -1.2109375,
"logps/chosen": -250.5,
"logps/rejected": -286.0,
"loss": 0.3665,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 1.203125,
"rewards/margins": 2.5,
"rewards/rejected": -1.2890625,
"step": 413
},
{
"epoch": 0.4579646017699115,
"grad_norm": 13.909936904907227,
"learning_rate": 2.9723277484978917e-07,
"logits/chosen": -1.1796875,
"logits/rejected": -1.2109375,
"logps/chosen": -270.0,
"logps/rejected": -291.0,
"loss": 0.2915,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.17578125,
"rewards/margins": 2.765625,
"rewards/rejected": -1.58984375,
"step": 414
},
{
"epoch": 0.459070796460177,
"grad_norm": 13.325665473937988,
"learning_rate": 2.963520467233127e-07,
"logits/chosen": -1.45703125,
"logits/rejected": -1.19140625,
"logps/chosen": -252.0,
"logps/rejected": -262.5,
"loss": 0.3212,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.07421875,
"rewards/margins": 2.4140625,
"rewards/rejected": -1.34375,
"step": 415
},
{
"epoch": 0.46017699115044247,
"grad_norm": 15.239510536193848,
"learning_rate": 2.954707224407485e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.24609375,
"logps/chosen": -261.5,
"logps/rejected": -285.0,
"loss": 0.3534,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.0625,
"rewards/margins": 2.3046875,
"rewards/rejected": -1.2421875,
"step": 416
},
{
"epoch": 0.461283185840708,
"grad_norm": 15.735715866088867,
"learning_rate": 2.945888133372343e-07,
"logits/chosen": -1.21484375,
"logits/rejected": -1.12890625,
"logps/chosen": -287.0,
"logps/rejected": -288.0,
"loss": 0.3967,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 0.88671875,
"rewards/margins": 2.125,
"rewards/rejected": -1.23828125,
"step": 417
},
{
"epoch": 0.46238938053097345,
"grad_norm": 15.247976303100586,
"learning_rate": 2.937063307554295e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.21484375,
"logps/chosen": -226.0,
"logps/rejected": -250.0,
"loss": 0.3726,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.12890625,
"rewards/margins": 2.375,
"rewards/rejected": -1.24609375,
"step": 418
},
{
"epoch": 0.46349557522123896,
"grad_norm": 12.819127082824707,
"learning_rate": 2.9282328604536937e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.2109375,
"logps/chosen": -249.5,
"logps/rejected": -271.0,
"loss": 0.3065,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.14453125,
"rewards/margins": 2.578125,
"rewards/rejected": -1.4296875,
"step": 419
},
{
"epoch": 0.4646017699115044,
"grad_norm": 13.217004776000977,
"learning_rate": 2.9193969056431907e-07,
"logits/chosen": -1.19921875,
"logits/rejected": -1.15234375,
"logps/chosen": -254.5,
"logps/rejected": -270.0,
"loss": 0.3139,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.09765625,
"rewards/margins": 2.78125,
"rewards/rejected": -1.68359375,
"step": 420
},
{
"epoch": 0.4657079646017699,
"grad_norm": 14.80079174041748,
"learning_rate": 2.910555556766272e-07,
"logits/chosen": -1.4375,
"logits/rejected": -1.234375,
"logps/chosen": -226.5,
"logps/rejected": -263.0,
"loss": 0.3987,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 0.94140625,
"rewards/margins": 2.12109375,
"rewards/rejected": -1.17578125,
"step": 421
},
{
"epoch": 0.4668141592920354,
"grad_norm": 13.704545974731445,
"learning_rate": 2.9017089275358014e-07,
"logits/chosen": -1.1953125,
"logits/rejected": -1.1328125,
"logps/chosen": -271.0,
"logps/rejected": -287.0,
"loss": 0.3016,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.9609375,
"rewards/margins": 2.546875,
"rewards/rejected": -1.5859375,
"step": 422
},
{
"epoch": 0.46792035398230086,
"grad_norm": 14.536904335021973,
"learning_rate": 2.8928571317325564e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.0703125,
"logps/chosen": -279.0,
"logps/rejected": -291.0,
"loss": 0.3234,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.19140625,
"rewards/margins": 2.6171875,
"rewards/rejected": -1.4296875,
"step": 423
},
{
"epoch": 0.4690265486725664,
"grad_norm": 15.45283317565918,
"learning_rate": 2.8840002832037625e-07,
"logits/chosen": -1.28515625,
"logits/rejected": -1.1875,
"logps/chosen": -261.0,
"logps/rejected": -283.0,
"loss": 0.365,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.171875,
"rewards/margins": 2.234375,
"rewards/rejected": -1.0703125,
"step": 424
},
{
"epoch": 0.47013274336283184,
"grad_norm": 14.761160850524902,
"learning_rate": 2.8751384958616316e-07,
"logits/chosen": -1.20703125,
"logits/rejected": -1.1484375,
"logps/chosen": -257.0,
"logps/rejected": -285.0,
"loss": 0.3295,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.2578125,
"rewards/margins": 2.9296875,
"rewards/rejected": -1.671875,
"step": 425
},
{
"epoch": 0.47123893805309736,
"grad_norm": 14.022229194641113,
"learning_rate": 2.8662718836818964e-07,
"logits/chosen": -1.3359375,
"logits/rejected": -1.1796875,
"logps/chosen": -249.5,
"logps/rejected": -275.0,
"loss": 0.3165,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.984375,
"rewards/margins": 2.53125,
"rewards/rejected": -1.546875,
"step": 426
},
{
"epoch": 0.4723451327433628,
"grad_norm": 13.972189903259277,
"learning_rate": 2.8574005607023444e-07,
"logits/chosen": -1.328125,
"logits/rejected": -1.15234375,
"logps/chosen": -253.0,
"logps/rejected": -286.0,
"loss": 0.3595,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.17578125,
"rewards/margins": 2.3359375,
"rewards/rejected": -1.16015625,
"step": 427
},
{
"epoch": 0.47345132743362833,
"grad_norm": 13.316848754882812,
"learning_rate": 2.848524641021349e-07,
"logits/chosen": -1.2734375,
"logits/rejected": -1.15625,
"logps/chosen": -279.0,
"logps/rejected": -304.0,
"loss": 0.2876,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 1.14453125,
"rewards/margins": 2.734375,
"rewards/rejected": -1.5859375,
"step": 428
},
{
"epoch": 0.4745575221238938,
"grad_norm": 15.443883895874023,
"learning_rate": 2.839644238796407e-07,
"logits/chosen": -1.24609375,
"logits/rejected": -1.14453125,
"logps/chosen": -279.0,
"logps/rejected": -291.0,
"loss": 0.3446,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.017578125,
"rewards/margins": 2.453125,
"rewards/rejected": -1.4375,
"step": 429
},
{
"epoch": 0.4756637168141593,
"grad_norm": 14.372305870056152,
"learning_rate": 2.8307594682426637e-07,
"logits/chosen": -1.2109375,
"logits/rejected": -1.1484375,
"logps/chosen": -260.5,
"logps/rejected": -309.0,
"loss": 0.2813,
"rewards/accuracies": 0.84375,
"rewards/chosen": 1.0,
"rewards/margins": 2.9609375,
"rewards/rejected": -1.95703125,
"step": 430
},
{
"epoch": 0.47676991150442477,
"grad_norm": 13.688916206359863,
"learning_rate": 2.8218704436314524e-07,
"logits/chosen": -1.46875,
"logits/rejected": -1.22265625,
"logps/chosen": -253.5,
"logps/rejected": -276.0,
"loss": 0.341,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.798828125,
"rewards/margins": 2.171875,
"rewards/rejected": -1.37109375,
"step": 431
},
{
"epoch": 0.4778761061946903,
"grad_norm": 12.372815132141113,
"learning_rate": 2.8129772792888145e-07,
"logits/chosen": -1.29296875,
"logits/rejected": -1.1171875,
"logps/chosen": -235.0,
"logps/rejected": -281.0,
"loss": 0.2966,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.869140625,
"rewards/margins": 2.6328125,
"rewards/rejected": -1.76171875,
"step": 432
},
{
"epoch": 0.47898230088495575,
"grad_norm": 15.202491760253906,
"learning_rate": 2.804080089594039e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.109375,
"logps/chosen": -260.0,
"logps/rejected": -263.0,
"loss": 0.3812,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.490234375,
"rewards/margins": 1.98046875,
"rewards/rejected": -1.484375,
"step": 433
},
{
"epoch": 0.48008849557522126,
"grad_norm": 15.252046585083008,
"learning_rate": 2.7951789889781845e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.125,
"logps/chosen": -261.0,
"logps/rejected": -299.0,
"loss": 0.3649,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.919921875,
"rewards/margins": 2.328125,
"rewards/rejected": -1.40625,
"step": 434
},
{
"epoch": 0.4811946902654867,
"grad_norm": 11.937089920043945,
"learning_rate": 2.786274091922611e-07,
"logits/chosen": -1.296875,
"logits/rejected": -1.1484375,
"logps/chosen": -257.0,
"logps/rejected": -279.0,
"loss": 0.2799,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.9375,
"rewards/margins": 2.703125,
"rewards/rejected": -1.76953125,
"step": 435
},
{
"epoch": 0.4823008849557522,
"grad_norm": 12.86253833770752,
"learning_rate": 2.7773655129575043e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.11328125,
"logps/chosen": -237.5,
"logps/rejected": -266.5,
"loss": 0.3076,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.802734375,
"rewards/margins": 2.75,
"rewards/rejected": -1.953125,
"step": 436
},
{
"epoch": 0.4834070796460177,
"grad_norm": 12.546619415283203,
"learning_rate": 2.7684533666604076e-07,
"logits/chosen": -1.3828125,
"logits/rejected": -1.05078125,
"logps/chosen": -253.5,
"logps/rejected": -257.0,
"loss": 0.3184,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.689453125,
"rewards/margins": 2.3984375,
"rewards/rejected": -1.71484375,
"step": 437
},
{
"epoch": 0.48451327433628316,
"grad_norm": 18.16977310180664,
"learning_rate": 2.759537767654744e-07,
"logits/chosen": -1.1796875,
"logits/rejected": -1.1796875,
"logps/chosen": -274.0,
"logps/rejected": -294.0,
"loss": 0.387,
"rewards/accuracies": 0.734375,
"rewards/chosen": 0.615234375,
"rewards/margins": 2.3203125,
"rewards/rejected": -1.703125,
"step": 438
},
{
"epoch": 0.4856194690265487,
"grad_norm": 12.303600311279297,
"learning_rate": 2.750618830608343e-07,
"logits/chosen": -1.3203125,
"logits/rejected": -1.06640625,
"logps/chosen": -235.0,
"logps/rejected": -242.5,
"loss": 0.2887,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.74609375,
"rewards/margins": 2.5546875,
"rewards/rejected": -1.8125,
"step": 439
},
{
"epoch": 0.48672566371681414,
"grad_norm": 13.665416717529297,
"learning_rate": 2.7416966702319683e-07,
"logits/chosen": -1.203125,
"logits/rejected": -1.1171875,
"logps/chosen": -283.5,
"logps/rejected": -304.0,
"loss": 0.2974,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.880859375,
"rewards/margins": 2.6328125,
"rewards/rejected": -1.75390625,
"step": 440
},
{
"epoch": 0.48783185840707965,
"grad_norm": 14.621048927307129,
"learning_rate": 2.732771401277838e-07,
"logits/chosen": -1.21484375,
"logits/rejected": -1.109375,
"logps/chosen": -266.5,
"logps/rejected": -264.5,
"loss": 0.3651,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.4580078125,
"rewards/margins": 2.140625,
"rewards/rejected": -1.68359375,
"step": 441
},
{
"epoch": 0.4889380530973451,
"grad_norm": 12.872169494628906,
"learning_rate": 2.7238431385381523e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.1484375,
"logps/chosen": -245.5,
"logps/rejected": -279.0,
"loss": 0.3244,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.765625,
"rewards/margins": 2.640625,
"rewards/rejected": -1.875,
"step": 442
},
{
"epoch": 0.49004424778761063,
"grad_norm": 13.446981430053711,
"learning_rate": 2.714911996843616e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.14453125,
"logps/chosen": -263.0,
"logps/rejected": -300.0,
"loss": 0.3075,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.671875,
"rewards/margins": 2.5703125,
"rewards/rejected": -1.90234375,
"step": 443
},
{
"epoch": 0.4911504424778761,
"grad_norm": 14.347339630126953,
"learning_rate": 2.7059780910619617e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.08203125,
"logps/chosen": -275.0,
"logps/rejected": -310.0,
"loss": 0.3042,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.0625,
"rewards/margins": 2.9140625,
"rewards/rejected": -1.84765625,
"step": 444
},
{
"epoch": 0.4922566371681416,
"grad_norm": 14.27387523651123,
"learning_rate": 2.6970415360964716e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.2109375,
"logps/chosen": -237.0,
"logps/rejected": -258.0,
"loss": 0.3354,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.98046875,
"rewards/margins": 2.7421875,
"rewards/rejected": -1.76171875,
"step": 445
},
{
"epoch": 0.49336283185840707,
"grad_norm": 13.943259239196777,
"learning_rate": 2.6881024468845e-07,
"logits/chosen": -1.15234375,
"logits/rejected": -1.16796875,
"logps/chosen": -247.5,
"logps/rejected": -275.5,
"loss": 0.3356,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.69140625,
"rewards/margins": 2.734375,
"rewards/rejected": -2.046875,
"step": 446
},
{
"epoch": 0.4944690265486726,
"grad_norm": 15.396841049194336,
"learning_rate": 2.679160938395997e-07,
"logits/chosen": -1.28515625,
"logits/rejected": -1.203125,
"logps/chosen": -251.0,
"logps/rejected": -283.0,
"loss": 0.3342,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.9921875,
"rewards/margins": 2.6796875,
"rewards/rejected": -1.68359375,
"step": 447
},
{
"epoch": 0.49557522123893805,
"grad_norm": 12.922630310058594,
"learning_rate": 2.670217125632027e-07,
"logits/chosen": -1.21875,
"logits/rejected": -1.17578125,
"logps/chosen": -248.5,
"logps/rejected": -258.5,
"loss": 0.3361,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.529296875,
"rewards/margins": 2.1875,
"rewards/rejected": -1.66015625,
"step": 448
},
{
"epoch": 0.49668141592920356,
"grad_norm": 18.911989212036133,
"learning_rate": 2.661271123623291e-07,
"logits/chosen": -1.3125,
"logits/rejected": -1.19921875,
"logps/chosen": -288.0,
"logps/rejected": -278.0,
"loss": 0.4185,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.599609375,
"rewards/margins": 2.078125,
"rewards/rejected": -1.4765625,
"step": 449
},
{
"epoch": 0.497787610619469,
"grad_norm": 16.120946884155273,
"learning_rate": 2.652323047428646e-07,
"logits/chosen": -1.21484375,
"logits/rejected": -1.109375,
"logps/chosen": -279.0,
"logps/rejected": -303.0,
"loss": 0.363,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.9609375,
"rewards/margins": 2.609375,
"rewards/rejected": -1.65625,
"step": 450
},
{
"epoch": 0.497787610619469,
"eval_logits/chosen": -1.2554415464401245,
"eval_logits/rejected": -1.1580379009246826,
"eval_logps/chosen": -252.30845642089844,
"eval_logps/rejected": -276.3631896972656,
"eval_loss": 0.3314497768878937,
"eval_rewards/accuracies": 0.8052030205726624,
"eval_rewards/chosen": 0.85384601354599,
"eval_rewards/margins": 2.567397356033325,
"eval_rewards/rejected": -1.7136777639389038,
"eval_runtime": 193.0141,
"eval_samples_per_second": 66.591,
"eval_steps_per_second": 1.041,
"step": 450
},
{
"epoch": 0.49889380530973454,
"grad_norm": 13.481291770935059,
"learning_rate": 2.6433730121336283e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.234375,
"logps/chosen": -241.5,
"logps/rejected": -278.0,
"loss": 0.3044,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.791015625,
"rewards/margins": 2.7578125,
"rewards/rejected": -1.9765625,
"step": 451
},
{
"epoch": 0.5,
"grad_norm": 15.87637996673584,
"learning_rate": 2.6344211328489696e-07,
"logits/chosen": -1.2734375,
"logits/rejected": -1.1953125,
"logps/chosen": -269.0,
"logps/rejected": -291.0,
"loss": 0.3646,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.68359375,
"rewards/margins": 2.28515625,
"rewards/rejected": -1.6015625,
"step": 452
},
{
"epoch": 0.5011061946902655,
"grad_norm": 12.020176887512207,
"learning_rate": 2.625467524709118e-07,
"logits/chosen": -1.2265625,
"logits/rejected": -1.10546875,
"logps/chosen": -255.0,
"logps/rejected": -283.0,
"loss": 0.2739,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.91796875,
"rewards/margins": 2.8828125,
"rewards/rejected": -1.96484375,
"step": 453
},
{
"epoch": 0.5022123893805309,
"grad_norm": 13.10580825805664,
"learning_rate": 2.616512302870757e-07,
"logits/chosen": -1.23046875,
"logits/rejected": -1.109375,
"logps/chosen": -280.0,
"logps/rejected": -286.0,
"loss": 0.33,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.7685546875,
"rewards/margins": 2.3203125,
"rewards/rejected": -1.55078125,
"step": 454
},
{
"epoch": 0.5033185840707964,
"grad_norm": 15.729186058044434,
"learning_rate": 2.607555582511326e-07,
"logits/chosen": -1.3828125,
"logits/rejected": -1.140625,
"logps/chosen": -289.0,
"logps/rejected": -285.0,
"loss": 0.3862,
"rewards/accuracies": 0.734375,
"rewards/chosen": 0.671875,
"rewards/margins": 2.1796875,
"rewards/rejected": -1.50390625,
"step": 455
},
{
"epoch": 0.504424778761062,
"grad_norm": 13.986113548278809,
"learning_rate": 2.5985974788275374e-07,
"logits/chosen": -1.3828125,
"logits/rejected": -1.203125,
"logps/chosen": -230.5,
"logps/rejected": -267.0,
"loss": 0.3423,
"rewards/accuracies": 0.765625,
"rewards/chosen": 1.1640625,
"rewards/margins": 2.8515625,
"rewards/rejected": -1.6875,
"step": 456
},
{
"epoch": 0.5055309734513275,
"grad_norm": 14.71418285369873,
"learning_rate": 2.5896381070338933e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.16796875,
"logps/chosen": -274.0,
"logps/rejected": -273.0,
"loss": 0.3394,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.775390625,
"rewards/margins": 2.19921875,
"rewards/rejected": -1.421875,
"step": 457
},
{
"epoch": 0.5066371681415929,
"grad_norm": 13.852214813232422,
"learning_rate": 2.5806775823612076e-07,
"logits/chosen": -1.34375,
"logits/rejected": -1.203125,
"logps/chosen": -244.0,
"logps/rejected": -284.0,
"loss": 0.3206,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.888671875,
"rewards/margins": 2.5078125,
"rewards/rejected": -1.62109375,
"step": 458
},
{
"epoch": 0.5077433628318584,
"grad_norm": 13.243755340576172,
"learning_rate": 2.5717160200551213e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.0546875,
"logps/chosen": -242.0,
"logps/rejected": -268.0,
"loss": 0.3353,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.626953125,
"rewards/margins": 2.453125,
"rewards/rejected": -1.828125,
"step": 459
},
{
"epoch": 0.5088495575221239,
"grad_norm": 13.583708763122559,
"learning_rate": 2.562753535374621e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.1015625,
"logps/chosen": -244.5,
"logps/rejected": -266.5,
"loss": 0.3068,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.912109375,
"rewards/margins": 2.515625,
"rewards/rejected": -1.6015625,
"step": 460
},
{
"epoch": 0.5099557522123894,
"grad_norm": 14.627636909484863,
"learning_rate": 2.553790243590556e-07,
"logits/chosen": -1.2109375,
"logits/rejected": -1.21875,
"logps/chosen": -230.0,
"logps/rejected": -265.0,
"loss": 0.3564,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.998046875,
"rewards/margins": 2.3984375,
"rewards/rejected": -1.40234375,
"step": 461
},
{
"epoch": 0.5110619469026548,
"grad_norm": 16.162841796875,
"learning_rate": 2.5448262599841556e-07,
"logits/chosen": -1.3984375,
"logits/rejected": -1.23046875,
"logps/chosen": -256.5,
"logps/rejected": -280.0,
"loss": 0.3297,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.83984375,
"rewards/margins": 2.453125,
"rewards/rejected": -1.6171875,
"step": 462
},
{
"epoch": 0.5121681415929203,
"grad_norm": 13.07744026184082,
"learning_rate": 2.535861699845549e-07,
"logits/chosen": -1.203125,
"logits/rejected": -1.1796875,
"logps/chosen": -244.5,
"logps/rejected": -279.0,
"loss": 0.3268,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.818359375,
"rewards/margins": 2.5,
"rewards/rejected": -1.68359375,
"step": 463
},
{
"epoch": 0.5132743362831859,
"grad_norm": 13.961365699768066,
"learning_rate": 2.526896678472279e-07,
"logits/chosen": -1.265625,
"logits/rejected": -1.12109375,
"logps/chosen": -267.0,
"logps/rejected": -273.0,
"loss": 0.3112,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.732421875,
"rewards/margins": 2.671875,
"rewards/rejected": -1.94140625,
"step": 464
},
{
"epoch": 0.5143805309734514,
"grad_norm": 12.809479713439941,
"learning_rate": 2.51793131116782e-07,
"logits/chosen": -1.328125,
"logits/rejected": -1.15625,
"logps/chosen": -223.0,
"logps/rejected": -244.5,
"loss": 0.3162,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.744140625,
"rewards/margins": 2.7109375,
"rewards/rejected": -1.9765625,
"step": 465
},
{
"epoch": 0.5154867256637168,
"grad_norm": 12.547861099243164,
"learning_rate": 2.5089657132400964e-07,
"logits/chosen": -1.14453125,
"logits/rejected": -1.08984375,
"logps/chosen": -261.0,
"logps/rejected": -273.0,
"loss": 0.2895,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.85546875,
"rewards/margins": 2.6796875,
"rewards/rejected": -1.828125,
"step": 466
},
{
"epoch": 0.5165929203539823,
"grad_norm": 13.852119445800781,
"learning_rate": 2.5e-07,
"logits/chosen": -1.22265625,
"logits/rejected": -1.18359375,
"logps/chosen": -256.5,
"logps/rejected": -294.0,
"loss": 0.3104,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.8046875,
"rewards/margins": 2.6875,
"rewards/rejected": -1.8828125,
"step": 467
},
{
"epoch": 0.5176991150442478,
"grad_norm": 12.272007942199707,
"learning_rate": 2.491034286759903e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.14453125,
"logps/chosen": -254.0,
"logps/rejected": -284.0,
"loss": 0.2965,
"rewards/accuracies": 0.890625,
"rewards/chosen": 0.87109375,
"rewards/margins": 2.75,
"rewards/rejected": -1.87109375,
"step": 468
},
{
"epoch": 0.5188053097345132,
"grad_norm": 13.289767265319824,
"learning_rate": 2.482068688832181e-07,
"logits/chosen": -1.26171875,
"logits/rejected": -1.13671875,
"logps/chosen": -236.5,
"logps/rejected": -258.0,
"loss": 0.3045,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.90234375,
"rewards/margins": 2.734375,
"rewards/rejected": -1.82421875,
"step": 469
},
{
"epoch": 0.5199115044247787,
"grad_norm": 13.52807903289795,
"learning_rate": 2.4731033215277213e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.17578125,
"logps/chosen": -251.5,
"logps/rejected": -285.0,
"loss": 0.3189,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.828125,
"rewards/margins": 2.640625,
"rewards/rejected": -1.8203125,
"step": 470
},
{
"epoch": 0.5210176991150443,
"grad_norm": 14.39229679107666,
"learning_rate": 2.464138300154451e-07,
"logits/chosen": -1.3515625,
"logits/rejected": -1.1640625,
"logps/chosen": -254.5,
"logps/rejected": -280.0,
"loss": 0.3246,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.787109375,
"rewards/margins": 2.6015625,
"rewards/rejected": -1.81640625,
"step": 471
},
{
"epoch": 0.5221238938053098,
"grad_norm": 14.51116943359375,
"learning_rate": 2.455173740015845e-07,
"logits/chosen": -1.30859375,
"logits/rejected": -1.12109375,
"logps/chosen": -246.0,
"logps/rejected": -269.5,
"loss": 0.3957,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.65234375,
"rewards/margins": 2.35546875,
"rewards/rejected": -1.70703125,
"step": 472
},
{
"epoch": 0.5232300884955752,
"grad_norm": 14.468950271606445,
"learning_rate": 2.4462097564094445e-07,
"logits/chosen": -1.3125,
"logits/rejected": -1.2421875,
"logps/chosen": -250.0,
"logps/rejected": -296.0,
"loss": 0.3396,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.703125,
"rewards/margins": 2.546875,
"rewards/rejected": -1.83984375,
"step": 473
},
{
"epoch": 0.5243362831858407,
"grad_norm": 12.03284740447998,
"learning_rate": 2.4372464646253794e-07,
"logits/chosen": -1.1953125,
"logits/rejected": -1.09765625,
"logps/chosen": -255.0,
"logps/rejected": -277.0,
"loss": 0.266,
"rewards/accuracies": 0.859375,
"rewards/chosen": 1.017578125,
"rewards/margins": 2.96875,
"rewards/rejected": -1.9453125,
"step": 474
},
{
"epoch": 0.5254424778761062,
"grad_norm": 19.379676818847656,
"learning_rate": 2.4282839799448785e-07,
"logits/chosen": -1.22265625,
"logits/rejected": -1.17578125,
"logps/chosen": -277.0,
"logps/rejected": -316.0,
"loss": 0.3512,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.763671875,
"rewards/margins": 2.7578125,
"rewards/rejected": -1.99609375,
"step": 475
},
{
"epoch": 0.5265486725663717,
"grad_norm": 13.170727729797363,
"learning_rate": 2.419322417638792e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.1171875,
"logps/chosen": -255.5,
"logps/rejected": -274.0,
"loss": 0.3296,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.4990234375,
"rewards/margins": 2.421875,
"rewards/rejected": -1.9296875,
"step": 476
},
{
"epoch": 0.5276548672566371,
"grad_norm": 10.943991661071777,
"learning_rate": 2.410361892966107e-07,
"logits/chosen": -1.3046875,
"logits/rejected": -1.11328125,
"logps/chosen": -223.5,
"logps/rejected": -244.0,
"loss": 0.2629,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.806640625,
"rewards/margins": 3.1171875,
"rewards/rejected": -2.3125,
"step": 477
},
{
"epoch": 0.5287610619469026,
"grad_norm": 14.212424278259277,
"learning_rate": 2.401402521172463e-07,
"logits/chosen": -1.20703125,
"logits/rejected": -1.2109375,
"logps/chosen": -249.5,
"logps/rejected": -274.0,
"loss": 0.3509,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.65625,
"rewards/margins": 2.4140625,
"rewards/rejected": -1.75,
"step": 478
},
{
"epoch": 0.5298672566371682,
"grad_norm": 11.761503219604492,
"learning_rate": 2.392444417488673e-07,
"logits/chosen": -1.41796875,
"logits/rejected": -1.234375,
"logps/chosen": -234.5,
"logps/rejected": -278.0,
"loss": 0.2504,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 1.00390625,
"rewards/margins": 2.953125,
"rewards/rejected": -1.953125,
"step": 479
},
{
"epoch": 0.5309734513274337,
"grad_norm": 15.108154296875,
"learning_rate": 2.3834876971292433e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.2265625,
"logps/chosen": -285.0,
"logps/rejected": -303.0,
"loss": 0.3124,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.97265625,
"rewards/margins": 3.0546875,
"rewards/rejected": -2.0859375,
"step": 480
},
{
"epoch": 0.5320796460176991,
"grad_norm": 13.407039642333984,
"learning_rate": 2.3745324752908822e-07,
"logits/chosen": -1.20703125,
"logits/rejected": -1.1796875,
"logps/chosen": -253.0,
"logps/rejected": -283.0,
"loss": 0.2827,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.830078125,
"rewards/margins": 2.71875,
"rewards/rejected": -1.890625,
"step": 481
},
{
"epoch": 0.5331858407079646,
"grad_norm": 13.764703750610352,
"learning_rate": 2.365578867151031e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.1640625,
"logps/chosen": -249.0,
"logps/rejected": -260.0,
"loss": 0.3393,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.607421875,
"rewards/margins": 2.8359375,
"rewards/rejected": -2.2265625,
"step": 482
},
{
"epoch": 0.5342920353982301,
"grad_norm": 14.601144790649414,
"learning_rate": 2.3566269878663714e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.2109375,
"logps/chosen": -264.5,
"logps/rejected": -291.0,
"loss": 0.3486,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.587890625,
"rewards/margins": 2.3046875,
"rewards/rejected": -1.71484375,
"step": 483
},
{
"epoch": 0.5353982300884956,
"grad_norm": 13.12956714630127,
"learning_rate": 2.347676952571354e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.16015625,
"logps/chosen": -218.0,
"logps/rejected": -243.0,
"loss": 0.3522,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.751953125,
"rewards/margins": 2.7265625,
"rewards/rejected": -1.9765625,
"step": 484
},
{
"epoch": 0.536504424778761,
"grad_norm": 13.433536529541016,
"learning_rate": 2.3387288763767095e-07,
"logits/chosen": -1.22265625,
"logits/rejected": -1.0703125,
"logps/chosen": -266.0,
"logps/rejected": -267.0,
"loss": 0.3058,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.6484375,
"rewards/margins": 2.7734375,
"rewards/rejected": -2.125,
"step": 485
},
{
"epoch": 0.5376106194690266,
"grad_norm": 13.798343658447266,
"learning_rate": 2.329782874367973e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.125,
"logps/chosen": -250.5,
"logps/rejected": -260.0,
"loss": 0.2997,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.568359375,
"rewards/margins": 2.703125,
"rewards/rejected": -2.125,
"step": 486
},
{
"epoch": 0.5387168141592921,
"grad_norm": 13.606473922729492,
"learning_rate": 2.3208390616040025e-07,
"logits/chosen": -1.234375,
"logits/rejected": -1.20703125,
"logps/chosen": -265.5,
"logps/rejected": -323.0,
"loss": 0.3473,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.7421875,
"rewards/margins": 2.4609375,
"rewards/rejected": -1.71875,
"step": 487
},
{
"epoch": 0.5398230088495575,
"grad_norm": 14.441394805908203,
"learning_rate": 2.3118975531155003e-07,
"logits/chosen": -1.22265625,
"logits/rejected": -1.14453125,
"logps/chosen": -257.5,
"logps/rejected": -281.0,
"loss": 0.3566,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.501953125,
"rewards/margins": 2.21875,
"rewards/rejected": -1.71875,
"step": 488
},
{
"epoch": 0.540929203539823,
"grad_norm": 13.99435043334961,
"learning_rate": 2.3029584639035284e-07,
"logits/chosen": -1.24609375,
"logits/rejected": -1.16796875,
"logps/chosen": -251.0,
"logps/rejected": -293.0,
"loss": 0.3419,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.701171875,
"rewards/margins": 2.5234375,
"rewards/rejected": -1.828125,
"step": 489
},
{
"epoch": 0.5420353982300885,
"grad_norm": 12.558284759521484,
"learning_rate": 2.294021908938039e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.05859375,
"logps/chosen": -243.0,
"logps/rejected": -249.5,
"loss": 0.2931,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.7470703125,
"rewards/margins": 2.9140625,
"rewards/rejected": -2.1640625,
"step": 490
},
{
"epoch": 0.543141592920354,
"grad_norm": 13.195667266845703,
"learning_rate": 2.285088003156384e-07,
"logits/chosen": -1.20703125,
"logits/rejected": -1.0625,
"logps/chosen": -268.0,
"logps/rejected": -308.0,
"loss": 0.33,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.669921875,
"rewards/margins": 2.5234375,
"rewards/rejected": -1.8515625,
"step": 491
},
{
"epoch": 0.5442477876106194,
"grad_norm": 13.966536521911621,
"learning_rate": 2.2761568614618472e-07,
"logits/chosen": -1.33203125,
"logits/rejected": -1.3203125,
"logps/chosen": -250.5,
"logps/rejected": -266.0,
"loss": 0.3732,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.763671875,
"rewards/margins": 2.3203125,
"rewards/rejected": -1.5546875,
"step": 492
},
{
"epoch": 0.5453539823008849,
"grad_norm": 13.840253829956055,
"learning_rate": 2.2672285987221625e-07,
"logits/chosen": -1.3359375,
"logits/rejected": -1.16015625,
"logps/chosen": -263.0,
"logps/rejected": -279.5,
"loss": 0.3326,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.62109375,
"rewards/margins": 2.4140625,
"rewards/rejected": -1.7890625,
"step": 493
},
{
"epoch": 0.5464601769911505,
"grad_norm": 13.471453666687012,
"learning_rate": 2.2583033297680315e-07,
"logits/chosen": -1.203125,
"logits/rejected": -1.06640625,
"logps/chosen": -274.0,
"logps/rejected": -307.0,
"loss": 0.3214,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.6025390625,
"rewards/margins": 2.6953125,
"rewards/rejected": -2.08984375,
"step": 494
},
{
"epoch": 0.547566371681416,
"grad_norm": 13.170902252197266,
"learning_rate": 2.2493811693916567e-07,
"logits/chosen": -1.3984375,
"logits/rejected": -1.13671875,
"logps/chosen": -255.5,
"logps/rejected": -285.0,
"loss": 0.2704,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.822265625,
"rewards/margins": 2.7421875,
"rewards/rejected": -1.91796875,
"step": 495
},
{
"epoch": 0.5486725663716814,
"grad_norm": 14.177343368530273,
"learning_rate": 2.2404622323452562e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.171875,
"logps/chosen": -238.0,
"logps/rejected": -284.0,
"loss": 0.3352,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.916015625,
"rewards/margins": 2.515625,
"rewards/rejected": -1.59765625,
"step": 496
},
{
"epoch": 0.5497787610619469,
"grad_norm": 10.760467529296875,
"learning_rate": 2.2315466333395924e-07,
"logits/chosen": -1.21484375,
"logits/rejected": -1.19921875,
"logps/chosen": -222.5,
"logps/rejected": -285.0,
"loss": 0.2274,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 1.1875,
"rewards/margins": 3.3515625,
"rewards/rejected": -2.1640625,
"step": 497
},
{
"epoch": 0.5508849557522124,
"grad_norm": 15.023584365844727,
"learning_rate": 2.222634487042496e-07,
"logits/chosen": -1.296875,
"logits/rejected": -1.2109375,
"logps/chosen": -256.5,
"logps/rejected": -279.5,
"loss": 0.3246,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.76953125,
"rewards/margins": 2.515625,
"rewards/rejected": -1.75,
"step": 498
},
{
"epoch": 0.5519911504424779,
"grad_norm": 12.802349090576172,
"learning_rate": 2.2137259080773896e-07,
"logits/chosen": -1.28515625,
"logits/rejected": -1.21875,
"logps/chosen": -246.5,
"logps/rejected": -256.0,
"loss": 0.2918,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 0.83203125,
"rewards/margins": 2.8203125,
"rewards/rejected": -1.98828125,
"step": 499
},
{
"epoch": 0.5530973451327433,
"grad_norm": 13.720869064331055,
"learning_rate": 2.204821011021815e-07,
"logits/chosen": -1.19921875,
"logits/rejected": -1.09765625,
"logps/chosen": -242.0,
"logps/rejected": -278.5,
"loss": 0.3394,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.6025390625,
"rewards/margins": 2.28125,
"rewards/rejected": -1.6796875,
"step": 500
},
{
"epoch": 0.5530973451327433,
"eval_logits/chosen": -1.2517879009246826,
"eval_logits/rejected": -1.1513915061950684,
"eval_logps/chosen": -252.49253845214844,
"eval_logps/rejected": -277.5074768066406,
"eval_loss": 0.3277411162853241,
"eval_rewards/accuracies": 0.8062752485275269,
"eval_rewards/chosen": 0.83104008436203,
"eval_rewards/margins": 2.661613702774048,
"eval_rewards/rejected": -1.8305736780166626,
"eval_runtime": 193.0734,
"eval_samples_per_second": 66.571,
"eval_steps_per_second": 1.041,
"step": 500
},
{
"epoch": 0.5542035398230089,
"grad_norm": 13.983589172363281,
"learning_rate": 2.195919910405961e-07,
"logits/chosen": -1.1875,
"logits/rejected": -1.0546875,
"logps/chosen": -243.5,
"logps/rejected": -268.0,
"loss": 0.3415,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.810546875,
"rewards/margins": 2.7109375,
"rewards/rejected": -1.89453125,
"step": 501
},
{
"epoch": 0.5553097345132744,
"grad_norm": 13.887030601501465,
"learning_rate": 2.1870227207111853e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.0703125,
"logps/chosen": -271.0,
"logps/rejected": -282.0,
"loss": 0.3074,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.68359375,
"rewards/margins": 2.65625,
"rewards/rejected": -1.9765625,
"step": 502
},
{
"epoch": 0.5564159292035398,
"grad_norm": 13.680130004882812,
"learning_rate": 2.1781295563685476e-07,
"logits/chosen": -1.15625,
"logits/rejected": -1.009765625,
"logps/chosen": -280.0,
"logps/rejected": -288.0,
"loss": 0.3024,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.8046875,
"rewards/margins": 2.8828125,
"rewards/rejected": -2.0859375,
"step": 503
},
{
"epoch": 0.5575221238938053,
"grad_norm": 14.351888656616211,
"learning_rate": 2.1692405317573366e-07,
"logits/chosen": -1.28515625,
"logits/rejected": -1.16796875,
"logps/chosen": -257.0,
"logps/rejected": -259.5,
"loss": 0.3655,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.689453125,
"rewards/margins": 2.515625,
"rewards/rejected": -1.82421875,
"step": 504
},
{
"epoch": 0.5586283185840708,
"grad_norm": 13.825343132019043,
"learning_rate": 2.1603557612035932e-07,
"logits/chosen": -1.1796875,
"logits/rejected": -1.1875,
"logps/chosen": -274.0,
"logps/rejected": -302.0,
"loss": 0.2957,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 1.046875,
"rewards/margins": 2.78125,
"rewards/rejected": -1.73828125,
"step": 505
},
{
"epoch": 0.5597345132743363,
"grad_norm": 15.617433547973633,
"learning_rate": 2.1514753589786516e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.1484375,
"logps/chosen": -257.5,
"logps/rejected": -283.0,
"loss": 0.3659,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.775390625,
"rewards/margins": 2.359375,
"rewards/rejected": -1.578125,
"step": 506
},
{
"epoch": 0.5608407079646017,
"grad_norm": 12.907238960266113,
"learning_rate": 2.1425994392976559e-07,
"logits/chosen": -1.22265625,
"logits/rejected": -1.1875,
"logps/chosen": -253.5,
"logps/rejected": -284.0,
"loss": 0.3268,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.8828125,
"rewards/margins": 2.875,
"rewards/rejected": -1.98828125,
"step": 507
},
{
"epoch": 0.5619469026548672,
"grad_norm": 12.990286827087402,
"learning_rate": 2.1337281163181034e-07,
"logits/chosen": -1.31640625,
"logits/rejected": -1.16015625,
"logps/chosen": -280.0,
"logps/rejected": -278.0,
"loss": 0.2744,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.853515625,
"rewards/margins": 2.7265625,
"rewards/rejected": -1.875,
"step": 508
},
{
"epoch": 0.5630530973451328,
"grad_norm": 10.47897720336914,
"learning_rate": 2.1248615041383682e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.1328125,
"logps/chosen": -234.5,
"logps/rejected": -279.5,
"loss": 0.2341,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.822265625,
"rewards/margins": 3.046875,
"rewards/rejected": -2.21875,
"step": 509
},
{
"epoch": 0.5641592920353983,
"grad_norm": 12.620576858520508,
"learning_rate": 2.1159997167962378e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.078125,
"logps/chosen": -229.5,
"logps/rejected": -265.0,
"loss": 0.3217,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.779296875,
"rewards/margins": 2.5625,
"rewards/rejected": -1.78515625,
"step": 510
},
{
"epoch": 0.5652654867256637,
"grad_norm": 14.882222175598145,
"learning_rate": 2.1071428682674436e-07,
"logits/chosen": -1.234375,
"logits/rejected": -1.140625,
"logps/chosen": -258.0,
"logps/rejected": -293.0,
"loss": 0.3461,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 1.03125,
"rewards/margins": 2.65625,
"rewards/rejected": -1.625,
"step": 511
},
{
"epoch": 0.5663716814159292,
"grad_norm": 14.687088966369629,
"learning_rate": 2.098291072464199e-07,
"logits/chosen": -1.234375,
"logits/rejected": -1.21875,
"logps/chosen": -250.5,
"logps/rejected": -307.0,
"loss": 0.3347,
"rewards/accuracies": 0.734375,
"rewards/chosen": 1.025390625,
"rewards/margins": 2.8203125,
"rewards/rejected": -1.796875,
"step": 512
},
{
"epoch": 0.5674778761061947,
"grad_norm": 13.261859893798828,
"learning_rate": 2.0894444432337282e-07,
"logits/chosen": -1.41796875,
"logits/rejected": -1.18359375,
"logps/chosen": -252.5,
"logps/rejected": -263.0,
"loss": 0.2804,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.830078125,
"rewards/margins": 2.796875,
"rewards/rejected": -1.96875,
"step": 513
},
{
"epoch": 0.5685840707964602,
"grad_norm": 15.263360977172852,
"learning_rate": 2.08060309435681e-07,
"logits/chosen": -1.359375,
"logits/rejected": -1.21484375,
"logps/chosen": -267.0,
"logps/rejected": -302.0,
"loss": 0.3238,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.931640625,
"rewards/margins": 2.7734375,
"rewards/rejected": -1.84375,
"step": 514
},
{
"epoch": 0.5696902654867256,
"grad_norm": 13.30583667755127,
"learning_rate": 2.071767139546306e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.09375,
"logps/chosen": -253.5,
"logps/rejected": -301.0,
"loss": 0.3201,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.810546875,
"rewards/margins": 2.609375,
"rewards/rejected": -1.796875,
"step": 515
},
{
"epoch": 0.5707964601769911,
"grad_norm": 13.454291343688965,
"learning_rate": 2.062936692445705e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.125,
"logps/chosen": -244.0,
"logps/rejected": -285.0,
"loss": 0.3014,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.8046875,
"rewards/margins": 2.6953125,
"rewards/rejected": -1.8984375,
"step": 516
},
{
"epoch": 0.5719026548672567,
"grad_norm": 14.328228950500488,
"learning_rate": 2.0541118666276577e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.17578125,
"logps/chosen": -261.0,
"logps/rejected": -314.0,
"loss": 0.331,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.90625,
"rewards/margins": 2.609375,
"rewards/rejected": -1.70703125,
"step": 517
},
{
"epoch": 0.5730088495575221,
"grad_norm": 15.810956001281738,
"learning_rate": 2.045292775592515e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.1875,
"logps/chosen": -258.5,
"logps/rejected": -287.0,
"loss": 0.3654,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.828125,
"rewards/margins": 2.765625,
"rewards/rejected": -1.9375,
"step": 518
},
{
"epoch": 0.5741150442477876,
"grad_norm": 14.267369270324707,
"learning_rate": 2.0364795327668722e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.078125,
"logps/chosen": -290.0,
"logps/rejected": -282.0,
"loss": 0.3298,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.5419921875,
"rewards/margins": 2.453125,
"rewards/rejected": -1.9140625,
"step": 519
},
{
"epoch": 0.5752212389380531,
"grad_norm": 17.044023513793945,
"learning_rate": 2.0276722515021084e-07,
"logits/chosen": -1.3125,
"logits/rejected": -1.15625,
"logps/chosen": -254.5,
"logps/rejected": -288.0,
"loss": 0.4207,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 0.66015625,
"rewards/margins": 2.12890625,
"rewards/rejected": -1.46875,
"step": 520
},
{
"epoch": 0.5763274336283186,
"grad_norm": 14.049108505249023,
"learning_rate": 2.0188710450729253e-07,
"logits/chosen": -1.19140625,
"logits/rejected": -1.21875,
"logps/chosen": -235.5,
"logps/rejected": -283.0,
"loss": 0.3075,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.232421875,
"rewards/margins": 3.1640625,
"rewards/rejected": -1.9375,
"step": 521
},
{
"epoch": 0.577433628318584,
"grad_norm": 14.81714153289795,
"learning_rate": 2.0100760266758953e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.08984375,
"logps/chosen": -253.5,
"logps/rejected": -244.5,
"loss": 0.3601,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.3740234375,
"rewards/margins": 2.3671875,
"rewards/rejected": -2.0,
"step": 522
},
{
"epoch": 0.5785398230088495,
"grad_norm": 12.647814750671387,
"learning_rate": 2.0012873094280032e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.140625,
"logps/chosen": -254.5,
"logps/rejected": -297.0,
"loss": 0.2831,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.060546875,
"rewards/margins": 3.125,
"rewards/rejected": -2.0625,
"step": 523
},
{
"epoch": 0.5796460176991151,
"grad_norm": 13.28409481048584,
"learning_rate": 1.992505006365191e-07,
"logits/chosen": -1.1484375,
"logits/rejected": -1.11328125,
"logps/chosen": -268.0,
"logps/rejected": -295.0,
"loss": 0.2986,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.8828125,
"rewards/margins": 2.859375,
"rewards/rejected": -1.96875,
"step": 524
},
{
"epoch": 0.5807522123893806,
"grad_norm": 12.828714370727539,
"learning_rate": 1.983729230440907e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.26171875,
"logps/chosen": -241.5,
"logps/rejected": -292.0,
"loss": 0.2867,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.021484375,
"rewards/margins": 3.0703125,
"rewards/rejected": -2.0546875,
"step": 525
},
{
"epoch": 0.581858407079646,
"grad_norm": 13.270194053649902,
"learning_rate": 1.974960094524647e-07,
"logits/chosen": -1.1875,
"logits/rejected": -1.13671875,
"logps/chosen": -252.5,
"logps/rejected": -284.0,
"loss": 0.3036,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.837890625,
"rewards/margins": 2.71875,
"rewards/rejected": -1.87890625,
"step": 526
},
{
"epoch": 0.5829646017699115,
"grad_norm": 14.76196575164795,
"learning_rate": 1.9661977114005095e-07,
"logits/chosen": -1.2109375,
"logits/rejected": -1.09375,
"logps/chosen": -266.0,
"logps/rejected": -282.0,
"loss": 0.3643,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 0.83984375,
"rewards/margins": 2.65625,
"rewards/rejected": -1.81640625,
"step": 527
},
{
"epoch": 0.584070796460177,
"grad_norm": 13.863215446472168,
"learning_rate": 1.9574421937657423e-07,
"logits/chosen": -1.2265625,
"logits/rejected": -1.171875,
"logps/chosen": -261.0,
"logps/rejected": -295.0,
"loss": 0.2729,
"rewards/accuracies": 0.84375,
"rewards/chosen": 1.02734375,
"rewards/margins": 3.078125,
"rewards/rejected": -2.0546875,
"step": 528
},
{
"epoch": 0.5851769911504425,
"grad_norm": 17.09484100341797,
"learning_rate": 1.9486936542292897e-07,
"logits/chosen": -1.2734375,
"logits/rejected": -1.1484375,
"logps/chosen": -281.0,
"logps/rejected": -284.0,
"loss": 0.4144,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.681640625,
"rewards/margins": 2.203125,
"rewards/rejected": -1.5234375,
"step": 529
},
{
"epoch": 0.5862831858407079,
"grad_norm": 13.073324203491211,
"learning_rate": 1.9399522053103512e-07,
"logits/chosen": -1.22265625,
"logits/rejected": -1.16015625,
"logps/chosen": -256.5,
"logps/rejected": -272.5,
"loss": 0.3109,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.90234375,
"rewards/margins": 2.734375,
"rewards/rejected": -1.83984375,
"step": 530
},
{
"epoch": 0.5873893805309734,
"grad_norm": 16.40873908996582,
"learning_rate": 1.9312179594369267e-07,
"logits/chosen": -1.171875,
"logits/rejected": -1.1015625,
"logps/chosen": -270.0,
"logps/rejected": -298.0,
"loss": 0.3547,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.759765625,
"rewards/margins": 2.5078125,
"rewards/rejected": -1.75,
"step": 531
},
{
"epoch": 0.588495575221239,
"grad_norm": 13.864864349365234,
"learning_rate": 1.9224910289443766e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.13671875,
"logps/chosen": -233.5,
"logps/rejected": -259.0,
"loss": 0.367,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.880859375,
"rewards/margins": 2.4765625,
"rewards/rejected": -1.6015625,
"step": 532
},
{
"epoch": 0.5896017699115044,
"grad_norm": 11.59555721282959,
"learning_rate": 1.913771526073976e-07,
"logits/chosen": -1.3203125,
"logits/rejected": -1.12890625,
"logps/chosen": -254.0,
"logps/rejected": -295.0,
"loss": 0.2546,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.154296875,
"rewards/margins": 2.9140625,
"rewards/rejected": -1.75390625,
"step": 533
},
{
"epoch": 0.5907079646017699,
"grad_norm": 559.9500122070312,
"learning_rate": 1.9050595629714654e-07,
"logits/chosen": -1.2265625,
"logits/rejected": -0.99609375,
"logps/chosen": -270.0,
"logps/rejected": -344.0,
"loss": 0.3657,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.978515625,
"rewards/margins": 2.65625,
"rewards/rejected": -1.671875,
"step": 534
},
{
"epoch": 0.5918141592920354,
"grad_norm": 12.260162353515625,
"learning_rate": 1.8963552516856158e-07,
"logits/chosen": -1.2734375,
"logits/rejected": -1.16015625,
"logps/chosen": -242.0,
"logps/rejected": -265.5,
"loss": 0.2995,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.09375,
"rewards/margins": 3.015625,
"rewards/rejected": -1.921875,
"step": 535
},
{
"epoch": 0.5929203539823009,
"grad_norm": 15.339058876037598,
"learning_rate": 1.8876587041667852e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.20703125,
"logps/chosen": -241.0,
"logps/rejected": -265.0,
"loss": 0.3577,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.865234375,
"rewards/margins": 2.3984375,
"rewards/rejected": -1.52734375,
"step": 536
},
{
"epoch": 0.5940265486725663,
"grad_norm": 16.724506378173828,
"learning_rate": 1.8789700322654747e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.046875,
"logps/chosen": -251.0,
"logps/rejected": -289.0,
"loss": 0.2921,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.802734375,
"rewards/margins": 2.90625,
"rewards/rejected": -2.1015625,
"step": 537
},
{
"epoch": 0.5951327433628318,
"grad_norm": 14.673587799072266,
"learning_rate": 1.8702893477308972e-07,
"logits/chosen": -1.2265625,
"logits/rejected": -1.078125,
"logps/chosen": -253.0,
"logps/rejected": -261.0,
"loss": 0.3511,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.869140625,
"rewards/margins": 2.671875,
"rewards/rejected": -1.8046875,
"step": 538
},
{
"epoch": 0.5962389380530974,
"grad_norm": 15.79550552368164,
"learning_rate": 1.8616167622095324e-07,
"logits/chosen": -1.203125,
"logits/rejected": -1.12109375,
"logps/chosen": -267.0,
"logps/rejected": -313.0,
"loss": 0.3384,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.021484375,
"rewards/margins": 2.671875,
"rewards/rejected": -1.6484375,
"step": 539
},
{
"epoch": 0.5973451327433629,
"grad_norm": 14.441572189331055,
"learning_rate": 1.8529523872436977e-07,
"logits/chosen": -1.37890625,
"logits/rejected": -1.1640625,
"logps/chosen": -249.5,
"logps/rejected": -272.0,
"loss": 0.3185,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.921875,
"rewards/margins": 2.671875,
"rewards/rejected": -1.75,
"step": 540
},
{
"epoch": 0.5984513274336283,
"grad_norm": 13.769876480102539,
"learning_rate": 1.8442963342701105e-07,
"logits/chosen": -1.14453125,
"logits/rejected": -1.1875,
"logps/chosen": -277.0,
"logps/rejected": -275.0,
"loss": 0.2794,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.994140625,
"rewards/margins": 2.9375,
"rewards/rejected": -1.9453125,
"step": 541
},
{
"epoch": 0.5995575221238938,
"grad_norm": 14.039227485656738,
"learning_rate": 1.8356487146184516e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.20703125,
"logps/chosen": -234.0,
"logps/rejected": -252.5,
"loss": 0.3448,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.119140625,
"rewards/margins": 2.78125,
"rewards/rejected": -1.6640625,
"step": 542
},
{
"epoch": 0.6006637168141593,
"grad_norm": 12.930294036865234,
"learning_rate": 1.8270096395099403e-07,
"logits/chosen": -1.31640625,
"logits/rejected": -1.171875,
"logps/chosen": -242.5,
"logps/rejected": -273.0,
"loss": 0.298,
"rewards/accuracies": 0.859375,
"rewards/chosen": 1.0859375,
"rewards/margins": 2.703125,
"rewards/rejected": -1.6171875,
"step": 543
},
{
"epoch": 0.6017699115044248,
"grad_norm": 13.82011890411377,
"learning_rate": 1.8183792200559e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.1640625,
"logps/chosen": -256.0,
"logps/rejected": -287.0,
"loss": 0.3418,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.943359375,
"rewards/margins": 2.359375,
"rewards/rejected": -1.4140625,
"step": 544
},
{
"epoch": 0.6028761061946902,
"grad_norm": 15.775805473327637,
"learning_rate": 1.8097575672563275e-07,
"logits/chosen": -1.34375,
"logits/rejected": -1.15625,
"logps/chosen": -249.0,
"logps/rejected": -275.0,
"loss": 0.2854,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.1875,
"rewards/margins": 3.1171875,
"rewards/rejected": -1.92578125,
"step": 545
},
{
"epoch": 0.6039823008849557,
"grad_norm": 12.980159759521484,
"learning_rate": 1.80114479199847e-07,
"logits/chosen": -1.1171875,
"logits/rejected": -1.140625,
"logps/chosen": -263.0,
"logps/rejected": -277.0,
"loss": 0.2444,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.23046875,
"rewards/margins": 3.28125,
"rewards/rejected": -2.0546875,
"step": 546
},
{
"epoch": 0.6050884955752213,
"grad_norm": 13.192628860473633,
"learning_rate": 1.792541005055394e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.22265625,
"logps/chosen": -254.0,
"logps/rejected": -286.0,
"loss": 0.3065,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.138671875,
"rewards/margins": 2.8984375,
"rewards/rejected": -1.7578125,
"step": 547
},
{
"epoch": 0.6061946902654868,
"grad_norm": 13.52103328704834,
"learning_rate": 1.783946317084564e-07,
"logits/chosen": -1.328125,
"logits/rejected": -1.08984375,
"logps/chosen": -253.0,
"logps/rejected": -269.0,
"loss": 0.2691,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 1.01953125,
"rewards/margins": 2.9765625,
"rewards/rejected": -1.953125,
"step": 548
},
{
"epoch": 0.6073008849557522,
"grad_norm": 12.777830123901367,
"learning_rate": 1.7753608386264193e-07,
"logits/chosen": -1.19921875,
"logits/rejected": -1.20703125,
"logps/chosen": -225.0,
"logps/rejected": -274.0,
"loss": 0.3203,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.09765625,
"rewards/margins": 2.7578125,
"rewards/rejected": -1.66796875,
"step": 549
},
{
"epoch": 0.6084070796460177,
"grad_norm": 13.757856369018555,
"learning_rate": 1.7667846801029486e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.13671875,
"logps/chosen": -264.0,
"logps/rejected": -278.0,
"loss": 0.2789,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 1.126953125,
"rewards/margins": 2.640625,
"rewards/rejected": -1.515625,
"step": 550
},
{
"epoch": 0.6084070796460177,
"eval_logits/chosen": -1.262554407119751,
"eval_logits/rejected": -1.1598647832870483,
"eval_logps/chosen": -250.73133850097656,
"eval_logps/rejected": -276.39801025390625,
"eval_loss": 0.32494351267814636,
"eval_rewards/accuracies": 0.8066800236701965,
"eval_rewards/chosen": 1.011232852935791,
"eval_rewards/margins": 2.7289724349975586,
"eval_rewards/rejected": -1.717836618423462,
"eval_runtime": 193.0762,
"eval_samples_per_second": 66.57,
"eval_steps_per_second": 1.041,
"step": 550
},
{
"epoch": 0.6095132743362832,
"grad_norm": 16.458438873291016,
"learning_rate": 1.758217951816274e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.15625,
"logps/chosen": -289.0,
"logps/rejected": -310.0,
"loss": 0.3871,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 0.57421875,
"rewards/margins": 2.15625,
"rewards/rejected": -1.5859375,
"step": 551
},
{
"epoch": 0.6106194690265486,
"grad_norm": 15.513572692871094,
"learning_rate": 1.7496607639472327e-07,
"logits/chosen": -1.2265625,
"logits/rejected": -1.19921875,
"logps/chosen": -242.0,
"logps/rejected": -270.0,
"loss": 0.33,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.00390625,
"rewards/margins": 2.9375,
"rewards/rejected": -1.9375,
"step": 552
},
{
"epoch": 0.6117256637168141,
"grad_norm": 15.365250587463379,
"learning_rate": 1.7411132265539536e-07,
"logits/chosen": -1.203125,
"logits/rejected": -1.0625,
"logps/chosen": -250.0,
"logps/rejected": -297.0,
"loss": 0.3456,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.931640625,
"rewards/margins": 2.546875,
"rewards/rejected": -1.6171875,
"step": 553
},
{
"epoch": 0.6128318584070797,
"grad_norm": 14.896991729736328,
"learning_rate": 1.7325754495704507e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.1875,
"logps/chosen": -267.5,
"logps/rejected": -315.0,
"loss": 0.3605,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.787109375,
"rewards/margins": 2.3671875,
"rewards/rejected": -1.578125,
"step": 554
},
{
"epoch": 0.6139380530973452,
"grad_norm": 14.317460060119629,
"learning_rate": 1.7240475428051997e-07,
"logits/chosen": -1.34765625,
"logits/rejected": -1.1328125,
"logps/chosen": -247.0,
"logps/rejected": -268.0,
"loss": 0.3123,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.96875,
"rewards/margins": 2.75,
"rewards/rejected": -1.78515625,
"step": 555
},
{
"epoch": 0.6150442477876106,
"grad_norm": 13.919463157653809,
"learning_rate": 1.7155296159397356e-07,
"logits/chosen": -1.22265625,
"logits/rejected": -1.19140625,
"logps/chosen": -261.5,
"logps/rejected": -304.0,
"loss": 0.3188,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.88671875,
"rewards/margins": 2.7734375,
"rewards/rejected": -1.890625,
"step": 556
},
{
"epoch": 0.6161504424778761,
"grad_norm": 15.724615097045898,
"learning_rate": 1.707021778527235e-07,
"logits/chosen": -1.34765625,
"logits/rejected": -1.1953125,
"logps/chosen": -278.0,
"logps/rejected": -298.0,
"loss": 0.3411,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.0625,
"rewards/margins": 2.5234375,
"rewards/rejected": -1.4609375,
"step": 557
},
{
"epoch": 0.6172566371681416,
"grad_norm": 11.178709983825684,
"learning_rate": 1.6985241399911082e-07,
"logits/chosen": -1.41796875,
"logits/rejected": -1.21484375,
"logps/chosen": -234.5,
"logps/rejected": -259.0,
"loss": 0.2349,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.91015625,
"rewards/margins": 3.2265625,
"rewards/rejected": -2.3125,
"step": 558
},
{
"epoch": 0.6183628318584071,
"grad_norm": 13.019828796386719,
"learning_rate": 1.6900368096235931e-07,
"logits/chosen": -1.328125,
"logits/rejected": -1.1953125,
"logps/chosen": -227.5,
"logps/rejected": -288.0,
"loss": 0.3063,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.208984375,
"rewards/margins": 3.265625,
"rewards/rejected": -2.05078125,
"step": 559
},
{
"epoch": 0.6194690265486725,
"grad_norm": 15.409528732299805,
"learning_rate": 1.6815598965843519e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.15625,
"logps/chosen": -266.0,
"logps/rejected": -326.0,
"loss": 0.2972,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.12109375,
"rewards/margins": 3.1875,
"rewards/rejected": -2.0703125,
"step": 560
},
{
"epoch": 0.620575221238938,
"grad_norm": 13.341466903686523,
"learning_rate": 1.67309350989906e-07,
"logits/chosen": -1.37109375,
"logits/rejected": -1.09765625,
"logps/chosen": -257.0,
"logps/rejected": -259.5,
"loss": 0.3021,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.78125,
"rewards/margins": 2.65625,
"rewards/rejected": -1.875,
"step": 561
},
{
"epoch": 0.6216814159292036,
"grad_norm": 13.608122825622559,
"learning_rate": 1.664637758458013e-07,
"logits/chosen": -1.3828125,
"logits/rejected": -1.078125,
"logps/chosen": -248.5,
"logps/rejected": -238.5,
"loss": 0.3346,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.4716796875,
"rewards/margins": 2.3359375,
"rewards/rejected": -1.86328125,
"step": 562
},
{
"epoch": 0.6227876106194691,
"grad_norm": 14.688355445861816,
"learning_rate": 1.656192751014717e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.13671875,
"logps/chosen": -266.0,
"logps/rejected": -298.0,
"loss": 0.3522,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.90234375,
"rewards/margins": 2.578125,
"rewards/rejected": -1.67578125,
"step": 563
},
{
"epoch": 0.6238938053097345,
"grad_norm": 13.295293807983398,
"learning_rate": 1.647758596184498e-07,
"logits/chosen": -1.1875,
"logits/rejected": -1.17578125,
"logps/chosen": -262.5,
"logps/rejected": -287.0,
"loss": 0.3039,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.849609375,
"rewards/margins": 2.4921875,
"rewards/rejected": -1.64453125,
"step": 564
},
{
"epoch": 0.625,
"grad_norm": 12.697134017944336,
"learning_rate": 1.6393354024431e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.05078125,
"logps/chosen": -257.5,
"logps/rejected": -265.0,
"loss": 0.2807,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.87109375,
"rewards/margins": 3.015625,
"rewards/rejected": -2.14453125,
"step": 565
},
{
"epoch": 0.6261061946902655,
"grad_norm": 23.94145965576172,
"learning_rate": 1.63092327812529e-07,
"logits/chosen": -1.34375,
"logits/rejected": -1.2734375,
"logps/chosen": -246.5,
"logps/rejected": -232.5,
"loss": 0.3647,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.576171875,
"rewards/margins": 2.421875,
"rewards/rejected": -1.84375,
"step": 566
},
{
"epoch": 0.6272123893805309,
"grad_norm": 13.088120460510254,
"learning_rate": 1.622522331423467e-07,
"logits/chosen": -1.2109375,
"logits/rejected": -1.1640625,
"logps/chosen": -261.0,
"logps/rejected": -307.0,
"loss": 0.307,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.875,
"rewards/margins": 2.8046875,
"rewards/rejected": -1.92578125,
"step": 567
},
{
"epoch": 0.6283185840707964,
"grad_norm": 16.868358612060547,
"learning_rate": 1.6141326703862706e-07,
"logits/chosen": -1.2734375,
"logits/rejected": -1.24609375,
"logps/chosen": -260.0,
"logps/rejected": -292.0,
"loss": 0.4,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.720703125,
"rewards/margins": 2.5703125,
"rewards/rejected": -1.8515625,
"step": 568
},
{
"epoch": 0.629424778761062,
"grad_norm": 13.88227653503418,
"learning_rate": 1.605754402917186e-07,
"logits/chosen": -1.43359375,
"logits/rejected": -1.28125,
"logps/chosen": -245.0,
"logps/rejected": -266.5,
"loss": 0.2863,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.96484375,
"rewards/margins": 2.9296875,
"rewards/rejected": -1.96875,
"step": 569
},
{
"epoch": 0.6305309734513275,
"grad_norm": 13.3289794921875,
"learning_rate": 1.5973876367731651e-07,
"logits/chosen": -1.35546875,
"logits/rejected": -1.08203125,
"logps/chosen": -280.0,
"logps/rejected": -306.0,
"loss": 0.2719,
"rewards/accuracies": 0.84375,
"rewards/chosen": 1.14453125,
"rewards/margins": 3.078125,
"rewards/rejected": -1.93359375,
"step": 570
},
{
"epoch": 0.6316371681415929,
"grad_norm": 13.53636360168457,
"learning_rate": 1.5890324795632315e-07,
"logits/chosen": -1.3671875,
"logits/rejected": -1.28515625,
"logps/chosen": -223.0,
"logps/rejected": -262.0,
"loss": 0.3,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.017578125,
"rewards/margins": 2.90625,
"rewards/rejected": -1.89453125,
"step": 571
},
{
"epoch": 0.6327433628318584,
"grad_norm": 14.343642234802246,
"learning_rate": 1.5806890387471023e-07,
"logits/chosen": -1.3359375,
"logits/rejected": -1.12109375,
"logps/chosen": -267.0,
"logps/rejected": -291.0,
"loss": 0.2824,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.9453125,
"rewards/margins": 2.984375,
"rewards/rejected": -2.04296875,
"step": 572
},
{
"epoch": 0.6338495575221239,
"grad_norm": 13.577574729919434,
"learning_rate": 1.5723574216338065e-07,
"logits/chosen": -1.26171875,
"logits/rejected": -1.17578125,
"logps/chosen": -272.0,
"logps/rejected": -276.0,
"loss": 0.2799,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.9296875,
"rewards/margins": 2.8125,
"rewards/rejected": -1.8828125,
"step": 573
},
{
"epoch": 0.6349557522123894,
"grad_norm": 17.00868797302246,
"learning_rate": 1.5640377353802985e-07,
"logits/chosen": -1.3125,
"logits/rejected": -1.025390625,
"logps/chosen": -286.0,
"logps/rejected": -286.0,
"loss": 0.3574,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.73828125,
"rewards/margins": 2.6015625,
"rewards/rejected": -1.859375,
"step": 574
},
{
"epoch": 0.6360619469026548,
"grad_norm": 14.934076309204102,
"learning_rate": 1.5557300869900874e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.09375,
"logps/chosen": -281.5,
"logps/rejected": -320.0,
"loss": 0.347,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.7509765625,
"rewards/margins": 2.47265625,
"rewards/rejected": -1.7265625,
"step": 575
},
{
"epoch": 0.6371681415929203,
"grad_norm": 14.064445495605469,
"learning_rate": 1.547434583311858e-07,
"logits/chosen": -1.234375,
"logits/rejected": -1.06640625,
"logps/chosen": -262.0,
"logps/rejected": -262.0,
"loss": 0.374,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.5078125,
"rewards/margins": 2.16796875,
"rewards/rejected": -1.65625,
"step": 576
},
{
"epoch": 0.6382743362831859,
"grad_norm": 14.291051864624023,
"learning_rate": 1.5391513310380923e-07,
"logits/chosen": -1.17578125,
"logits/rejected": -1.1484375,
"logps/chosen": -264.5,
"logps/rejected": -322.0,
"loss": 0.2885,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.8671875,
"rewards/margins": 2.8359375,
"rewards/rejected": -1.96484375,
"step": 577
},
{
"epoch": 0.6393805309734514,
"grad_norm": 15.690069198608398,
"learning_rate": 1.5308804367037049e-07,
"logits/chosen": -1.37890625,
"logits/rejected": -1.12109375,
"logps/chosen": -271.0,
"logps/rejected": -313.0,
"loss": 0.3193,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.921875,
"rewards/margins": 2.8125,
"rewards/rejected": -1.88671875,
"step": 578
},
{
"epoch": 0.6404867256637168,
"grad_norm": 14.940479278564453,
"learning_rate": 1.5226220066846662e-07,
"logits/chosen": -1.20703125,
"logits/rejected": -1.12109375,
"logps/chosen": -277.0,
"logps/rejected": -313.0,
"loss": 0.317,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.912109375,
"rewards/margins": 2.7421875,
"rewards/rejected": -1.828125,
"step": 579
},
{
"epoch": 0.6415929203539823,
"grad_norm": 14.204512596130371,
"learning_rate": 1.5143761471966387e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.1953125,
"logps/chosen": -267.0,
"logps/rejected": -296.0,
"loss": 0.2923,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.935546875,
"rewards/margins": 2.9453125,
"rewards/rejected": -2.01953125,
"step": 580
},
{
"epoch": 0.6426991150442478,
"grad_norm": 12.345739364624023,
"learning_rate": 1.5061429642936104e-07,
"logits/chosen": -1.2734375,
"logits/rejected": -1.15625,
"logps/chosen": -238.5,
"logps/rejected": -271.0,
"loss": 0.2898,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.078125,
"rewards/margins": 3.0390625,
"rewards/rejected": -1.9609375,
"step": 581
},
{
"epoch": 0.6438053097345132,
"grad_norm": 14.108118057250977,
"learning_rate": 1.497922563866526e-07,
"logits/chosen": -1.25390625,
"logits/rejected": -1.25390625,
"logps/chosen": -225.5,
"logps/rejected": -276.0,
"loss": 0.3588,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.01171875,
"rewards/margins": 2.5078125,
"rewards/rejected": -1.4921875,
"step": 582
},
{
"epoch": 0.6449115044247787,
"grad_norm": 15.642598152160645,
"learning_rate": 1.4897150516419315e-07,
"logits/chosen": -1.33984375,
"logits/rejected": -1.09765625,
"logps/chosen": -262.5,
"logps/rejected": -281.0,
"loss": 0.3357,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.962890625,
"rewards/margins": 2.671875,
"rewards/rejected": -1.70703125,
"step": 583
},
{
"epoch": 0.6460176991150443,
"grad_norm": 13.628485679626465,
"learning_rate": 1.481520533180611e-07,
"logits/chosen": -1.2734375,
"logits/rejected": -1.16796875,
"logps/chosen": -245.0,
"logps/rejected": -250.0,
"loss": 0.2903,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.8515625,
"rewards/margins": 2.8359375,
"rewards/rejected": -1.9921875,
"step": 584
},
{
"epoch": 0.6471238938053098,
"grad_norm": 12.115748405456543,
"learning_rate": 1.4733391138762275e-07,
"logits/chosen": -1.265625,
"logits/rejected": -1.21484375,
"logps/chosen": -237.0,
"logps/rejected": -255.5,
"loss": 0.2511,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.16796875,
"rewards/margins": 3.484375,
"rewards/rejected": -2.3125,
"step": 585
},
{
"epoch": 0.6482300884955752,
"grad_norm": 12.073527336120605,
"learning_rate": 1.4651708989539733e-07,
"logits/chosen": -1.3203125,
"logits/rejected": -1.2265625,
"logps/chosen": -255.0,
"logps/rejected": -251.5,
"loss": 0.27,
"rewards/accuracies": 0.84375,
"rewards/chosen": 1.025390625,
"rewards/margins": 2.8671875,
"rewards/rejected": -1.84375,
"step": 586
},
{
"epoch": 0.6493362831858407,
"grad_norm": 15.416234016418457,
"learning_rate": 1.4570159934692084e-07,
"logits/chosen": -1.2265625,
"logits/rejected": -1.234375,
"logps/chosen": -264.0,
"logps/rejected": -290.0,
"loss": 0.4144,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.69921875,
"rewards/margins": 2.203125,
"rewards/rejected": -1.50390625,
"step": 587
},
{
"epoch": 0.6504424778761062,
"grad_norm": 14.182881355285645,
"learning_rate": 1.448874502306116e-07,
"logits/chosen": -1.21875,
"logits/rejected": -1.0859375,
"logps/chosen": -262.5,
"logps/rejected": -280.0,
"loss": 0.3193,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.9921875,
"rewards/margins": 2.7421875,
"rewards/rejected": -1.75390625,
"step": 588
},
{
"epoch": 0.6515486725663717,
"grad_norm": 14.108832359313965,
"learning_rate": 1.4407465301763532e-07,
"logits/chosen": -1.37890625,
"logits/rejected": -1.21875,
"logps/chosen": -249.0,
"logps/rejected": -257.5,
"loss": 0.355,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.75,
"rewards/margins": 2.734375,
"rewards/rejected": -1.98046875,
"step": 589
},
{
"epoch": 0.6526548672566371,
"grad_norm": 14.618428230285645,
"learning_rate": 1.432632181617698e-07,
"logits/chosen": -1.24609375,
"logits/rejected": -1.02734375,
"logps/chosen": -243.0,
"logps/rejected": -281.0,
"loss": 0.313,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.0078125,
"rewards/margins": 2.9296875,
"rewards/rejected": -1.92578125,
"step": 590
},
{
"epoch": 0.6537610619469026,
"grad_norm": 15.182010650634766,
"learning_rate": 1.4245315609927112e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.14453125,
"logps/chosen": -262.0,
"logps/rejected": -269.0,
"loss": 0.3443,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.732421875,
"rewards/margins": 2.6484375,
"rewards/rejected": -1.91015625,
"step": 591
},
{
"epoch": 0.6548672566371682,
"grad_norm": 14.359109878540039,
"learning_rate": 1.4164447724873933e-07,
"logits/chosen": -1.19140625,
"logits/rejected": -1.16015625,
"logps/chosen": -253.5,
"logps/rejected": -288.0,
"loss": 0.3191,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.93359375,
"rewards/margins": 2.6171875,
"rewards/rejected": -1.6875,
"step": 592
},
{
"epoch": 0.6559734513274337,
"grad_norm": 14.253448486328125,
"learning_rate": 1.4083719201098402e-07,
"logits/chosen": -1.3359375,
"logits/rejected": -1.2421875,
"logps/chosen": -251.5,
"logps/rejected": -288.0,
"loss": 0.3304,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.8203125,
"rewards/margins": 2.4765625,
"rewards/rejected": -1.65625,
"step": 593
},
{
"epoch": 0.6570796460176991,
"grad_norm": 14.521297454833984,
"learning_rate": 1.400313107688912e-07,
"logits/chosen": -1.37109375,
"logits/rejected": -1.19140625,
"logps/chosen": -250.0,
"logps/rejected": -260.0,
"loss": 0.3297,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.982421875,
"rewards/margins": 2.859375,
"rewards/rejected": -1.875,
"step": 594
},
{
"epoch": 0.6581858407079646,
"grad_norm": 13.660658836364746,
"learning_rate": 1.39226843887289e-07,
"logits/chosen": -1.26171875,
"logits/rejected": -1.19140625,
"logps/chosen": -235.0,
"logps/rejected": -296.0,
"loss": 0.3347,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.96875,
"rewards/margins": 2.9609375,
"rewards/rejected": -1.9921875,
"step": 595
},
{
"epoch": 0.6592920353982301,
"grad_norm": 15.24756908416748,
"learning_rate": 1.384238017128152e-07,
"logits/chosen": -1.2734375,
"logits/rejected": -1.12109375,
"logps/chosen": -241.5,
"logps/rejected": -274.0,
"loss": 0.3958,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.884765625,
"rewards/margins": 2.6875,
"rewards/rejected": -1.80859375,
"step": 596
},
{
"epoch": 0.6603982300884956,
"grad_norm": 14.213970184326172,
"learning_rate": 1.3762219457378354e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.12109375,
"logps/chosen": -240.0,
"logps/rejected": -288.0,
"loss": 0.2724,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.1484375,
"rewards/margins": 2.84375,
"rewards/rejected": -1.6953125,
"step": 597
},
{
"epoch": 0.661504424778761,
"grad_norm": 13.295817375183105,
"learning_rate": 1.3682203278005095e-07,
"logits/chosen": -1.203125,
"logits/rejected": -1.11328125,
"logps/chosen": -267.0,
"logps/rejected": -287.0,
"loss": 0.2403,
"rewards/accuracies": 0.890625,
"rewards/chosen": 1.296875,
"rewards/margins": 3.078125,
"rewards/rejected": -1.77734375,
"step": 598
},
{
"epoch": 0.6626106194690266,
"grad_norm": 13.04489517211914,
"learning_rate": 1.3602332662288534e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.078125,
"logps/chosen": -262.0,
"logps/rejected": -269.0,
"loss": 0.2891,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.86328125,
"rewards/margins": 2.71875,
"rewards/rejected": -1.859375,
"step": 599
},
{
"epoch": 0.6637168141592921,
"grad_norm": 15.907817840576172,
"learning_rate": 1.3522608637483266e-07,
"logits/chosen": -1.359375,
"logits/rejected": -1.203125,
"logps/chosen": -241.5,
"logps/rejected": -275.0,
"loss": 0.3724,
"rewards/accuracies": 0.734375,
"rewards/chosen": 0.548828125,
"rewards/margins": 2.2421875,
"rewards/rejected": -1.6953125,
"step": 600
},
{
"epoch": 0.6637168141592921,
"eval_logits/chosen": -1.2614272832870483,
"eval_logits/rejected": -1.1564831733703613,
"eval_logps/chosen": -251.43780517578125,
"eval_logps/rejected": -277.5970153808594,
"eval_loss": 0.3234591782093048,
"eval_rewards/accuracies": 0.809928834438324,
"eval_rewards/chosen": 0.9355371594429016,
"eval_rewards/margins": 2.773709535598755,
"eval_rewards/rejected": -1.8374922275543213,
"eval_runtime": 193.0898,
"eval_samples_per_second": 66.565,
"eval_steps_per_second": 1.041,
"step": 600
},
{
"epoch": 0.6648230088495575,
"grad_norm": 13.516127586364746,
"learning_rate": 1.3443032228958545e-07,
"logits/chosen": -1.2109375,
"logits/rejected": -1.1015625,
"logps/chosen": -252.0,
"logps/rejected": -284.0,
"loss": 0.3214,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.916015625,
"rewards/margins": 2.734375,
"rewards/rejected": -1.8203125,
"step": 601
},
{
"epoch": 0.665929203539823,
"grad_norm": 13.290761947631836,
"learning_rate": 1.336360446018503e-07,
"logits/chosen": -1.37890625,
"logits/rejected": -1.1953125,
"logps/chosen": -240.5,
"logps/rejected": -248.5,
"loss": 0.3253,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 1.048828125,
"rewards/margins": 2.65625,
"rewards/rejected": -1.60546875,
"step": 602
},
{
"epoch": 0.6670353982300885,
"grad_norm": 13.730428695678711,
"learning_rate": 1.3284326352721675e-07,
"logits/chosen": -1.24609375,
"logits/rejected": -1.15625,
"logps/chosen": -237.0,
"logps/rejected": -265.0,
"loss": 0.3161,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.96875,
"rewards/margins": 3.171875,
"rewards/rejected": -2.1953125,
"step": 603
},
{
"epoch": 0.668141592920354,
"grad_norm": 13.699116706848145,
"learning_rate": 1.3205198926202544e-07,
"logits/chosen": -1.25,
"logits/rejected": -1.125,
"logps/chosen": -254.5,
"logps/rejected": -295.0,
"loss": 0.3262,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.0390625,
"rewards/margins": 2.7265625,
"rewards/rejected": -1.6875,
"step": 604
},
{
"epoch": 0.6692477876106194,
"grad_norm": 15.250642776489258,
"learning_rate": 1.312622319832375e-07,
"logits/chosen": -1.234375,
"logits/rejected": -1.13671875,
"logps/chosen": -262.0,
"logps/rejected": -275.0,
"loss": 0.3704,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.822265625,
"rewards/margins": 2.515625,
"rewards/rejected": -1.69140625,
"step": 605
},
{
"epoch": 0.6703539823008849,
"grad_norm": 14.902206420898438,
"learning_rate": 1.3047400184830303e-07,
"logits/chosen": -1.18359375,
"logits/rejected": -1.064453125,
"logps/chosen": -248.5,
"logps/rejected": -277.0,
"loss": 0.3634,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.76953125,
"rewards/margins": 2.3984375,
"rewards/rejected": -1.625,
"step": 606
},
{
"epoch": 0.6714601769911505,
"grad_norm": 15.015229225158691,
"learning_rate": 1.2968730899503106e-07,
"logits/chosen": -1.39453125,
"logits/rejected": -1.265625,
"logps/chosen": -254.5,
"logps/rejected": -271.0,
"loss": 0.3279,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.943359375,
"rewards/margins": 2.625,
"rewards/rejected": -1.6796875,
"step": 607
},
{
"epoch": 0.672566371681416,
"grad_norm": 14.369139671325684,
"learning_rate": 1.2890216354145888e-07,
"logits/chosen": -1.28515625,
"logits/rejected": -1.1796875,
"logps/chosen": -241.0,
"logps/rejected": -251.0,
"loss": 0.3464,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.953125,
"rewards/margins": 2.625,
"rewards/rejected": -1.671875,
"step": 608
},
{
"epoch": 0.6736725663716814,
"grad_norm": 13.686366081237793,
"learning_rate": 1.2811857558572167e-07,
"logits/chosen": -1.1953125,
"logits/rejected": -1.056640625,
"logps/chosen": -259.0,
"logps/rejected": -262.0,
"loss": 0.3501,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.763671875,
"rewards/margins": 2.7421875,
"rewards/rejected": -1.9765625,
"step": 609
},
{
"epoch": 0.6747787610619469,
"grad_norm": 13.95426082611084,
"learning_rate": 1.2733655520592326e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.1640625,
"logps/chosen": -257.5,
"logps/rejected": -308.0,
"loss": 0.2923,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.10546875,
"rewards/margins": 2.9765625,
"rewards/rejected": -1.875,
"step": 610
},
{
"epoch": 0.6758849557522124,
"grad_norm": 13.288588523864746,
"learning_rate": 1.265561124600057e-07,
"logits/chosen": -1.1875,
"logits/rejected": -1.0703125,
"logps/chosen": -250.0,
"logps/rejected": -277.0,
"loss": 0.3124,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.875,
"rewards/margins": 2.765625,
"rewards/rejected": -1.890625,
"step": 611
},
{
"epoch": 0.6769911504424779,
"grad_norm": 14.142792701721191,
"learning_rate": 1.2577725738562068e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.18359375,
"logps/chosen": -244.5,
"logps/rejected": -249.0,
"loss": 0.3795,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.533203125,
"rewards/margins": 2.19921875,
"rewards/rejected": -1.66796875,
"step": 612
},
{
"epoch": 0.6780973451327433,
"grad_norm": 12.952162742614746,
"learning_rate": 1.2500000000000005e-07,
"logits/chosen": -1.08984375,
"logits/rejected": -1.109375,
"logps/chosen": -259.5,
"logps/rejected": -315.0,
"loss": 0.2972,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.9453125,
"rewards/margins": 2.984375,
"rewards/rejected": -2.03515625,
"step": 613
},
{
"epoch": 0.6792035398230089,
"grad_norm": 12.563096046447754,
"learning_rate": 1.2422435029982667e-07,
"logits/chosen": -1.33203125,
"logits/rejected": -1.2109375,
"logps/chosen": -251.0,
"logps/rejected": -277.0,
"loss": 0.2854,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.1640625,
"rewards/margins": 2.9921875,
"rewards/rejected": -1.828125,
"step": 614
},
{
"epoch": 0.6803097345132744,
"grad_norm": 15.870928764343262,
"learning_rate": 1.234503182611066e-07,
"logits/chosen": -1.38671875,
"logits/rejected": -1.22265625,
"logps/chosen": -271.5,
"logps/rejected": -310.0,
"loss": 0.3588,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.6884765625,
"rewards/margins": 2.390625,
"rewards/rejected": -1.70703125,
"step": 615
},
{
"epoch": 0.6814159292035398,
"grad_norm": 12.973315238952637,
"learning_rate": 1.2267791383904017e-07,
"logits/chosen": -1.23828125,
"logits/rejected": -1.12890625,
"logps/chosen": -229.0,
"logps/rejected": -272.5,
"loss": 0.2962,
"rewards/accuracies": 0.859375,
"rewards/chosen": 1.130859375,
"rewards/margins": 3.2109375,
"rewards/rejected": -2.09375,
"step": 616
},
{
"epoch": 0.6825221238938053,
"grad_norm": 15.841652870178223,
"learning_rate": 1.2190714696789407e-07,
"logits/chosen": -1.20703125,
"logits/rejected": -1.11328125,
"logps/chosen": -266.0,
"logps/rejected": -282.0,
"loss": 0.3952,
"rewards/accuracies": 0.6953125,
"rewards/chosen": 0.6005859375,
"rewards/margins": 2.2421875,
"rewards/rejected": -1.640625,
"step": 617
},
{
"epoch": 0.6836283185840708,
"grad_norm": 14.495512008666992,
"learning_rate": 1.2113802756087396e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.15625,
"logps/chosen": -251.5,
"logps/rejected": -270.5,
"loss": 0.3808,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.5576171875,
"rewards/margins": 2.3359375,
"rewards/rejected": -1.77734375,
"step": 618
},
{
"epoch": 0.6847345132743363,
"grad_norm": 13.138040542602539,
"learning_rate": 1.2037056550999623e-07,
"logits/chosen": -1.08984375,
"logits/rejected": -1.046875,
"logps/chosen": -261.0,
"logps/rejected": -308.0,
"loss": 0.3147,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.82421875,
"rewards/margins": 2.8203125,
"rewards/rejected": -1.99609375,
"step": 619
},
{
"epoch": 0.6858407079646017,
"grad_norm": 15.598456382751465,
"learning_rate": 1.1960477068596154e-07,
"logits/chosen": -1.36328125,
"logits/rejected": -1.08984375,
"logps/chosen": -266.0,
"logps/rejected": -286.0,
"loss": 0.3759,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.62890625,
"rewards/margins": 2.1875,
"rewards/rejected": -1.55859375,
"step": 620
},
{
"epoch": 0.6869469026548672,
"grad_norm": 13.848457336425781,
"learning_rate": 1.1884065293802756e-07,
"logits/chosen": -1.1640625,
"logits/rejected": -1.18359375,
"logps/chosen": -244.5,
"logps/rejected": -257.0,
"loss": 0.3068,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.958984375,
"rewards/margins": 2.8828125,
"rewards/rejected": -1.92578125,
"step": 621
},
{
"epoch": 0.6880530973451328,
"grad_norm": 12.871940612792969,
"learning_rate": 1.1807822209388196e-07,
"logits/chosen": -1.2890625,
"logits/rejected": -1.2109375,
"logps/chosen": -239.0,
"logps/rejected": -281.0,
"loss": 0.2818,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 1.08984375,
"rewards/margins": 3.171875,
"rewards/rejected": -2.0859375,
"step": 622
},
{
"epoch": 0.6891592920353983,
"grad_norm": 13.695356369018555,
"learning_rate": 1.173174879595166e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.1484375,
"logps/chosen": -244.5,
"logps/rejected": -276.0,
"loss": 0.3137,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.9609375,
"rewards/margins": 2.875,
"rewards/rejected": -1.91796875,
"step": 623
},
{
"epoch": 0.6902654867256637,
"grad_norm": 16.23243522644043,
"learning_rate": 1.1655846031910119e-07,
"logits/chosen": -1.359375,
"logits/rejected": -1.21484375,
"logps/chosen": -253.0,
"logps/rejected": -301.0,
"loss": 0.3016,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.837890625,
"rewards/margins": 3.0625,
"rewards/rejected": -2.2265625,
"step": 624
},
{
"epoch": 0.6913716814159292,
"grad_norm": 14.047713279724121,
"learning_rate": 1.1580114893485712e-07,
"logits/chosen": -1.203125,
"logits/rejected": -1.18359375,
"logps/chosen": -241.0,
"logps/rejected": -286.0,
"loss": 0.2963,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.08203125,
"rewards/margins": 3.265625,
"rewards/rejected": -2.1796875,
"step": 625
},
{
"epoch": 0.6924778761061947,
"grad_norm": 13.80639934539795,
"learning_rate": 1.1504556354693226e-07,
"logits/chosen": -1.3359375,
"logits/rejected": -1.15625,
"logps/chosen": -248.5,
"logps/rejected": -274.0,
"loss": 0.317,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.9296875,
"rewards/margins": 2.84375,
"rewards/rejected": -1.91796875,
"step": 626
},
{
"epoch": 0.6935840707964602,
"grad_norm": 13.272629737854004,
"learning_rate": 1.1429171387327585e-07,
"logits/chosen": -1.34375,
"logits/rejected": -1.2109375,
"logps/chosen": -238.5,
"logps/rejected": -286.0,
"loss": 0.2575,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.07421875,
"rewards/margins": 3.328125,
"rewards/rejected": -2.25,
"step": 627
},
{
"epoch": 0.6946902654867256,
"grad_norm": 15.396360397338867,
"learning_rate": 1.1353960960951293e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.15625,
"logps/chosen": -276.0,
"logps/rejected": -276.0,
"loss": 0.3754,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.822265625,
"rewards/margins": 2.4921875,
"rewards/rejected": -1.671875,
"step": 628
},
{
"epoch": 0.6957964601769911,
"grad_norm": 13.207889556884766,
"learning_rate": 1.1278926042882026e-07,
"logits/chosen": -1.29296875,
"logits/rejected": -1.1484375,
"logps/chosen": -249.5,
"logps/rejected": -302.0,
"loss": 0.3109,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.80078125,
"rewards/margins": 2.9140625,
"rewards/rejected": -2.11328125,
"step": 629
},
{
"epoch": 0.6969026548672567,
"grad_norm": 13.04702091217041,
"learning_rate": 1.120406759818014e-07,
"logits/chosen": -1.29296875,
"logits/rejected": -1.1640625,
"logps/chosen": -236.5,
"logps/rejected": -270.0,
"loss": 0.3229,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.900390625,
"rewards/margins": 2.6640625,
"rewards/rejected": -1.76953125,
"step": 630
},
{
"epoch": 0.6980088495575221,
"grad_norm": 14.894906997680664,
"learning_rate": 1.1129386589636292e-07,
"logits/chosen": -1.3125,
"logits/rejected": -1.18359375,
"logps/chosen": -280.0,
"logps/rejected": -280.5,
"loss": 0.316,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.83203125,
"rewards/margins": 2.828125,
"rewards/rejected": -2.0,
"step": 631
},
{
"epoch": 0.6991150442477876,
"grad_norm": 16.062137603759766,
"learning_rate": 1.1054883977759066e-07,
"logits/chosen": -1.26953125,
"logits/rejected": -1.140625,
"logps/chosen": -275.0,
"logps/rejected": -277.0,
"loss": 0.3502,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.69140625,
"rewards/margins": 2.71875,
"rewards/rejected": -2.0234375,
"step": 632
},
{
"epoch": 0.7002212389380531,
"grad_norm": 14.050618171691895,
"learning_rate": 1.0980560720762555e-07,
"logits/chosen": -1.19921875,
"logits/rejected": -1.1484375,
"logps/chosen": -248.0,
"logps/rejected": -288.0,
"loss": 0.3215,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.7216796875,
"rewards/margins": 2.8125,
"rewards/rejected": -2.0859375,
"step": 633
},
{
"epoch": 0.7013274336283186,
"grad_norm": 11.265563011169434,
"learning_rate": 1.0906417774554132e-07,
"logits/chosen": -1.1953125,
"logits/rejected": -1.21484375,
"logps/chosen": -234.0,
"logps/rejected": -249.5,
"loss": 0.2667,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 1.05859375,
"rewards/margins": 3.203125,
"rewards/rejected": -2.140625,
"step": 634
},
{
"epoch": 0.702433628318584,
"grad_norm": 13.785270690917969,
"learning_rate": 1.0832456092722062e-07,
"logits/chosen": -1.3046875,
"logits/rejected": -1.18359375,
"logps/chosen": -268.0,
"logps/rejected": -271.0,
"loss": 0.3269,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.63671875,
"rewards/margins": 2.421875,
"rewards/rejected": -1.7890625,
"step": 635
},
{
"epoch": 0.7035398230088495,
"grad_norm": 14.249685287475586,
"learning_rate": 1.0758676626523311e-07,
"logits/chosen": -1.32421875,
"logits/rejected": -1.1796875,
"logps/chosen": -265.0,
"logps/rejected": -286.0,
"loss": 0.314,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.705078125,
"rewards/margins": 2.75,
"rewards/rejected": -2.046875,
"step": 636
},
{
"epoch": 0.7046460176991151,
"grad_norm": 12.366557121276855,
"learning_rate": 1.0685080324871278e-07,
"logits/chosen": -1.2421875,
"logits/rejected": -1.021484375,
"logps/chosen": -256.0,
"logps/rejected": -298.0,
"loss": 0.27,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.75390625,
"rewards/margins": 2.65625,
"rewards/rejected": -1.90234375,
"step": 637
},
{
"epoch": 0.7057522123893806,
"grad_norm": 16.30191421508789,
"learning_rate": 1.0611668134323575e-07,
"logits/chosen": -1.30078125,
"logits/rejected": -1.1484375,
"logps/chosen": -282.0,
"logps/rejected": -299.0,
"loss": 0.3438,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.4140625,
"rewards/margins": 2.54296875,
"rewards/rejected": -2.1328125,
"step": 638
},
{
"epoch": 0.706858407079646,
"grad_norm": 14.99670696258545,
"learning_rate": 1.0538440999069895e-07,
"logits/chosen": -1.30859375,
"logits/rejected": -1.19921875,
"logps/chosen": -255.5,
"logps/rejected": -298.0,
"loss": 0.3104,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.896484375,
"rewards/margins": 2.8671875,
"rewards/rejected": -1.96875,
"step": 639
},
{
"epoch": 0.7079646017699115,
"grad_norm": 12.429228782653809,
"learning_rate": 1.0465399860919838e-07,
"logits/chosen": -1.2265625,
"logits/rejected": -1.1953125,
"logps/chosen": -255.5,
"logps/rejected": -273.0,
"loss": 0.2869,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.080078125,
"rewards/margins": 3.0859375,
"rewards/rejected": -2.00390625,
"step": 640
},
{
"epoch": 0.709070796460177,
"grad_norm": 12.204998970031738,
"learning_rate": 1.0392545659290788e-07,
"logits/chosen": -1.2578125,
"logits/rejected": -1.21875,
"logps/chosen": -260.5,
"logps/rejected": -274.0,
"loss": 0.2817,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.810546875,
"rewards/margins": 2.9921875,
"rewards/rejected": -2.1875,
"step": 641
},
{
"epoch": 0.7101769911504425,
"grad_norm": 14.068879127502441,
"learning_rate": 1.0319879331195882e-07,
"logits/chosen": -1.21484375,
"logits/rejected": -1.0703125,
"logps/chosen": -254.0,
"logps/rejected": -272.0,
"loss": 0.3538,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.591796875,
"rewards/margins": 2.5859375,
"rewards/rejected": -2.0,
"step": 642
},
{
"epoch": 0.7112831858407079,
"grad_norm": 12.932374954223633,
"learning_rate": 1.0247401811231887e-07,
"logits/chosen": -1.390625,
"logits/rejected": -1.18359375,
"logps/chosen": -233.0,
"logps/rejected": -259.0,
"loss": 0.2886,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.841796875,
"rewards/margins": 2.9375,
"rewards/rejected": -2.09375,
"step": 643
},
{
"epoch": 0.7123893805309734,
"grad_norm": 12.754419326782227,
"learning_rate": 1.0175114031567245e-07,
"logits/chosen": -1.27734375,
"logits/rejected": -1.2109375,
"logps/chosen": -253.0,
"logps/rejected": -288.0,
"loss": 0.2941,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.798828125,
"rewards/margins": 2.671875,
"rewards/rejected": -1.875,
"step": 644
},
{
"epoch": 0.713495575221239,
"grad_norm": 13.075281143188477,
"learning_rate": 1.0103016921930055e-07,
"logits/chosen": -1.3046875,
"logits/rejected": -1.18359375,
"logps/chosen": -247.5,
"logps/rejected": -276.0,
"loss": 0.324,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.845703125,
"rewards/margins": 2.5546875,
"rewards/rejected": -1.70703125,
"step": 645
},
{
"epoch": 0.7146017699115044,
"grad_norm": 15.02340030670166,
"learning_rate": 1.0031111409596091e-07,
"logits/chosen": -1.15625,
"logits/rejected": -1.2109375,
"logps/chosen": -246.5,
"logps/rejected": -258.0,
"loss": 0.3851,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.669921875,
"rewards/margins": 2.4140625,
"rewards/rejected": -1.7421875,
"step": 646
},
{
"epoch": 0.7157079646017699,
"grad_norm": 12.193872451782227,
"learning_rate": 9.95939841937693e-08,
"logits/chosen": -1.30859375,
"logits/rejected": -1.2578125,
"logps/chosen": -259.5,
"logps/rejected": -265.5,
"loss": 0.2392,
"rewards/accuracies": 0.890625,
"rewards/chosen": 0.86328125,
"rewards/margins": 3.2734375,
"rewards/rejected": -2.4140625,
"step": 647
},
{
"epoch": 0.7168141592920354,
"grad_norm": 13.42468547821045,
"learning_rate": 9.887878873608027e-08,
"logits/chosen": -1.16015625,
"logits/rejected": -1.11328125,
"logps/chosen": -263.5,
"logps/rejected": -290.0,
"loss": 0.3087,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.791015625,
"rewards/margins": 2.875,
"rewards/rejected": -2.08203125,
"step": 648
},
{
"epoch": 0.7179203539823009,
"grad_norm": 13.621614456176758,
"learning_rate": 9.816553692136834e-08,
"logits/chosen": -1.17578125,
"logits/rejected": -1.109375,
"logps/chosen": -256.5,
"logps/rejected": -282.0,
"loss": 0.2806,
"rewards/accuracies": 0.84375,
"rewards/chosen": 1.1484375,
"rewards/margins": 3.046875,
"rewards/rejected": -1.89453125,
"step": 649
},
{
"epoch": 0.7190265486725663,
"grad_norm": 13.231938362121582,
"learning_rate": 9.745423792310995e-08,
"logits/chosen": -1.29296875,
"logits/rejected": -1.12890625,
"logps/chosen": -243.0,
"logps/rejected": -258.5,
"loss": 0.2872,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.7265625,
"rewards/margins": 2.90625,
"rewards/rejected": -2.1796875,
"step": 650
},
{
"epoch": 0.7190265486725663,
"eval_logits/chosen": -1.2634872198104858,
"eval_logits/rejected": -1.1566191911697388,
"eval_logps/chosen": -252.76119995117188,
"eval_logps/rejected": -279.27362060546875,
"eval_loss": 0.3217768967151642,
"eval_rewards/accuracies": 0.8134269714355469,
"eval_rewards/chosen": 0.8057758212089539,
"eval_rewards/margins": 2.8031716346740723,
"eval_rewards/rejected": -1.996579647064209,
"eval_runtime": 193.0564,
"eval_samples_per_second": 66.576,
"eval_steps_per_second": 1.041,
"step": 650
},
{
"epoch": 0.7201327433628318,
"grad_norm": 13.893576622009277,
"learning_rate": 9.674490088966562e-08,
"logits/chosen": -1.2734375,
"logits/rejected": -1.12109375,
"logps/chosen": -262.5,
"logps/rejected": -296.0,
"loss": 0.2924,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.919921875,
"rewards/margins": 3.0859375,
"rewards/rejected": -2.1640625,
"step": 651
},
{
"epoch": 0.7212389380530974,
"grad_norm": 13.017692565917969,
"learning_rate": 9.603753494416184e-08,
"logits/chosen": -1.328125,
"logits/rejected": -1.28515625,
"logps/chosen": -242.0,
"logps/rejected": -248.0,
"loss": 0.2897,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.791015625,
"rewards/margins": 2.7109375,
"rewards/rejected": -1.9140625,
"step": 652
},
{
"epoch": 0.7223451327433629,
"grad_norm": 17.18537712097168,
"learning_rate": 9.533214918437421e-08,
"logits/chosen": -1.25390625,
"logits/rejected": -1.23828125,
"logps/chosen": -283.0,
"logps/rejected": -287.0,
"loss": 0.402,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.509765625,
"rewards/margins": 2.1640625,
"rewards/rejected": -1.65625,
"step": 653
},
{
"epoch": 0.7234513274336283,
"grad_norm": 17.223974227905273,
"learning_rate": 9.462875268261e-08,
"logits/chosen": -1.19140625,
"logits/rejected": -1.2578125,
"logps/chosen": -297.0,
"logps/rejected": -311.0,
"loss": 0.3244,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.814453125,
"rewards/margins": 2.859375,
"rewards/rejected": -2.046875,
"step": 654
},
{
"epoch": 0.7245575221238938,
"grad_norm": 14.414237976074219,
"learning_rate": 9.39273544855918e-08,
"logits/chosen": -1.24609375,
"logits/rejected": -1.10546875,
"logps/chosen": -259.0,
"logps/rejected": -303.0,
"loss": 0.3058,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.888671875,
"rewards/margins": 3.140625,
"rewards/rejected": -2.25,
"step": 655
},
{
"epoch": 0.7256637168141593,
"grad_norm": 13.708487510681152,
"learning_rate": 9.32279636143411e-08,
"logits/chosen": -1.359375,
"logits/rejected": -1.16015625,
"logps/chosen": -271.0,
"logps/rejected": -277.5,
"loss": 0.3109,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.89453125,
"rewards/margins": 2.640625,
"rewards/rejected": -1.7421875,
"step": 656
},
{
"epoch": 0.7267699115044248,
"grad_norm": 14.687643051147461,
"learning_rate": 9.253058906406194e-08,
"logits/chosen": -1.203125,
"logits/rejected": -1.0625,
"logps/chosen": -280.0,
"logps/rejected": -306.0,
"loss": 0.314,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.810546875,
"rewards/margins": 2.7734375,
"rewards/rejected": -1.96875,
"step": 657
},
{
"epoch": 0.7278761061946902,
"grad_norm": 13.893321990966797,
"learning_rate": 9.183523980402582e-08,
"logits/chosen": -1.21484375,
"logits/rejected": -1.203125,
"logps/chosen": -240.0,
"logps/rejected": -291.0,
"loss": 0.3241,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.82421875,
"rewards/margins": 2.6875,
"rewards/rejected": -1.87109375,
"step": 658
},
{
"epoch": 0.7289823008849557,
"grad_norm": 13.528450965881348,
"learning_rate": 9.114192477745566e-08,
"logits/chosen": -1.3359375,
"logits/rejected": -1.1484375,
"logps/chosen": -262.0,
"logps/rejected": -278.0,
"loss": 0.3098,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.6640625,
"rewards/margins": 2.6953125,
"rewards/rejected": -2.03515625,
"step": 659
},
{
"epoch": 0.7300884955752213,
"grad_norm": 15.182424545288086,
"learning_rate": 9.045065290141138e-08,
"logits/chosen": -1.22265625,
"logits/rejected": -1.1015625,
"logps/chosen": -275.0,
"logps/rejected": -305.0,
"loss": 0.3081,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.822265625,
"rewards/margins": 2.8046875,
"rewards/rejected": -1.984375,
"step": 660
},
{
"epoch": 0.7311946902654868,
"grad_norm": 14.025420188903809,
"learning_rate": 8.976143306667491e-08,
"logits/chosen": -1.21484375,
"logits/rejected": -1.16015625,
"logps/chosen": -255.5,
"logps/rejected": -290.0,
"loss": 0.2861,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.857421875,
"rewards/margins": 2.9765625,
"rewards/rejected": -2.1171875,
"step": 661
},
{
"epoch": 0.7323008849557522,
"grad_norm": 12.591769218444824,
"learning_rate": 8.907427413763572e-08,
"logits/chosen": -1.32421875,
"logits/rejected": -1.1171875,
"logps/chosen": -268.0,
"logps/rejected": -275.5,
"loss": 0.2648,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 0.677734375,
"rewards/margins": 2.9453125,
"rewards/rejected": -2.265625,
"step": 662
},
{
"epoch": 0.7334070796460177,
"grad_norm": 15.431063652038574,
"learning_rate": 8.838918495217712e-08,
"logits/chosen": -1.25390625,
"logits/rejected": -1.14453125,
"logps/chosen": -269.5,
"logps/rejected": -304.0,
"loss": 0.3575,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 0.787109375,
"rewards/margins": 2.640625,
"rewards/rejected": -1.84765625,
"step": 663
},
{
"epoch": 0.7345132743362832,
"grad_norm": 14.970857620239258,
"learning_rate": 8.770617432156257e-08,
"logits/chosen": -1.28515625,
"logits/rejected": -1.05859375,
"logps/chosen": -268.0,
"logps/rejected": -298.0,
"loss": 0.3506,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.708984375,
"rewards/margins": 2.4921875,
"rewards/rejected": -1.78125,
"step": 664
},
{
"epoch": 0.7356194690265486,
"grad_norm": 15.439310073852539,
"learning_rate": 8.702525103032184e-08,
"logits/chosen": -1.2421875,
"logits/rejected": -1.07421875,
"logps/chosen": -248.0,
"logps/rejected": -280.5,
"loss": 0.3629,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.7626953125,
"rewards/margins": 2.59375,
"rewards/rejected": -1.828125,
"step": 665
},
{
"epoch": 0.7367256637168141,
"grad_norm": 13.227315902709961,
"learning_rate": 8.634642383613891e-08,
"logits/chosen": -1.21875,
"logits/rejected": -1.11328125,
"logps/chosen": -254.5,
"logps/rejected": -285.0,
"loss": 0.3095,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.8984375,
"rewards/margins": 2.875,
"rewards/rejected": -1.97265625,
"step": 666
},
{
"epoch": 0.7378318584070797,
"grad_norm": 12.241044044494629,
"learning_rate": 8.566970146973835e-08,
"logits/chosen": -1.30859375,
"logits/rejected": -1.125,
"logps/chosen": -257.0,
"logps/rejected": -293.0,
"loss": 0.2911,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.849609375,
"rewards/margins": 3.0,
"rewards/rejected": -2.15234375,
"step": 667
},
{
"epoch": 0.7389380530973452,
"grad_norm": 12.409917831420898,
"learning_rate": 8.499509263477387e-08,
"logits/chosen": -1.375,
"logits/rejected": -1.15234375,
"logps/chosen": -222.0,
"logps/rejected": -269.0,
"loss": 0.285,
"rewards/accuracies": 0.84375,
"rewards/chosen": 1.044921875,
"rewards/margins": 3.109375,
"rewards/rejected": -2.05859375,
"step": 668
},
{
"epoch": 0.7400442477876106,
"grad_norm": 16.232877731323242,
"learning_rate": 8.432260600771599e-08,
"logits/chosen": -1.31640625,
"logits/rejected": -1.171875,
"logps/chosen": -278.0,
"logps/rejected": -274.0,
"loss": 0.3434,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.6806640625,
"rewards/margins": 2.796875,
"rewards/rejected": -2.11328125,
"step": 669
},
{
"epoch": 0.7411504424778761,
"grad_norm": 12.330305099487305,
"learning_rate": 8.36522502377403e-08,
"logits/chosen": -1.34375,
"logits/rejected": -1.1640625,
"logps/chosen": -239.5,
"logps/rejected": -292.0,
"loss": 0.2725,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.86328125,
"rewards/margins": 2.7734375,
"rewards/rejected": -1.9140625,
"step": 670
},
{
"epoch": 0.7422566371681416,
"grad_norm": 15.042512893676758,
"learning_rate": 8.298403394661657e-08,
"logits/chosen": -1.24609375,
"logits/rejected": -1.18359375,
"logps/chosen": -278.0,
"logps/rejected": -262.0,
"loss": 0.3643,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.53515625,
"rewards/margins": 2.4609375,
"rewards/rejected": -1.92578125,
"step": 671
},
{
"epoch": 0.7433628318584071,
"grad_norm": 15.917474746704102,
"learning_rate": 8.231796572859778e-08,
"logits/chosen": -1.09765625,
"logits/rejected": -1.12109375,
"logps/chosen": -250.5,
"logps/rejected": -301.0,
"loss": 0.2963,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.0,
"rewards/margins": 3.1796875,
"rewards/rejected": -2.1796875,
"step": 672
},
{
"epoch": 0.7444690265486725,
"grad_norm": 13.662030220031738,
"learning_rate": 8.165405415030915e-08,
"logits/chosen": -1.35546875,
"logits/rejected": -1.14453125,
"logps/chosen": -288.0,
"logps/rejected": -279.0,
"loss": 0.2763,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.91015625,
"rewards/margins": 2.953125,
"rewards/rejected": -2.0390625,
"step": 673
},
{
"epoch": 0.745575221238938,
"grad_norm": 14.487608909606934,
"learning_rate": 8.099230775063879e-08,
"logits/chosen": -1.2890625,
"logits/rejected": -1.140625,
"logps/chosen": -261.0,
"logps/rejected": -277.0,
"loss": 0.319,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.75390625,
"rewards/margins": 2.7109375,
"rewards/rejected": -1.953125,
"step": 674
},
{
"epoch": 0.7466814159292036,
"grad_norm": 15.394200325012207,
"learning_rate": 8.033273504062698e-08,
"logits/chosen": -1.12109375,
"logits/rejected": -1.11328125,
"logps/chosen": -267.0,
"logps/rejected": -314.0,
"loss": 0.3292,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.70703125,
"rewards/margins": 2.84375,
"rewards/rejected": -2.1328125,
"step": 675
},
{
"epoch": 0.7477876106194691,
"grad_norm": 16.063007354736328,
"learning_rate": 7.967534450335728e-08,
"logits/chosen": -1.32421875,
"logits/rejected": -1.19140625,
"logps/chosen": -253.5,
"logps/rejected": -266.5,
"loss": 0.3824,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.556640625,
"rewards/margins": 2.4453125,
"rewards/rejected": -1.88671875,
"step": 676
},
{
"epoch": 0.7488938053097345,
"grad_norm": 15.266008377075195,
"learning_rate": 7.902014459384742e-08,
"logits/chosen": -1.21875,
"logits/rejected": -1.03515625,
"logps/chosen": -259.0,
"logps/rejected": -301.0,
"loss": 0.3159,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.875,
"rewards/margins": 3.1015625,
"rewards/rejected": -2.2265625,
"step": 677
},
{
"epoch": 0.75,
"grad_norm": 11.596745491027832,
"learning_rate": 7.836714373894015e-08,
"logits/chosen": -1.1484375,
"logits/rejected": -1.05078125,
"logps/chosen": -248.5,
"logps/rejected": -268.5,
"loss": 0.224,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 1.046875,
"rewards/margins": 3.390625,
"rewards/rejected": -2.3359375,
"step": 678
},
{
"epoch": 0.7511061946902655,
"grad_norm": 12.86449909210205,
"learning_rate": 7.771635033719528e-08,
"logits/chosen": -1.26171875,
"logits/rejected": -1.09375,
"logps/chosen": -271.0,
"logps/rejected": -258.5,
"loss": 0.2782,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.80078125,
"rewards/margins": 2.8359375,
"rewards/rejected": -2.03125,
"step": 679
},
{
"epoch": 0.7522123893805309,
"grad_norm": 12.727359771728516,
"learning_rate": 7.70677727587816e-08,
"logits/chosen": -1.2578125,
"logits/rejected": -1.0859375,
"logps/chosen": -257.0,
"logps/rejected": -295.0,
"loss": 0.2793,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.70703125,
"rewards/margins": 2.734375,
"rewards/rejected": -2.0234375,
"step": 680
},
{
"epoch": 0.7533185840707964,
"grad_norm": 12.862136840820312,
"learning_rate": 7.642141934536874e-08,
"logits/chosen": -1.3203125,
"logits/rejected": -1.1953125,
"logps/chosen": -242.5,
"logps/rejected": -268.0,
"loss": 0.2937,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.720703125,
"rewards/margins": 3.0546875,
"rewards/rejected": -2.328125,
"step": 681
},
{
"epoch": 0.754424778761062,
"grad_norm": 13.950096130371094,
"learning_rate": 7.577729841002075e-08,
"logits/chosen": -1.17578125,
"logits/rejected": -1.12890625,
"logps/chosen": -279.0,
"logps/rejected": -308.0,
"loss": 0.2855,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.703125,
"rewards/margins": 2.875,
"rewards/rejected": -2.171875,
"step": 682
},
{
"epoch": 0.7555309734513275,
"grad_norm": 15.024590492248535,
"learning_rate": 7.513541823708827e-08,
"logits/chosen": -1.26953125,
"logits/rejected": -1.140625,
"logps/chosen": -251.0,
"logps/rejected": -292.0,
"loss": 0.3303,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.697265625,
"rewards/margins": 2.75,
"rewards/rejected": -2.05078125,
"step": 683
},
{
"epoch": 0.7566371681415929,
"grad_norm": 14.337872505187988,
"learning_rate": 7.449578708210267e-08,
"logits/chosen": -1.26953125,
"logits/rejected": -1.21484375,
"logps/chosen": -283.0,
"logps/rejected": -276.0,
"loss": 0.3292,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.71484375,
"rewards/margins": 2.8515625,
"rewards/rejected": -2.14453125,
"step": 684
},
{
"epoch": 0.7577433628318584,
"grad_norm": 13.712812423706055,
"learning_rate": 7.385841317166966e-08,
"logits/chosen": -1.30078125,
"logits/rejected": -1.1953125,
"logps/chosen": -250.5,
"logps/rejected": -283.0,
"loss": 0.309,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.654296875,
"rewards/margins": 2.8515625,
"rewards/rejected": -2.1953125,
"step": 685
},
{
"epoch": 0.7588495575221239,
"grad_norm": 11.522303581237793,
"learning_rate": 7.322330470336313e-08,
"logits/chosen": -1.43359375,
"logits/rejected": -1.15625,
"logps/chosen": -247.0,
"logps/rejected": -287.0,
"loss": 0.2535,
"rewards/accuracies": 0.890625,
"rewards/chosen": 0.541015625,
"rewards/margins": 2.796875,
"rewards/rejected": -2.265625,
"step": 686
},
{
"epoch": 0.7599557522123894,
"grad_norm": 16.617996215820312,
"learning_rate": 7.25904698456203e-08,
"logits/chosen": -1.21484375,
"logits/rejected": -1.0859375,
"logps/chosen": -283.0,
"logps/rejected": -304.0,
"loss": 0.3768,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.59375,
"rewards/margins": 2.578125,
"rewards/rejected": -1.98828125,
"step": 687
},
{
"epoch": 0.7610619469026548,
"grad_norm": 13.16273021697998,
"learning_rate": 7.195991673763644e-08,
"logits/chosen": -1.2578125,
"logits/rejected": -1.140625,
"logps/chosen": -256.0,
"logps/rejected": -261.5,
"loss": 0.342,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.5322265625,
"rewards/margins": 2.6015625,
"rewards/rejected": -2.078125,
"step": 688
},
{
"epoch": 0.7621681415929203,
"grad_norm": 14.337390899658203,
"learning_rate": 7.133165348925976e-08,
"logits/chosen": -1.25,
"logits/rejected": -1.2734375,
"logps/chosen": -253.5,
"logps/rejected": -281.0,
"loss": 0.3474,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 1.0078125,
"rewards/margins": 2.8359375,
"rewards/rejected": -1.83203125,
"step": 689
},
{
"epoch": 0.7632743362831859,
"grad_norm": 13.05460262298584,
"learning_rate": 7.070568818088782e-08,
"logits/chosen": -1.29296875,
"logits/rejected": -1.06640625,
"logps/chosen": -266.5,
"logps/rejected": -289.5,
"loss": 0.3306,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.76171875,
"rewards/margins": 2.6875,
"rewards/rejected": -1.921875,
"step": 690
},
{
"epoch": 0.7643805309734514,
"grad_norm": 13.12061595916748,
"learning_rate": 7.008202886336323e-08,
"logits/chosen": -1.296875,
"logits/rejected": -1.11328125,
"logps/chosen": -252.0,
"logps/rejected": -294.0,
"loss": 0.3064,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.6943359375,
"rewards/margins": 2.96875,
"rewards/rejected": -2.2734375,
"step": 691
},
{
"epoch": 0.7654867256637168,
"grad_norm": 15.881913185119629,
"learning_rate": 6.94606835578699e-08,
"logits/chosen": -1.2734375,
"logits/rejected": -1.09375,
"logps/chosen": -267.5,
"logps/rejected": -279.0,
"loss": 0.39,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.5244140625,
"rewards/margins": 2.265625,
"rewards/rejected": -1.734375,
"step": 692
},
{
"epoch": 0.7665929203539823,
"grad_norm": 18.117141723632812,
"learning_rate": 6.884166025583043e-08,
"logits/chosen": -1.19140625,
"logits/rejected": -1.13671875,
"logps/chosen": -289.0,
"logps/rejected": -318.0,
"loss": 0.3893,
"rewards/accuracies": 0.734375,
"rewards/chosen": 0.591796875,
"rewards/margins": 2.71875,
"rewards/rejected": -2.1328125,
"step": 693
},
{
"epoch": 0.7676991150442478,
"grad_norm": 14.054704666137695,
"learning_rate": 6.822496691880275e-08,
"logits/chosen": -1.34765625,
"logits/rejected": -1.18359375,
"logps/chosen": -250.5,
"logps/rejected": -272.0,
"loss": 0.3268,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.708984375,
"rewards/margins": 2.6875,
"rewards/rejected": -1.9765625,
"step": 694
},
{
"epoch": 0.7688053097345132,
"grad_norm": 11.454045295715332,
"learning_rate": 6.761061147837807e-08,
"logits/chosen": -1.41796875,
"logits/rejected": -1.12890625,
"logps/chosen": -254.5,
"logps/rejected": -293.0,
"loss": 0.2523,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.861328125,
"rewards/margins": 3.1484375,
"rewards/rejected": -2.28125,
"step": 695
},
{
"epoch": 0.7699115044247787,
"grad_norm": 13.245506286621094,
"learning_rate": 6.699860183607894e-08,
"logits/chosen": -1.359375,
"logits/rejected": -1.140625,
"logps/chosen": -275.0,
"logps/rejected": -273.0,
"loss": 0.3098,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.4716796875,
"rewards/margins": 2.71875,
"rewards/rejected": -2.25,
"step": 696
},
{
"epoch": 0.7710176991150443,
"grad_norm": 13.23479175567627,
"learning_rate": 6.638894586325719e-08,
"logits/chosen": -1.2890625,
"logits/rejected": -1.19140625,
"logps/chosen": -246.0,
"logps/rejected": -288.0,
"loss": 0.2909,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.5712890625,
"rewards/margins": 2.859375,
"rewards/rejected": -2.296875,
"step": 697
},
{
"epoch": 0.7721238938053098,
"grad_norm": 14.469060897827148,
"learning_rate": 6.578165140099317e-08,
"logits/chosen": -1.26953125,
"logits/rejected": -1.20703125,
"logps/chosen": -252.5,
"logps/rejected": -277.0,
"loss": 0.3493,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.79296875,
"rewards/margins": 2.8359375,
"rewards/rejected": -2.04296875,
"step": 698
},
{
"epoch": 0.7732300884955752,
"grad_norm": 15.471959114074707,
"learning_rate": 6.517672625999465e-08,
"logits/chosen": -1.20703125,
"logits/rejected": -1.09375,
"logps/chosen": -254.0,
"logps/rejected": -284.0,
"loss": 0.3456,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.5849609375,
"rewards/margins": 2.78125,
"rewards/rejected": -2.203125,
"step": 699
},
{
"epoch": 0.7743362831858407,
"grad_norm": 13.424947738647461,
"learning_rate": 6.457417822049627e-08,
"logits/chosen": -1.35546875,
"logits/rejected": -1.13671875,
"logps/chosen": -260.0,
"logps/rejected": -280.0,
"loss": 0.3278,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.681640625,
"rewards/margins": 2.5703125,
"rewards/rejected": -1.890625,
"step": 700
},
{
"epoch": 0.7743362831858407,
"eval_logits/chosen": -1.267957091331482,
"eval_logits/rejected": -1.1595537662506104,
"eval_logps/chosen": -253.52735900878906,
"eval_logps/rejected": -279.9950256347656,
"eval_loss": 0.3205508887767792,
"eval_rewards/accuracies": 0.8137379288673401,
"eval_rewards/chosen": 0.73013836145401,
"eval_rewards/margins": 2.814093589782715,
"eval_rewards/rejected": -2.0833332538604736,
"eval_runtime": 193.1227,
"eval_samples_per_second": 66.554,
"eval_steps_per_second": 1.041,
"step": 700
},
{
"epoch": 0.7754424778761062,
"grad_norm": 14.452095985412598,
"learning_rate": 6.397401503215991e-08,
"logits/chosen": -1.2578125,
"logits/rejected": -1.125,
"logps/chosen": -268.0,
"logps/rejected": -296.0,
"loss": 0.3012,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.92578125,
"rewards/margins": 3.0703125,
"rewards/rejected": -2.14453125,
"step": 701
},
{
"epoch": 0.7765486725663717,
"grad_norm": 13.497214317321777,
"learning_rate": 6.33762444139744e-08,
"logits/chosen": -1.53515625,
"logits/rejected": -1.15625,
"logps/chosen": -244.5,
"logps/rejected": -292.0,
"loss": 0.3147,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.78515625,
"rewards/margins": 2.78125,
"rewards/rejected": -2.0,
"step": 702
},
{
"epoch": 0.7776548672566371,
"grad_norm": 13.972166061401367,
"learning_rate": 6.278087405415683e-08,
"logits/chosen": -1.3125,
"logits/rejected": -1.140625,
"logps/chosen": -258.0,
"logps/rejected": -260.0,
"loss": 0.2868,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.8046875,
"rewards/margins": 3.03125,
"rewards/rejected": -2.2265625,
"step": 703
},
{
"epoch": 0.7787610619469026,
"grad_norm": 13.861830711364746,
"learning_rate": 6.218791161005335e-08,
"logits/chosen": -1.2265625,
"logits/rejected": -1.09765625,
"logps/chosen": -238.5,
"logps/rejected": -299.0,
"loss": 0.2945,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.76953125,
"rewards/margins": 3.0,
"rewards/rejected": -2.234375,
"step": 704
},
{
"epoch": 0.7798672566371682,
"grad_norm": 15.766735076904297,
"learning_rate": 6.159736470804059e-08,
"logits/chosen": -1.3046875,
"logits/rejected": -1.2421875,
"logps/chosen": -250.5,
"logps/rejected": -261.0,
"loss": 0.3834,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.6416015625,
"rewards/margins": 2.21875,
"rewards/rejected": -1.57421875,
"step": 705
},
{
"epoch": 0.7809734513274337,
"grad_norm": 13.845602989196777,
"learning_rate": 6.100924094342785e-08,
"logits/chosen": -1.4140625,
"logits/rejected": -1.19921875,
"logps/chosen": -230.5,
"logps/rejected": -236.0,
"loss": 0.3024,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.705078125,
"rewards/margins": 2.9609375,
"rewards/rejected": -2.2578125,
"step": 706
},
{
"epoch": 0.7820796460176991,
"grad_norm": 14.929936408996582,
"learning_rate": 6.042354788035942e-08,
"logits/chosen": -1.18359375,
"logits/rejected": -1.04296875,
"logps/chosen": -269.0,
"logps/rejected": -291.0,
"loss": 0.3403,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 0.6162109375,
"rewards/margins": 2.7109375,
"rewards/rejected": -2.09765625,
"step": 707
},
{
"epoch": 0.7831858407079646,
"grad_norm": 13.97938346862793,
"learning_rate": 5.984029305171678e-08,
"logits/chosen": -1.26953125,
"logits/rejected": -1.2734375,
"logps/chosen": -245.0,
"logps/rejected": -287.0,
"loss": 0.2896,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.0,
"rewards/margins": 3.21875,
"rewards/rejected": -2.21875,
"step": 708
},
{
"epoch": 0.7842920353982301,
"grad_norm": 14.008685111999512,
"learning_rate": 5.925948395902253e-08,
"logits/chosen": -1.32421875,
"logits/rejected": -1.21875,
"logps/chosen": -272.0,
"logps/rejected": -313.0,
"loss": 0.3008,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.884765625,
"rewards/margins": 2.9765625,
"rewards/rejected": -2.0859375,
"step": 709
},
{
"epoch": 0.7853982300884956,
"grad_norm": 12.459348678588867,
"learning_rate": 5.868112807234313e-08,
"logits/chosen": -1.29296875,
"logits/rejected": -1.140625,
"logps/chosen": -269.0,
"logps/rejected": -371.0,
"loss": 0.262,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.994140625,
"rewards/margins": 3.0390625,
"rewards/rejected": -2.046875,
"step": 710
},
{
"epoch": 0.786504424778761,
"grad_norm": 14.358124732971191,
"learning_rate": 5.810523283019339e-08,
"logits/chosen": -1.3046875,
"logits/rejected": -1.1875,
"logps/chosen": -282.0,
"logps/rejected": -273.0,
"loss": 0.3423,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.484375,
"rewards/margins": 2.484375,
"rewards/rejected": -2.00390625,
"step": 711
},
{
"epoch": 0.7876106194690266,
"grad_norm": 12.388589859008789,
"learning_rate": 5.753180563944057e-08,
"logits/chosen": -1.37109375,
"logits/rejected": -1.09375,
"logps/chosen": -232.0,
"logps/rejected": -247.5,
"loss": 0.2437,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.931640625,
"rewards/margins": 3.46875,
"rewards/rejected": -2.5390625,
"step": 712
},
{
"epoch": 0.7887168141592921,
"grad_norm": 12.301764488220215,
"learning_rate": 5.6960853875208935e-08,
"logits/chosen": -1.19921875,
"logits/rejected": -1.1640625,
"logps/chosen": -252.0,
"logps/rejected": -267.0,
"loss": 0.3027,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.671875,
"rewards/margins": 2.8125,
"rewards/rejected": -2.1328125,
"step": 713
},
{
"epoch": 0.7898230088495575,
"grad_norm": 14.501238822937012,
"learning_rate": 5.6392384880785294e-08,
"logits/chosen": -1.37109375,
"logits/rejected": -1.2265625,
"logps/chosen": -276.0,
"logps/rejected": -285.0,
"loss": 0.3198,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.501953125,
"rewards/margins": 2.6328125,
"rewards/rejected": -2.125,
"step": 714
},
{
"epoch": 0.790929203539823,
"grad_norm": 12.956294059753418,
"learning_rate": 5.5826405967524357e-08,
"logits/chosen": -1.1484375,
"logits/rejected": -1.0859375,
"logps/chosen": -255.0,
"logps/rejected": -297.0,
"loss": 0.272,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.779296875,
"rewards/margins": 3.203125,
"rewards/rejected": -2.421875,
"step": 715
},
{
"epoch": 0.7920353982300885,
"grad_norm": 14.246673583984375,
"learning_rate": 5.526292441475447e-08,
"logits/chosen": -1.32421875,
"logits/rejected": -1.140625,
"logps/chosen": -269.0,
"logps/rejected": -308.0,
"loss": 0.2897,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.818359375,
"rewards/margins": 2.765625,
"rewards/rejected": -1.94921875,
"step": 716
},
{
"epoch": 0.793141592920354,
"grad_norm": 14.141976356506348,
"learning_rate": 5.470194746968451e-08,
"logits/chosen": -1.265625,
"logits/rejected": -1.2265625,
"logps/chosen": -246.0,
"logps/rejected": -288.0,
"loss": 0.3056,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.7265625,
"rewards/margins": 3.0859375,
"rewards/rejected": -2.359375,
"step": 717
},
{
"epoch": 0.7942477876106194,
"grad_norm": 13.89908218383789,
"learning_rate": 5.4143482347310116e-08,
"logits/chosen": -1.3046875,
"logits/rejected": -1.10546875,
"logps/chosen": -273.0,
"logps/rejected": -295.0,
"loss": 0.3041,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.87890625,
"rewards/margins": 2.828125,
"rewards/rejected": -1.94921875,
"step": 718
},
{
"epoch": 0.7953539823008849,
"grad_norm": 11.489982604980469,
"learning_rate": 5.358753623032136e-08,
"logits/chosen": -1.359375,
"logits/rejected": -1.12109375,
"logps/chosen": -228.0,
"logps/rejected": -249.0,
"loss": 0.2602,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.88671875,
"rewards/margins": 3.09375,
"rewards/rejected": -2.2109375,
"step": 719
},
{
"epoch": 0.7964601769911505,
"grad_norm": 13.466360092163086,
"learning_rate": 5.3034116269010194e-08,
"logits/chosen": -1.453125,
"logits/rejected": -1.234375,
"logps/chosen": -269.0,
"logps/rejected": -295.0,
"loss": 0.3119,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.59765625,
"rewards/margins": 2.4921875,
"rewards/rejected": -1.890625,
"step": 720
},
{
"epoch": 0.797566371681416,
"grad_norm": 15.516824722290039,
"learning_rate": 5.248322958117815e-08,
"logits/chosen": -1.1875,
"logits/rejected": -1.15625,
"logps/chosen": -261.0,
"logps/rejected": -273.0,
"loss": 0.361,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.91015625,
"rewards/margins": 2.6875,
"rewards/rejected": -1.78125,
"step": 721
},
{
"epoch": 0.7986725663716814,
"grad_norm": 14.861969947814941,
"learning_rate": 5.1934883252045507e-08,
"logits/chosen": -1.234375,
"logits/rejected": -1.16796875,
"logps/chosen": -249.5,
"logps/rejected": -279.0,
"loss": 0.3549,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.65625,
"rewards/margins": 2.8046875,
"rewards/rejected": -2.1484375,
"step": 722
},
{
"epoch": 0.7997787610619469,
"grad_norm": 14.74849796295166,
"learning_rate": 5.138908433415945e-08,
"logits/chosen": -1.28125,
"logits/rejected": -1.23828125,
"logps/chosen": -271.0,
"logps/rejected": -311.0,
"loss": 0.2943,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.8203125,
"rewards/margins": 3.1171875,
"rewards/rejected": -2.3046875,
"step": 723
},
{
"epoch": 0.8008849557522124,
"grad_norm": 13.291254043579102,
"learning_rate": 5.0845839847303894e-08,
"logits/chosen": -1.25390625,
"logits/rejected": -1.11328125,
"logps/chosen": -244.5,
"logps/rejected": -257.0,
"loss": 0.3242,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.666015625,
"rewards/margins": 2.8671875,
"rewards/rejected": -2.1953125,
"step": 724
},
{
"epoch": 0.8019911504424779,
"grad_norm": 12.395694732666016,
"learning_rate": 5.030515677840882e-08,
"logits/chosen": -1.1875,
"logits/rejected": -1.1171875,
"logps/chosen": -240.5,
"logps/rejected": -276.0,
"loss": 0.3041,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.7890625,
"rewards/margins": 3.03125,
"rewards/rejected": -2.2421875,
"step": 725
},
{
"epoch": 0.8030973451327433,
"grad_norm": 13.156864166259766,
"learning_rate": 4.9767042081460626e-08,
"logits/chosen": -1.421875,
"logits/rejected": -1.12109375,
"logps/chosen": -253.5,
"logps/rejected": -286.0,
"loss": 0.2806,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.701171875,
"rewards/margins": 2.8359375,
"rewards/rejected": -2.1328125,
"step": 726
},
{
"epoch": 0.8042035398230089,
"grad_norm": 13.708073616027832,
"learning_rate": 4.923150267741266e-08,
"logits/chosen": -1.1953125,
"logits/rejected": -1.13671875,
"logps/chosen": -272.0,
"logps/rejected": -310.0,
"loss": 0.2606,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.796875,
"rewards/margins": 3.2734375,
"rewards/rejected": -2.4765625,
"step": 727
},
{
"epoch": 0.8053097345132744,
"grad_norm": 13.454339981079102,
"learning_rate": 4.869854545409627e-08,
"logits/chosen": -1.2578125,
"logits/rejected": -1.2109375,
"logps/chosen": -243.0,
"logps/rejected": -296.0,
"loss": 0.2951,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.841796875,
"rewards/margins": 2.8671875,
"rewards/rejected": -2.02734375,
"step": 728
},
{
"epoch": 0.8064159292035398,
"grad_norm": 13.385002136230469,
"learning_rate": 4.816817726613187e-08,
"logits/chosen": -1.30078125,
"logits/rejected": -1.20703125,
"logps/chosen": -255.5,
"logps/rejected": -267.0,
"loss": 0.3009,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.5869140625,
"rewards/margins": 2.875,
"rewards/rejected": -2.2890625,
"step": 729
},
{
"epoch": 0.8075221238938053,
"grad_norm": 11.77560806274414,
"learning_rate": 4.7640404934841284e-08,
"logits/chosen": -1.25390625,
"logits/rejected": -1.1171875,
"logps/chosen": -239.5,
"logps/rejected": -256.5,
"loss": 0.2937,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.65234375,
"rewards/margins": 2.7734375,
"rewards/rejected": -2.12109375,
"step": 730
},
{
"epoch": 0.8086283185840708,
"grad_norm": 14.025035858154297,
"learning_rate": 4.7115235248159776e-08,
"logits/chosen": -1.3359375,
"logits/rejected": -1.14453125,
"logps/chosen": -283.0,
"logps/rejected": -303.0,
"loss": 0.2726,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.037109375,
"rewards/margins": 3.1328125,
"rewards/rejected": -2.10546875,
"step": 731
},
{
"epoch": 0.8097345132743363,
"grad_norm": 13.514138221740723,
"learning_rate": 4.659267496054847e-08,
"logits/chosen": -1.2890625,
"logits/rejected": -1.091796875,
"logps/chosen": -248.5,
"logps/rejected": -266.5,
"loss": 0.2988,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.646484375,
"rewards/margins": 2.84375,
"rewards/rejected": -2.1953125,
"step": 732
},
{
"epoch": 0.8108407079646017,
"grad_norm": 15.020828247070312,
"learning_rate": 4.60727307929081e-08,
"logits/chosen": -1.28125,
"logits/rejected": -1.06640625,
"logps/chosen": -258.5,
"logps/rejected": -275.0,
"loss": 0.3037,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.75390625,
"rewards/margins": 3.09375,
"rewards/rejected": -2.34375,
"step": 733
},
{
"epoch": 0.8119469026548672,
"grad_norm": 14.957762718200684,
"learning_rate": 4.555540943249187e-08,
"logits/chosen": -1.3515625,
"logits/rejected": -1.19921875,
"logps/chosen": -248.5,
"logps/rejected": -304.0,
"loss": 0.3,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.705078125,
"rewards/margins": 2.8515625,
"rewards/rejected": -2.1484375,
"step": 734
},
{
"epoch": 0.8130530973451328,
"grad_norm": 12.412934303283691,
"learning_rate": 4.5040717532820046e-08,
"logits/chosen": -1.28125,
"logits/rejected": -1.15234375,
"logps/chosen": -257.5,
"logps/rejected": -296.0,
"loss": 0.282,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.7607421875,
"rewards/margins": 2.953125,
"rewards/rejected": -2.1953125,
"step": 735
},
{
"epoch": 0.8141592920353983,
"grad_norm": 15.76734733581543,
"learning_rate": 4.4528661713594125e-08,
"logits/chosen": -1.3515625,
"logits/rejected": -1.1796875,
"logps/chosen": -238.5,
"logps/rejected": -262.0,
"loss": 0.3355,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.8427734375,
"rewards/margins": 2.96875,
"rewards/rejected": -2.125,
"step": 736
},
{
"epoch": 0.8152654867256637,
"grad_norm": 16.009498596191406,
"learning_rate": 4.4019248560611454e-08,
"logits/chosen": -1.1953125,
"logits/rejected": -1.16796875,
"logps/chosen": -271.0,
"logps/rejected": -292.0,
"loss": 0.34,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.8203125,
"rewards/margins": 2.7578125,
"rewards/rejected": -1.9375,
"step": 737
},
{
"epoch": 0.8163716814159292,
"grad_norm": 12.171030044555664,
"learning_rate": 4.3512484625681e-08,
"logits/chosen": -1.29296875,
"logits/rejected": -1.046875,
"logps/chosen": -257.0,
"logps/rejected": -285.0,
"loss": 0.2528,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.6953125,
"rewards/margins": 2.9921875,
"rewards/rejected": -2.2890625,
"step": 738
},
{
"epoch": 0.8174778761061947,
"grad_norm": 14.278532981872559,
"learning_rate": 4.3008376426538903e-08,
"logits/chosen": -1.31640625,
"logits/rejected": -1.2265625,
"logps/chosen": -250.5,
"logps/rejected": -258.5,
"loss": 0.3722,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.5029296875,
"rewards/margins": 2.4921875,
"rewards/rejected": -1.9921875,
"step": 739
},
{
"epoch": 0.8185840707964602,
"grad_norm": 12.9563570022583,
"learning_rate": 4.250693044676429e-08,
"logits/chosen": -1.2734375,
"logits/rejected": -1.17578125,
"logps/chosen": -270.0,
"logps/rejected": -287.0,
"loss": 0.2685,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.685546875,
"rewards/margins": 3.1640625,
"rewards/rejected": -2.4765625,
"step": 740
},
{
"epoch": 0.8196902654867256,
"grad_norm": 13.755107879638672,
"learning_rate": 4.2008153135696584e-08,
"logits/chosen": -1.21484375,
"logits/rejected": -1.09375,
"logps/chosen": -246.5,
"logps/rejected": -285.0,
"loss": 0.3042,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.736328125,
"rewards/margins": 2.9765625,
"rewards/rejected": -2.2421875,
"step": 741
},
{
"epoch": 0.8207964601769911,
"grad_norm": 12.855173110961914,
"learning_rate": 4.151205090835183e-08,
"logits/chosen": -1.2734375,
"logits/rejected": -1.10546875,
"logps/chosen": -243.5,
"logps/rejected": -283.0,
"loss": 0.2732,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.916015625,
"rewards/margins": 3.0859375,
"rewards/rejected": -2.1640625,
"step": 742
},
{
"epoch": 0.8219026548672567,
"grad_norm": 15.404345512390137,
"learning_rate": 4.1018630145340735e-08,
"logits/chosen": -1.29296875,
"logits/rejected": -1.34375,
"logps/chosen": -255.0,
"logps/rejected": -262.5,
"loss": 0.2993,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.810546875,
"rewards/margins": 3.09375,
"rewards/rejected": -2.28125,
"step": 743
},
{
"epoch": 0.8230088495575221,
"grad_norm": 13.697175979614258,
"learning_rate": 4.0527897192786433e-08,
"logits/chosen": -1.2421875,
"logits/rejected": -1.171875,
"logps/chosen": -279.0,
"logps/rejected": -289.0,
"loss": 0.2732,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.83984375,
"rewards/margins": 3.046875,
"rewards/rejected": -2.19921875,
"step": 744
},
{
"epoch": 0.8241150442477876,
"grad_norm": 14.881061553955078,
"learning_rate": 4.003985836224255e-08,
"logits/chosen": -1.296875,
"logits/rejected": -1.30859375,
"logps/chosen": -256.5,
"logps/rejected": -284.0,
"loss": 0.3474,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.5068359375,
"rewards/margins": 2.375,
"rewards/rejected": -1.87109375,
"step": 745
},
{
"epoch": 0.8252212389380531,
"grad_norm": 13.085796356201172,
"learning_rate": 3.955451993061268e-08,
"logits/chosen": -1.33984375,
"logits/rejected": -1.12890625,
"logps/chosen": -258.0,
"logps/rejected": -292.0,
"loss": 0.2616,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.91796875,
"rewards/margins": 3.09375,
"rewards/rejected": -2.1796875,
"step": 746
},
{
"epoch": 0.8263274336283186,
"grad_norm": 13.392922401428223,
"learning_rate": 3.9071888140068926e-08,
"logits/chosen": -1.2109375,
"logits/rejected": -1.16796875,
"logps/chosen": -256.0,
"logps/rejected": -316.0,
"loss": 0.2815,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.986328125,
"rewards/margins": 3.1171875,
"rewards/rejected": -2.1328125,
"step": 747
},
{
"epoch": 0.827433628318584,
"grad_norm": 12.065234184265137,
"learning_rate": 3.859196919797228e-08,
"logits/chosen": -1.3359375,
"logits/rejected": -1.12109375,
"logps/chosen": -247.5,
"logps/rejected": -264.0,
"loss": 0.3147,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.7265625,
"rewards/margins": 2.9609375,
"rewards/rejected": -2.2421875,
"step": 748
},
{
"epoch": 0.8285398230088495,
"grad_norm": 14.233034133911133,
"learning_rate": 3.811476927679227e-08,
"logits/chosen": -1.16015625,
"logits/rejected": -1.1640625,
"logps/chosen": -265.0,
"logps/rejected": -300.0,
"loss": 0.3261,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.6884765625,
"rewards/margins": 2.75,
"rewards/rejected": -2.0703125,
"step": 749
},
{
"epoch": 0.8296460176991151,
"grad_norm": 14.785301208496094,
"learning_rate": 3.764029451402778e-08,
"logits/chosen": -1.265625,
"logits/rejected": -1.19140625,
"logps/chosen": -236.0,
"logps/rejected": -282.0,
"loss": 0.297,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.791015625,
"rewards/margins": 3.1640625,
"rewards/rejected": -2.375,
"step": 750
},
{
"epoch": 0.8296460176991151,
"eval_logits/chosen": -1.270017147064209,
"eval_logits/rejected": -1.1617498397827148,
"eval_logps/chosen": -253.53233337402344,
"eval_logps/rejected": -280.19403076171875,
"eval_loss": 0.31980380415916443,
"eval_rewards/accuracies": 0.8147646188735962,
"eval_rewards/chosen": 0.72982257604599,
"eval_rewards/margins": 2.827347755432129,
"eval_rewards/rejected": -2.09759783744812,
"eval_runtime": 193.0983,
"eval_samples_per_second": 66.562,
"eval_steps_per_second": 1.041,
"step": 750
},
{
"epoch": 0.8307522123893806,
"grad_norm": 13.682051658630371,
"learning_rate": 3.716855101212826e-08,
"logits/chosen": -1.26953125,
"logits/rejected": -1.1953125,
"logps/chosen": -270.0,
"logps/rejected": -284.5,
"loss": 0.3091,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.740234375,
"rewards/margins": 2.8125,
"rewards/rejected": -2.0703125,
"step": 751
},
{
"epoch": 0.831858407079646,
"grad_norm": 14.422385215759277,
"learning_rate": 3.6699544838415034e-08,
"logits/chosen": -1.328125,
"logits/rejected": -1.1015625,
"logps/chosen": -268.0,
"logps/rejected": -266.0,
"loss": 0.3043,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.724609375,
"rewards/margins": 2.875,
"rewards/rejected": -2.15234375,
"step": 752
},
{
"epoch": 0.8329646017699115,
"grad_norm": 14.678279876708984,
"learning_rate": 3.623328202500322e-08,
"logits/chosen": -1.21484375,
"logits/rejected": -1.13671875,
"logps/chosen": -280.0,
"logps/rejected": -305.0,
"loss": 0.3304,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.6796875,
"rewards/margins": 2.4765625,
"rewards/rejected": -1.796875,
"step": 753
},
{
"epoch": 0.834070796460177,
"grad_norm": 12.621984481811523,
"learning_rate": 3.576976856872438e-08,
"logits/chosen": -1.421875,
"logits/rejected": -1.0859375,
"logps/chosen": -252.0,
"logps/rejected": -276.0,
"loss": 0.294,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.73046875,
"rewards/margins": 2.8515625,
"rewards/rejected": -2.1171875,
"step": 754
},
{
"epoch": 0.8351769911504425,
"grad_norm": 11.676498413085938,
"learning_rate": 3.530901043104928e-08,
"logits/chosen": -1.26171875,
"logits/rejected": -1.1953125,
"logps/chosen": -227.0,
"logps/rejected": -266.0,
"loss": 0.2778,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 1.048828125,
"rewards/margins": 3.109375,
"rewards/rejected": -2.0625,
"step": 755
},
{
"epoch": 0.8362831858407079,
"grad_norm": 12.77115249633789,
"learning_rate": 3.4851013538011035e-08,
"logits/chosen": -1.3125,
"logits/rejected": -1.17578125,
"logps/chosen": -252.5,
"logps/rejected": -290.0,
"loss": 0.2771,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.7578125,
"rewards/margins": 3.0078125,
"rewards/rejected": -2.2578125,
"step": 756
},
{
"epoch": 0.8373893805309734,
"grad_norm": 13.537567138671875,
"learning_rate": 3.439578378012925e-08,
"logits/chosen": -1.34765625,
"logits/rejected": -1.17578125,
"logps/chosen": -251.0,
"logps/rejected": -285.5,
"loss": 0.2978,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.9921875,
"rewards/margins": 3.140625,
"rewards/rejected": -2.1484375,
"step": 757
},
{
"epoch": 0.838495575221239,
"grad_norm": 12.62022590637207,
"learning_rate": 3.394332701233391e-08,
"logits/chosen": -1.265625,
"logits/rejected": -1.1796875,
"logps/chosen": -242.5,
"logps/rejected": -261.0,
"loss": 0.2755,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.8359375,
"rewards/margins": 3.0078125,
"rewards/rejected": -2.1640625,
"step": 758
},
{
"epoch": 0.8396017699115044,
"grad_norm": 14.283227920532227,
"learning_rate": 3.349364905389032e-08,
"logits/chosen": -1.15234375,
"logits/rejected": -1.158203125,
"logps/chosen": -269.0,
"logps/rejected": -290.0,
"loss": 0.3305,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.658203125,
"rewards/margins": 2.8828125,
"rewards/rejected": -2.2265625,
"step": 759
},
{
"epoch": 0.8407079646017699,
"grad_norm": 12.961087226867676,
"learning_rate": 3.304675568832427e-08,
"logits/chosen": -1.265625,
"logits/rejected": -1.109375,
"logps/chosen": -263.0,
"logps/rejected": -279.5,
"loss": 0.3033,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.755859375,
"rewards/margins": 2.5078125,
"rewards/rejected": -1.75,
"step": 760
},
{
"epoch": 0.8418141592920354,
"grad_norm": 14.768875122070312,
"learning_rate": 3.260265266334725e-08,
"logits/chosen": -1.21484375,
"logits/rejected": -1.125,
"logps/chosen": -256.0,
"logps/rejected": -282.0,
"loss": 0.382,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.765625,
"rewards/margins": 2.40234375,
"rewards/rejected": -1.63671875,
"step": 761
},
{
"epoch": 0.8429203539823009,
"grad_norm": 16.72699546813965,
"learning_rate": 3.216134569078316e-08,
"logits/chosen": -1.25,
"logits/rejected": -1.22265625,
"logps/chosen": -266.0,
"logps/rejected": -300.0,
"loss": 0.3642,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.90625,
"rewards/margins": 2.7421875,
"rewards/rejected": -1.83203125,
"step": 762
},
{
"epoch": 0.8440265486725663,
"grad_norm": 12.911907196044922,
"learning_rate": 3.172284044649437e-08,
"logits/chosen": -1.265625,
"logits/rejected": -1.1171875,
"logps/chosen": -260.5,
"logps/rejected": -308.0,
"loss": 0.3017,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.6640625,
"rewards/margins": 2.6640625,
"rewards/rejected": -2.0,
"step": 763
},
{
"epoch": 0.8451327433628318,
"grad_norm": 15.997196197509766,
"learning_rate": 3.128714257030882e-08,
"logits/chosen": -1.27734375,
"logits/rejected": -1.15625,
"logps/chosen": -284.0,
"logps/rejected": -301.0,
"loss": 0.3964,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 0.607421875,
"rewards/margins": 2.3203125,
"rewards/rejected": -1.71875,
"step": 764
},
{
"epoch": 0.8462389380530974,
"grad_norm": 14.732622146606445,
"learning_rate": 3.085425766594768e-08,
"logits/chosen": -1.2265625,
"logits/rejected": -1.23828125,
"logps/chosen": -262.0,
"logps/rejected": -245.0,
"loss": 0.3107,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.724609375,
"rewards/margins": 2.9375,
"rewards/rejected": -2.21875,
"step": 765
},
{
"epoch": 0.8473451327433629,
"grad_norm": 14.123418807983398,
"learning_rate": 3.042419130095292e-08,
"logits/chosen": -1.26953125,
"logits/rejected": -1.19140625,
"logps/chosen": -249.0,
"logps/rejected": -303.0,
"loss": 0.2951,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.73828125,
"rewards/margins": 3.2109375,
"rewards/rejected": -2.4765625,
"step": 766
},
{
"epoch": 0.8484513274336283,
"grad_norm": 15.25007438659668,
"learning_rate": 2.999694900661609e-08,
"logits/chosen": -1.2578125,
"logits/rejected": -1.12109375,
"logps/chosen": -280.0,
"logps/rejected": -286.0,
"loss": 0.3976,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.296875,
"rewards/margins": 1.96484375,
"rewards/rejected": -1.66796875,
"step": 767
},
{
"epoch": 0.8495575221238938,
"grad_norm": 14.552936553955078,
"learning_rate": 2.9572536277906984e-08,
"logits/chosen": -1.2421875,
"logits/rejected": -1.15234375,
"logps/chosen": -251.5,
"logps/rejected": -294.0,
"loss": 0.3292,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.5849609375,
"rewards/margins": 2.734375,
"rewards/rejected": -2.1484375,
"step": 768
},
{
"epoch": 0.8506637168141593,
"grad_norm": 12.925614356994629,
"learning_rate": 2.9150958573402885e-08,
"logits/chosen": -1.3828125,
"logits/rejected": -1.171875,
"logps/chosen": -267.0,
"logps/rejected": -305.0,
"loss": 0.2762,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.775390625,
"rewards/margins": 2.8984375,
"rewards/rejected": -2.125,
"step": 769
},
{
"epoch": 0.8517699115044248,
"grad_norm": 14.300766944885254,
"learning_rate": 2.8732221315218573e-08,
"logits/chosen": -1.18359375,
"logits/rejected": -1.12109375,
"logps/chosen": -257.0,
"logps/rejected": -279.0,
"loss": 0.344,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.52587890625,
"rewards/margins": 2.484375,
"rewards/rejected": -1.953125,
"step": 770
},
{
"epoch": 0.8528761061946902,
"grad_norm": 13.133272171020508,
"learning_rate": 2.8316329888936315e-08,
"logits/chosen": -1.2578125,
"logits/rejected": -1.1015625,
"logps/chosen": -250.5,
"logps/rejected": -269.0,
"loss": 0.2487,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.927734375,
"rewards/margins": 3.046875,
"rewards/rejected": -2.12109375,
"step": 771
},
{
"epoch": 0.8539823008849557,
"grad_norm": 12.045042991638184,
"learning_rate": 2.7903289643537e-08,
"logits/chosen": -1.34375,
"logits/rejected": -1.1171875,
"logps/chosen": -256.5,
"logps/rejected": -267.5,
"loss": 0.2765,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.76171875,
"rewards/margins": 2.96875,
"rewards/rejected": -2.2109375,
"step": 772
},
{
"epoch": 0.8550884955752213,
"grad_norm": 12.052350044250488,
"learning_rate": 2.7493105891330832e-08,
"logits/chosen": -1.28125,
"logits/rejected": -1.16015625,
"logps/chosen": -240.0,
"logps/rejected": -274.0,
"loss": 0.2838,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.81640625,
"rewards/margins": 3.1171875,
"rewards/rejected": -2.296875,
"step": 773
},
{
"epoch": 0.8561946902654868,
"grad_norm": 12.869089126586914,
"learning_rate": 2.7085783907889514e-08,
"logits/chosen": -1.26953125,
"logits/rejected": -1.1796875,
"logps/chosen": -260.0,
"logps/rejected": -274.0,
"loss": 0.3115,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.6953125,
"rewards/margins": 2.546875,
"rewards/rejected": -1.8515625,
"step": 774
},
{
"epoch": 0.8573008849557522,
"grad_norm": 13.210247993469238,
"learning_rate": 2.6681328931977942e-08,
"logits/chosen": -1.2578125,
"logits/rejected": -1.03125,
"logps/chosen": -247.5,
"logps/rejected": -286.0,
"loss": 0.2939,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.896484375,
"rewards/margins": 2.9765625,
"rewards/rejected": -2.078125,
"step": 775
},
{
"epoch": 0.8584070796460177,
"grad_norm": 13.413789749145508,
"learning_rate": 2.6279746165487255e-08,
"logits/chosen": -1.26171875,
"logits/rejected": -1.171875,
"logps/chosen": -267.0,
"logps/rejected": -282.0,
"loss": 0.3004,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.658203125,
"rewards/margins": 2.6015625,
"rewards/rejected": -1.94921875,
"step": 776
},
{
"epoch": 0.8595132743362832,
"grad_norm": 13.01457691192627,
"learning_rate": 2.5881040773367502e-08,
"logits/chosen": -1.1875,
"logits/rejected": -1.05859375,
"logps/chosen": -240.0,
"logps/rejected": -257.0,
"loss": 0.3088,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.716796875,
"rewards/margins": 3.03125,
"rewards/rejected": -2.31640625,
"step": 777
},
{
"epoch": 0.8606194690265486,
"grad_norm": 12.700637817382812,
"learning_rate": 2.5485217883561616e-08,
"logits/chosen": -1.30859375,
"logits/rejected": -1.1484375,
"logps/chosen": -248.5,
"logps/rejected": -279.0,
"loss": 0.2977,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.826171875,
"rewards/margins": 2.9375,
"rewards/rejected": -2.11328125,
"step": 778
},
{
"epoch": 0.8617256637168141,
"grad_norm": 13.09081745147705,
"learning_rate": 2.5092282586939183e-08,
"logits/chosen": -1.3828125,
"logits/rejected": -1.1328125,
"logps/chosen": -272.0,
"logps/rejected": -284.5,
"loss": 0.2959,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.755859375,
"rewards/margins": 2.7265625,
"rewards/rejected": -1.96875,
"step": 779
},
{
"epoch": 0.8628318584070797,
"grad_norm": 12.912965774536133,
"learning_rate": 2.470223993723103e-08,
"logits/chosen": -1.171875,
"logits/rejected": -1.10546875,
"logps/chosen": -259.0,
"logps/rejected": -283.5,
"loss": 0.304,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.73828125,
"rewards/margins": 2.9609375,
"rewards/rejected": -2.21875,
"step": 780
},
{
"epoch": 0.8639380530973452,
"grad_norm": 13.398490905761719,
"learning_rate": 2.4315094950964343e-08,
"logits/chosen": -1.375,
"logits/rejected": -1.1953125,
"logps/chosen": -272.5,
"logps/rejected": -278.5,
"loss": 0.3286,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.61328125,
"rewards/margins": 2.4140625,
"rewards/rejected": -1.796875,
"step": 781
},
{
"epoch": 0.8650442477876106,
"grad_norm": 13.045671463012695,
"learning_rate": 2.393085260739794e-08,
"logits/chosen": -1.36328125,
"logits/rejected": -1.15625,
"logps/chosen": -242.0,
"logps/rejected": -263.5,
"loss": 0.3228,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.80859375,
"rewards/margins": 2.7421875,
"rewards/rejected": -1.94140625,
"step": 782
},
{
"epoch": 0.8661504424778761,
"grad_norm": 15.309684753417969,
"learning_rate": 2.3549517848458435e-08,
"logits/chosen": -1.26171875,
"logits/rejected": -1.1328125,
"logps/chosen": -279.0,
"logps/rejected": -301.0,
"loss": 0.3618,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.724609375,
"rewards/margins": 2.4765625,
"rewards/rejected": -1.75,
"step": 783
},
{
"epoch": 0.8672566371681416,
"grad_norm": 12.972829818725586,
"learning_rate": 2.3171095578676637e-08,
"logits/chosen": -1.2578125,
"logits/rejected": -1.1171875,
"logps/chosen": -255.5,
"logps/rejected": -305.0,
"loss": 0.2948,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.84375,
"rewards/margins": 2.9140625,
"rewards/rejected": -2.078125,
"step": 784
},
{
"epoch": 0.8683628318584071,
"grad_norm": 12.639619827270508,
"learning_rate": 2.2795590665124263e-08,
"logits/chosen": -1.2421875,
"logits/rejected": -1.09375,
"logps/chosen": -235.0,
"logps/rejected": -269.5,
"loss": 0.2619,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 0.7578125,
"rewards/margins": 3.171875,
"rewards/rejected": -2.421875,
"step": 785
},
{
"epoch": 0.8694690265486725,
"grad_norm": 12.34381103515625,
"learning_rate": 2.2423007937351634e-08,
"logits/chosen": -1.1953125,
"logits/rejected": -1.13671875,
"logps/chosen": -254.0,
"logps/rejected": -264.0,
"loss": 0.2839,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.521484375,
"rewards/margins": 2.8828125,
"rewards/rejected": -2.359375,
"step": 786
},
{
"epoch": 0.870575221238938,
"grad_norm": 13.31490707397461,
"learning_rate": 2.205335218732543e-08,
"logits/chosen": -1.2734375,
"logits/rejected": -1.21875,
"logps/chosen": -259.0,
"logps/rejected": -280.0,
"loss": 0.3176,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.666015625,
"rewards/margins": 2.65625,
"rewards/rejected": -1.9921875,
"step": 787
},
{
"epoch": 0.8716814159292036,
"grad_norm": 14.77593994140625,
"learning_rate": 2.1686628169366923e-08,
"logits/chosen": -1.109375,
"logits/rejected": -1.1171875,
"logps/chosen": -266.0,
"logps/rejected": -297.0,
"loss": 0.3291,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.771484375,
"rewards/margins": 2.8671875,
"rewards/rejected": -2.1015625,
"step": 788
},
{
"epoch": 0.8727876106194691,
"grad_norm": 12.58286190032959,
"learning_rate": 2.1322840600091096e-08,
"logits/chosen": -1.265625,
"logits/rejected": -1.15625,
"logps/chosen": -249.5,
"logps/rejected": -260.0,
"loss": 0.2995,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.3896484375,
"rewards/margins": 2.6328125,
"rewards/rejected": -2.2421875,
"step": 789
},
{
"epoch": 0.8738938053097345,
"grad_norm": 13.99928092956543,
"learning_rate": 2.0961994158345763e-08,
"logits/chosen": -1.34765625,
"logits/rejected": -1.0859375,
"logps/chosen": -254.5,
"logps/rejected": -263.5,
"loss": 0.2972,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.662109375,
"rewards/margins": 2.8359375,
"rewards/rejected": -2.16796875,
"step": 790
},
{
"epoch": 0.875,
"grad_norm": 11.941873550415039,
"learning_rate": 2.0604093485151548e-08,
"logits/chosen": -1.31640625,
"logits/rejected": -1.1171875,
"logps/chosen": -261.0,
"logps/rejected": -270.0,
"loss": 0.2886,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.818359375,
"rewards/margins": 3.15625,
"rewards/rejected": -2.3359375,
"step": 791
},
{
"epoch": 0.8761061946902655,
"grad_norm": 17.870344161987305,
"learning_rate": 2.0249143183642097e-08,
"logits/chosen": -1.3984375,
"logits/rejected": -1.1953125,
"logps/chosen": -244.0,
"logps/rejected": -268.0,
"loss": 0.4293,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.4189453125,
"rewards/margins": 2.26171875,
"rewards/rejected": -1.83984375,
"step": 792
},
{
"epoch": 0.8772123893805309,
"grad_norm": 12.3770112991333,
"learning_rate": 1.989714781900484e-08,
"logits/chosen": -1.3671875,
"logits/rejected": -1.14453125,
"logps/chosen": -264.0,
"logps/rejected": -284.0,
"loss": 0.2621,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.869140625,
"rewards/margins": 3.03125,
"rewards/rejected": -2.1640625,
"step": 793
},
{
"epoch": 0.8783185840707964,
"grad_norm": 14.804219245910645,
"learning_rate": 1.95481119184224e-08,
"logits/chosen": -1.1953125,
"logits/rejected": -1.140625,
"logps/chosen": -245.0,
"logps/rejected": -302.0,
"loss": 0.3552,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.638671875,
"rewards/margins": 2.6875,
"rewards/rejected": -2.04296875,
"step": 794
},
{
"epoch": 0.879424778761062,
"grad_norm": 13.518996238708496,
"learning_rate": 1.9202039971014243e-08,
"logits/chosen": -1.375,
"logits/rejected": -1.20703125,
"logps/chosen": -241.5,
"logps/rejected": -263.0,
"loss": 0.3375,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.69921875,
"rewards/margins": 2.734375,
"rewards/rejected": -2.03125,
"step": 795
},
{
"epoch": 0.8805309734513275,
"grad_norm": 13.753449440002441,
"learning_rate": 1.8858936427779137e-08,
"logits/chosen": -1.21875,
"logits/rejected": -1.12109375,
"logps/chosen": -262.5,
"logps/rejected": -289.0,
"loss": 0.2857,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.75,
"rewards/margins": 2.8203125,
"rewards/rejected": -2.0703125,
"step": 796
},
{
"epoch": 0.8816371681415929,
"grad_norm": 15.924559593200684,
"learning_rate": 1.8518805701537548e-08,
"logits/chosen": -1.25,
"logits/rejected": -1.09765625,
"logps/chosen": -253.5,
"logps/rejected": -264.5,
"loss": 0.3678,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.671875,
"rewards/margins": 2.640625,
"rewards/rejected": -1.96484375,
"step": 797
},
{
"epoch": 0.8827433628318584,
"grad_norm": 12.930898666381836,
"learning_rate": 1.818165216687531e-08,
"logits/chosen": -1.26171875,
"logits/rejected": -1.1640625,
"logps/chosen": -248.5,
"logps/rejected": -258.0,
"loss": 0.2994,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.732421875,
"rewards/margins": 2.9296875,
"rewards/rejected": -2.1953125,
"step": 798
},
{
"epoch": 0.8838495575221239,
"grad_norm": 14.774980545043945,
"learning_rate": 1.7847480160087025e-08,
"logits/chosen": -1.26953125,
"logits/rejected": -1.125,
"logps/chosen": -250.5,
"logps/rejected": -290.0,
"loss": 0.3111,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.724609375,
"rewards/margins": 2.953125,
"rewards/rejected": -2.234375,
"step": 799
},
{
"epoch": 0.8849557522123894,
"grad_norm": 13.716545104980469,
"learning_rate": 1.7516293979120523e-08,
"logits/chosen": -1.2578125,
"logits/rejected": -1.1328125,
"logps/chosen": -262.5,
"logps/rejected": -267.0,
"loss": 0.3317,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.5703125,
"rewards/margins": 2.640625,
"rewards/rejected": -2.0625,
"step": 800
},
{
"epoch": 0.8849557522123894,
"eval_logits/chosen": -1.2699394226074219,
"eval_logits/rejected": -1.161244511604309,
"eval_logps/chosen": -253.7014923095703,
"eval_logps/rejected": -280.33831787109375,
"eval_loss": 0.31928393244743347,
"eval_rewards/accuracies": 0.8145930171012878,
"eval_rewards/chosen": 0.7206642627716064,
"eval_rewards/margins": 2.836987018585205,
"eval_rewards/rejected": -2.1172263622283936,
"eval_runtime": 193.0847,
"eval_samples_per_second": 66.567,
"eval_steps_per_second": 1.041,
"step": 800
},
{
"epoch": 0.8860619469026548,
"grad_norm": 14.249678611755371,
"learning_rate": 1.7188097883521352e-08,
"logits/chosen": -1.28515625,
"logits/rejected": -1.171875,
"logps/chosen": -248.5,
"logps/rejected": -251.0,
"loss": 0.2843,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.751953125,
"rewards/margins": 2.984375,
"rewards/rejected": -2.234375,
"step": 801
},
{
"epoch": 0.8871681415929203,
"grad_norm": 12.044215202331543,
"learning_rate": 1.6862896094378244e-08,
"logits/chosen": -1.296875,
"logits/rejected": -1.2578125,
"logps/chosen": -236.0,
"logps/rejected": -261.5,
"loss": 0.2971,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.8515625,
"rewards/margins": 3.1171875,
"rewards/rejected": -2.265625,
"step": 802
},
{
"epoch": 0.8882743362831859,
"grad_norm": 13.170328140258789,
"learning_rate": 1.654069279426873e-08,
"logits/chosen": -1.21484375,
"logits/rejected": -1.125,
"logps/chosen": -255.5,
"logps/rejected": -300.0,
"loss": 0.2789,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.017578125,
"rewards/margins": 3.1328125,
"rewards/rejected": -2.1171875,
"step": 803
},
{
"epoch": 0.8893805309734514,
"grad_norm": 13.188612937927246,
"learning_rate": 1.6221492127205166e-08,
"logits/chosen": -1.26953125,
"logits/rejected": -1.24609375,
"logps/chosen": -269.0,
"logps/rejected": -293.0,
"loss": 0.2959,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.900390625,
"rewards/margins": 2.8125,
"rewards/rejected": -1.921875,
"step": 804
},
{
"epoch": 0.8904867256637168,
"grad_norm": 17.259361267089844,
"learning_rate": 1.5905298198581774e-08,
"logits/chosen": -1.25,
"logits/rejected": -1.171875,
"logps/chosen": -271.0,
"logps/rejected": -301.0,
"loss": 0.3979,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 0.814453125,
"rewards/margins": 2.4921875,
"rewards/rejected": -1.6796875,
"step": 805
},
{
"epoch": 0.8915929203539823,
"grad_norm": 13.314188957214355,
"learning_rate": 1.5592115075121508e-08,
"logits/chosen": -1.3203125,
"logits/rejected": -1.15234375,
"logps/chosen": -250.5,
"logps/rejected": -294.0,
"loss": 0.3297,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.822265625,
"rewards/margins": 2.546875,
"rewards/rejected": -1.7265625,
"step": 806
},
{
"epoch": 0.8926991150442478,
"grad_norm": 12.087140083312988,
"learning_rate": 1.5281946784824002e-08,
"logits/chosen": -1.33203125,
"logits/rejected": -1.15625,
"logps/chosen": -250.0,
"logps/rejected": -297.0,
"loss": 0.2368,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 0.66015625,
"rewards/margins": 3.1953125,
"rewards/rejected": -2.5234375,
"step": 807
},
{
"epoch": 0.8938053097345132,
"grad_norm": 15.906932830810547,
"learning_rate": 1.4974797316913673e-08,
"logits/chosen": -1.28125,
"logits/rejected": -1.15625,
"logps/chosen": -287.0,
"logps/rejected": -302.0,
"loss": 0.3282,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.671875,
"rewards/margins": 2.625,
"rewards/rejected": -1.9609375,
"step": 808
},
{
"epoch": 0.8949115044247787,
"grad_norm": 13.53934383392334,
"learning_rate": 1.4670670621788229e-08,
"logits/chosen": -1.1328125,
"logits/rejected": -1.1015625,
"logps/chosen": -266.0,
"logps/rejected": -293.0,
"loss": 0.2885,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 1.05859375,
"rewards/margins": 3.359375,
"rewards/rejected": -2.3046875,
"step": 809
},
{
"epoch": 0.8960176991150443,
"grad_norm": 13.705190658569336,
"learning_rate": 1.4369570610968274e-08,
"logits/chosen": -1.3515625,
"logits/rejected": -1.17578125,
"logps/chosen": -250.0,
"logps/rejected": -271.0,
"loss": 0.346,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.693359375,
"rewards/margins": 2.6171875,
"rewards/rejected": -1.92578125,
"step": 810
},
{
"epoch": 0.8971238938053098,
"grad_norm": 12.113191604614258,
"learning_rate": 1.4071501157046666e-08,
"logits/chosen": -1.19921875,
"logits/rejected": -1.1328125,
"logps/chosen": -256.0,
"logps/rejected": -274.5,
"loss": 0.2523,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.84375,
"rewards/margins": 3.1171875,
"rewards/rejected": -2.27734375,
"step": 811
},
{
"epoch": 0.8982300884955752,
"grad_norm": 13.494206428527832,
"learning_rate": 1.3776466093638695e-08,
"logits/chosen": -1.234375,
"logits/rejected": -1.0546875,
"logps/chosen": -241.0,
"logps/rejected": -272.0,
"loss": 0.2704,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.990234375,
"rewards/margins": 3.4140625,
"rewards/rejected": -2.4296875,
"step": 812
},
{
"epoch": 0.8993362831858407,
"grad_norm": 11.447568893432617,
"learning_rate": 1.3484469215333082e-08,
"logits/chosen": -1.34765625,
"logits/rejected": -1.19921875,
"logps/chosen": -252.5,
"logps/rejected": -244.0,
"loss": 0.257,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.6953125,
"rewards/margins": 3.109375,
"rewards/rejected": -2.4140625,
"step": 813
},
{
"epoch": 0.9004424778761062,
"grad_norm": 17.08716583251953,
"learning_rate": 1.3195514277642817e-08,
"logits/chosen": -1.41015625,
"logits/rejected": -1.3125,
"logps/chosen": -264.0,
"logps/rejected": -257.5,
"loss": 0.4052,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 0.4658203125,
"rewards/margins": 2.4140625,
"rewards/rejected": -1.94921875,
"step": 814
},
{
"epoch": 0.9015486725663717,
"grad_norm": 12.849235534667969,
"learning_rate": 1.2909604996957091e-08,
"logits/chosen": -1.2890625,
"logits/rejected": -1.14453125,
"logps/chosen": -252.5,
"logps/rejected": -279.0,
"loss": 0.2986,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.75,
"rewards/margins": 2.734375,
"rewards/rejected": -1.98046875,
"step": 815
},
{
"epoch": 0.9026548672566371,
"grad_norm": 16.0147647857666,
"learning_rate": 1.2626745050493493e-08,
"logits/chosen": -1.36328125,
"logits/rejected": -1.23828125,
"logps/chosen": -245.5,
"logps/rejected": -296.0,
"loss": 0.3544,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.650390625,
"rewards/margins": 2.578125,
"rewards/rejected": -1.93359375,
"step": 816
},
{
"epoch": 0.9037610619469026,
"grad_norm": 12.192747116088867,
"learning_rate": 1.234693807625048e-08,
"logits/chosen": -1.234375,
"logits/rejected": -1.21875,
"logps/chosen": -256.0,
"logps/rejected": -277.0,
"loss": 0.2743,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.802734375,
"rewards/margins": 3.2421875,
"rewards/rejected": -2.4296875,
"step": 817
},
{
"epoch": 0.9048672566371682,
"grad_norm": 19.156158447265625,
"learning_rate": 1.2070187672960947e-08,
"logits/chosen": -1.4296875,
"logits/rejected": -1.125,
"logps/chosen": -261.0,
"logps/rejected": -283.0,
"loss": 0.4435,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.70703125,
"rewards/margins": 2.515625,
"rewards/rejected": -1.8125,
"step": 818
},
{
"epoch": 0.9059734513274337,
"grad_norm": 14.084782600402832,
"learning_rate": 1.179649740004557e-08,
"logits/chosen": -1.24609375,
"logits/rejected": -1.0703125,
"logps/chosen": -273.0,
"logps/rejected": -272.5,
"loss": 0.2877,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.4453125,
"rewards/margins": 2.546875,
"rewards/rejected": -2.1015625,
"step": 819
},
{
"epoch": 0.9070796460176991,
"grad_norm": 14.487624168395996,
"learning_rate": 1.1525870777567393e-08,
"logits/chosen": -1.234375,
"logits/rejected": -1.16015625,
"logps/chosen": -273.0,
"logps/rejected": -278.0,
"loss": 0.3505,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.5,
"rewards/margins": 2.34375,
"rewards/rejected": -1.83984375,
"step": 820
},
{
"epoch": 0.9081858407079646,
"grad_norm": 13.851645469665527,
"learning_rate": 1.1258311286186207e-08,
"logits/chosen": -1.28125,
"logits/rejected": -1.1015625,
"logps/chosen": -244.0,
"logps/rejected": -292.0,
"loss": 0.2884,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.9921875,
"rewards/margins": 3.2578125,
"rewards/rejected": -2.265625,
"step": 821
},
{
"epoch": 0.9092920353982301,
"grad_norm": 13.431646347045898,
"learning_rate": 1.0993822367114047e-08,
"logits/chosen": -1.359375,
"logits/rejected": -1.1640625,
"logps/chosen": -285.0,
"logps/rejected": -291.0,
"loss": 0.2858,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.703125,
"rewards/margins": 2.6796875,
"rewards/rejected": -1.96875,
"step": 822
},
{
"epoch": 0.9103982300884956,
"grad_norm": 15.518174171447754,
"learning_rate": 1.0732407422070794e-08,
"logits/chosen": -1.3359375,
"logits/rejected": -1.19140625,
"logps/chosen": -230.5,
"logps/rejected": -270.0,
"loss": 0.3882,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.5517578125,
"rewards/margins": 2.5546875,
"rewards/rejected": -1.99609375,
"step": 823
},
{
"epoch": 0.911504424778761,
"grad_norm": 12.571428298950195,
"learning_rate": 1.0474069813240505e-08,
"logits/chosen": -1.23046875,
"logits/rejected": -1.1328125,
"logps/chosen": -241.0,
"logps/rejected": -290.0,
"loss": 0.3193,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.53515625,
"rewards/margins": 2.59375,
"rewards/rejected": -2.0546875,
"step": 824
},
{
"epoch": 0.9126106194690266,
"grad_norm": 14.974266052246094,
"learning_rate": 1.021881286322801e-08,
"logits/chosen": -1.2421875,
"logits/rejected": -1.109375,
"logps/chosen": -264.5,
"logps/rejected": -283.0,
"loss": 0.3549,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.5546875,
"rewards/margins": 2.23828125,
"rewards/rejected": -1.6796875,
"step": 825
},
{
"epoch": 0.9137168141592921,
"grad_norm": 12.049909591674805,
"learning_rate": 9.966639855016446e-09,
"logits/chosen": -1.3984375,
"logits/rejected": -1.203125,
"logps/chosen": -238.0,
"logps/rejected": -257.0,
"loss": 0.2548,
"rewards/accuracies": 0.890625,
"rewards/chosen": 0.775390625,
"rewards/margins": 3.203125,
"rewards/rejected": -2.4296875,
"step": 826
},
{
"epoch": 0.9148230088495575,
"grad_norm": 16.12934112548828,
"learning_rate": 9.71755403192484e-09,
"logits/chosen": -1.27734375,
"logits/rejected": -1.12890625,
"logps/chosen": -274.0,
"logps/rejected": -281.0,
"loss": 0.3717,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.437744140625,
"rewards/margins": 2.4609375,
"rewards/rejected": -2.0234375,
"step": 827
},
{
"epoch": 0.915929203539823,
"grad_norm": 15.575227737426758,
"learning_rate": 9.47155859756632e-09,
"logits/chosen": -1.34765625,
"logits/rejected": -1.2265625,
"logps/chosen": -244.0,
"logps/rejected": -277.0,
"loss": 0.3755,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.701171875,
"rewards/margins": 2.8046875,
"rewards/rejected": -2.109375,
"step": 828
},
{
"epoch": 0.9170353982300885,
"grad_norm": 13.580742835998535,
"learning_rate": 9.228656715807249e-09,
"logits/chosen": -1.2265625,
"logits/rejected": -1.125,
"logps/chosen": -264.0,
"logps/rejected": -301.0,
"loss": 0.2762,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.828125,
"rewards/margins": 3.140625,
"rewards/rejected": -2.3125,
"step": 829
},
{
"epoch": 0.918141592920354,
"grad_norm": 14.229433059692383,
"learning_rate": 8.988851510726092e-09,
"logits/chosen": -1.375,
"logits/rejected": -1.109375,
"logps/chosen": -269.0,
"logps/rejected": -276.0,
"loss": 0.2769,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.5478515625,
"rewards/margins": 2.8828125,
"rewards/rejected": -2.3359375,
"step": 830
},
{
"epoch": 0.9192477876106194,
"grad_norm": 12.20298957824707,
"learning_rate": 8.752146066573597e-09,
"logits/chosen": -1.171875,
"logits/rejected": -1.1484375,
"logps/chosen": -254.0,
"logps/rejected": -291.0,
"loss": 0.2699,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.734375,
"rewards/margins": 2.8125,
"rewards/rejected": -2.08203125,
"step": 831
},
{
"epoch": 0.9203539823008849,
"grad_norm": 14.036704063415527,
"learning_rate": 8.518543427732949e-09,
"logits/chosen": -1.296875,
"logits/rejected": -1.1171875,
"logps/chosen": -265.0,
"logps/rejected": -267.0,
"loss": 0.3249,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.578125,
"rewards/margins": 2.5625,
"rewards/rejected": -1.98046875,
"step": 832
},
{
"epoch": 0.9214601769911505,
"grad_norm": 12.48025131225586,
"learning_rate": 8.288046598680627e-09,
"logits/chosen": -1.234375,
"logits/rejected": -1.12109375,
"logps/chosen": -260.0,
"logps/rejected": -268.0,
"loss": 0.2814,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.66796875,
"rewards/margins": 3.0859375,
"rewards/rejected": -2.421875,
"step": 833
},
{
"epoch": 0.922566371681416,
"grad_norm": 12.8703031539917,
"learning_rate": 8.060658543947829e-09,
"logits/chosen": -1.2890625,
"logits/rejected": -1.30078125,
"logps/chosen": -223.0,
"logps/rejected": -258.5,
"loss": 0.2808,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.8125,
"rewards/margins": 2.953125,
"rewards/rejected": -2.1484375,
"step": 834
},
{
"epoch": 0.9236725663716814,
"grad_norm": 13.693394660949707,
"learning_rate": 7.836382188082302e-09,
"logits/chosen": -1.234375,
"logits/rejected": -1.203125,
"logps/chosen": -264.0,
"logps/rejected": -289.0,
"loss": 0.2979,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.48828125,
"rewards/margins": 2.7421875,
"rewards/rejected": -2.2421875,
"step": 835
},
{
"epoch": 0.9247787610619469,
"grad_norm": 12.683737754821777,
"learning_rate": 7.61522041561069e-09,
"logits/chosen": -1.30859375,
"logits/rejected": -1.11328125,
"logps/chosen": -246.0,
"logps/rejected": -265.0,
"loss": 0.2762,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.81640625,
"rewards/margins": 2.859375,
"rewards/rejected": -2.046875,
"step": 836
},
{
"epoch": 0.9258849557522124,
"grad_norm": 15.07400894165039,
"learning_rate": 7.397176071001543e-09,
"logits/chosen": -1.35546875,
"logits/rejected": -1.1796875,
"logps/chosen": -251.0,
"logps/rejected": -267.0,
"loss": 0.3266,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.708984375,
"rewards/margins": 2.6171875,
"rewards/rejected": -1.90625,
"step": 837
},
{
"epoch": 0.9269911504424779,
"grad_norm": 12.571556091308594,
"learning_rate": 7.182251958628538e-09,
"logits/chosen": -1.33984375,
"logits/rejected": -1.21875,
"logps/chosen": -236.5,
"logps/rejected": -259.0,
"loss": 0.2943,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.5966796875,
"rewards/margins": 2.7109375,
"rewards/rejected": -2.12109375,
"step": 838
},
{
"epoch": 0.9280973451327433,
"grad_norm": 12.665489196777344,
"learning_rate": 6.970450842734649e-09,
"logits/chosen": -1.375,
"logits/rejected": -1.171875,
"logps/chosen": -260.0,
"logps/rejected": -276.0,
"loss": 0.2713,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.7734375,
"rewards/margins": 3.0078125,
"rewards/rejected": -2.234375,
"step": 839
},
{
"epoch": 0.9292035398230089,
"grad_norm": 15.426192283630371,
"learning_rate": 6.761775447396506e-09,
"logits/chosen": -1.26171875,
"logits/rejected": -1.203125,
"logps/chosen": -244.0,
"logps/rejected": -297.0,
"loss": 0.3234,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.791015625,
"rewards/margins": 3.015625,
"rewards/rejected": -2.234375,
"step": 840
},
{
"epoch": 0.9303097345132744,
"grad_norm": 12.329756736755371,
"learning_rate": 6.556228456489232e-09,
"logits/chosen": -1.1875,
"logits/rejected": -1.0859375,
"logps/chosen": -253.5,
"logps/rejected": -280.0,
"loss": 0.2926,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.6484375,
"rewards/margins": 2.9453125,
"rewards/rejected": -2.296875,
"step": 841
},
{
"epoch": 0.9314159292035398,
"grad_norm": 16.28516387939453,
"learning_rate": 6.353812513652052e-09,
"logits/chosen": -1.2109375,
"logits/rejected": -1.10546875,
"logps/chosen": -260.0,
"logps/rejected": -282.0,
"loss": 0.3844,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 0.529296875,
"rewards/margins": 2.4453125,
"rewards/rejected": -1.9140625,
"step": 842
},
{
"epoch": 0.9325221238938053,
"grad_norm": 16.69934844970703,
"learning_rate": 6.154530222254372e-09,
"logits/chosen": -1.25390625,
"logits/rejected": -1.21875,
"logps/chosen": -245.5,
"logps/rejected": -280.0,
"loss": 0.3776,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.6630859375,
"rewards/margins": 2.53125,
"rewards/rejected": -1.8671875,
"step": 843
},
{
"epoch": 0.9336283185840708,
"grad_norm": 15.312355995178223,
"learning_rate": 5.958384145362038e-09,
"logits/chosen": -1.27734375,
"logits/rejected": -1.1796875,
"logps/chosen": -267.5,
"logps/rejected": -304.0,
"loss": 0.3446,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.619140625,
"rewards/margins": 2.7265625,
"rewards/rejected": -2.109375,
"step": 844
},
{
"epoch": 0.9347345132743363,
"grad_norm": 13.851134300231934,
"learning_rate": 5.765376805704575e-09,
"logits/chosen": -1.296875,
"logits/rejected": -1.16015625,
"logps/chosen": -242.5,
"logps/rejected": -286.0,
"loss": 0.312,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.826171875,
"rewards/margins": 2.78125,
"rewards/rejected": -1.953125,
"step": 845
},
{
"epoch": 0.9358407079646017,
"grad_norm": 12.364534378051758,
"learning_rate": 5.575510685642798e-09,
"logits/chosen": -1.1328125,
"logits/rejected": -1.1875,
"logps/chosen": -265.0,
"logps/rejected": -298.0,
"loss": 0.2532,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 1.015625,
"rewards/margins": 3.3671875,
"rewards/rejected": -2.3515625,
"step": 846
},
{
"epoch": 0.9369469026548672,
"grad_norm": 15.209588050842285,
"learning_rate": 5.38878822713662e-09,
"logits/chosen": -1.25390625,
"logits/rejected": -1.10546875,
"logps/chosen": -279.0,
"logps/rejected": -300.0,
"loss": 0.3528,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.505859375,
"rewards/margins": 2.6796875,
"rewards/rejected": -2.1796875,
"step": 847
},
{
"epoch": 0.9380530973451328,
"grad_norm": 13.730789184570312,
"learning_rate": 5.205211831713935e-09,
"logits/chosen": -1.37109375,
"logits/rejected": -1.125,
"logps/chosen": -239.5,
"logps/rejected": -240.0,
"loss": 0.3282,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.6494140625,
"rewards/margins": 2.921875,
"rewards/rejected": -2.2734375,
"step": 848
},
{
"epoch": 0.9391592920353983,
"grad_norm": 13.921919822692871,
"learning_rate": 5.024783860439474e-09,
"logits/chosen": -1.28125,
"logits/rejected": -1.08203125,
"logps/chosen": -228.0,
"logps/rejected": -262.0,
"loss": 0.3565,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 0.517578125,
"rewards/margins": 2.5703125,
"rewards/rejected": -2.046875,
"step": 849
},
{
"epoch": 0.9402654867256637,
"grad_norm": 15.472764015197754,
"learning_rate": 4.8475066338846685e-09,
"logits/chosen": -1.3515625,
"logits/rejected": -1.15234375,
"logps/chosen": -252.0,
"logps/rejected": -282.0,
"loss": 0.3386,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.658203125,
"rewards/margins": 2.8203125,
"rewards/rejected": -2.1640625,
"step": 850
},
{
"epoch": 0.9402654867256637,
"eval_logits/chosen": -1.2672574520111084,
"eval_logits/rejected": -1.1583489179611206,
"eval_logps/chosen": -253.592041015625,
"eval_logps/rejected": -280.3034973144531,
"eval_loss": 0.31901347637176514,
"eval_rewards/accuracies": 0.8145152926445007,
"eval_rewards/chosen": 0.7268248796463013,
"eval_rewards/margins": 2.8418843746185303,
"eval_rewards/rejected": -2.1149721145629883,
"eval_runtime": 192.9475,
"eval_samples_per_second": 66.614,
"eval_steps_per_second": 1.042,
"step": 850
},
{
"epoch": 0.9413716814159292,
"grad_norm": 13.594935417175293,
"learning_rate": 4.673382432097667e-09,
"logits/chosen": -1.3515625,
"logits/rejected": -1.23046875,
"logps/chosen": -256.0,
"logps/rejected": -263.0,
"loss": 0.3324,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.6171875,
"rewards/margins": 2.7265625,
"rewards/rejected": -2.1171875,
"step": 851
},
{
"epoch": 0.9424778761061947,
"grad_norm": 14.526602745056152,
"learning_rate": 4.5024134945740036e-09,
"logits/chosen": -1.3203125,
"logits/rejected": -1.22265625,
"logps/chosen": -229.5,
"logps/rejected": -244.5,
"loss": 0.3492,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.546875,
"rewards/margins": 2.6640625,
"rewards/rejected": -2.109375,
"step": 852
},
{
"epoch": 0.9435840707964602,
"grad_norm": 16.662525177001953,
"learning_rate": 4.334602020227867e-09,
"logits/chosen": -1.34375,
"logits/rejected": -1.15625,
"logps/chosen": -285.0,
"logps/rejected": -290.0,
"loss": 0.3672,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.4638671875,
"rewards/margins": 2.3984375,
"rewards/rejected": -1.94140625,
"step": 853
},
{
"epoch": 0.9446902654867256,
"grad_norm": 14.094331741333008,
"learning_rate": 4.169950167363767e-09,
"logits/chosen": -1.265625,
"logits/rejected": -1.0625,
"logps/chosen": -263.0,
"logps/rejected": -297.0,
"loss": 0.3088,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.84765625,
"rewards/margins": 2.9375,
"rewards/rejected": -2.09375,
"step": 854
},
{
"epoch": 0.9457964601769911,
"grad_norm": 14.868205070495605,
"learning_rate": 4.0084600536488265e-09,
"logits/chosen": -1.38671875,
"logits/rejected": -1.17578125,
"logps/chosen": -238.0,
"logps/rejected": -290.0,
"loss": 0.3156,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.8359375,
"rewards/margins": 2.8125,
"rewards/rejected": -1.9765625,
"step": 855
},
{
"epoch": 0.9469026548672567,
"grad_norm": 13.155553817749023,
"learning_rate": 3.850133756085505e-09,
"logits/chosen": -1.31640625,
"logits/rejected": -1.15234375,
"logps/chosen": -270.0,
"logps/rejected": -290.0,
"loss": 0.3135,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.7265625,
"rewards/margins": 2.7890625,
"rewards/rejected": -2.0625,
"step": 856
},
{
"epoch": 0.9480088495575221,
"grad_norm": 13.842921257019043,
"learning_rate": 3.694973310984839e-09,
"logits/chosen": -1.359375,
"logits/rejected": -1.15625,
"logps/chosen": -258.0,
"logps/rejected": -281.0,
"loss": 0.3115,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.759765625,
"rewards/margins": 2.6171875,
"rewards/rejected": -1.859375,
"step": 857
},
{
"epoch": 0.9491150442477876,
"grad_norm": 13.213567733764648,
"learning_rate": 3.5429807139403524e-09,
"logits/chosen": -1.22265625,
"logits/rejected": -1.048828125,
"logps/chosen": -243.0,
"logps/rejected": -300.0,
"loss": 0.2749,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.87890625,
"rewards/margins": 3.2109375,
"rewards/rejected": -2.328125,
"step": 858
},
{
"epoch": 0.9502212389380531,
"grad_norm": 11.955760955810547,
"learning_rate": 3.3941579198023816e-09,
"logits/chosen": -1.484375,
"logits/rejected": -1.13671875,
"logps/chosen": -218.0,
"logps/rejected": -260.0,
"loss": 0.2961,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.8046875,
"rewards/margins": 2.7421875,
"rewards/rejected": -1.93359375,
"step": 859
},
{
"epoch": 0.9513274336283186,
"grad_norm": 13.337422370910645,
"learning_rate": 3.248506842652793e-09,
"logits/chosen": -1.2578125,
"logits/rejected": -1.12109375,
"logps/chosen": -249.0,
"logps/rejected": -309.0,
"loss": 0.2853,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.8349609375,
"rewards/margins": 3.078125,
"rewards/rejected": -2.25,
"step": 860
},
{
"epoch": 0.952433628318584,
"grad_norm": 12.912832260131836,
"learning_rate": 3.106029355780582e-09,
"logits/chosen": -1.234375,
"logits/rejected": -1.15234375,
"logps/chosen": -271.0,
"logps/rejected": -282.0,
"loss": 0.3052,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.447265625,
"rewards/margins": 2.65625,
"rewards/rejected": -2.20703125,
"step": 861
},
{
"epoch": 0.9535398230088495,
"grad_norm": 14.942134857177734,
"learning_rate": 2.9667272916575337e-09,
"logits/chosen": -1.20703125,
"logits/rejected": -1.04296875,
"logps/chosen": -247.5,
"logps/rejected": -279.0,
"loss": 0.3356,
"rewards/accuracies": 0.734375,
"rewards/chosen": 0.69921875,
"rewards/margins": 2.8671875,
"rewards/rejected": -2.1640625,
"step": 862
},
{
"epoch": 0.9546460176991151,
"grad_norm": 11.314682960510254,
"learning_rate": 2.830602441914881e-09,
"logits/chosen": -1.23828125,
"logits/rejected": -1.1875,
"logps/chosen": -263.0,
"logps/rejected": -277.0,
"loss": 0.2615,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.80859375,
"rewards/margins": 3.1953125,
"rewards/rejected": -2.390625,
"step": 863
},
{
"epoch": 0.9557522123893806,
"grad_norm": 13.024490356445312,
"learning_rate": 2.6976565573202102e-09,
"logits/chosen": -1.23828125,
"logits/rejected": -1.23046875,
"logps/chosen": -249.0,
"logps/rejected": -275.0,
"loss": 0.2961,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.734375,
"rewards/margins": 2.7890625,
"rewards/rejected": -2.0546875,
"step": 864
},
{
"epoch": 0.956858407079646,
"grad_norm": 13.718770980834961,
"learning_rate": 2.5678913477547302e-09,
"logits/chosen": -1.39453125,
"logits/rejected": -1.1796875,
"logps/chosen": -274.0,
"logps/rejected": -312.0,
"loss": 0.2869,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.853515625,
"rewards/margins": 2.7890625,
"rewards/rejected": -1.9296875,
"step": 865
},
{
"epoch": 0.9579646017699115,
"grad_norm": 13.562867164611816,
"learning_rate": 2.441308482191623e-09,
"logits/chosen": -1.12890625,
"logits/rejected": -1.0078125,
"logps/chosen": -252.5,
"logps/rejected": -291.0,
"loss": 0.3117,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.845703125,
"rewards/margins": 2.8359375,
"rewards/rejected": -1.98828125,
"step": 866
},
{
"epoch": 0.959070796460177,
"grad_norm": 13.698179244995117,
"learning_rate": 2.3179095886743384e-09,
"logits/chosen": -1.2890625,
"logits/rejected": -1.234375,
"logps/chosen": -230.5,
"logps/rejected": -266.5,
"loss": 0.3103,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.9375,
"rewards/margins": 3.1484375,
"rewards/rejected": -2.20703125,
"step": 867
},
{
"epoch": 0.9601769911504425,
"grad_norm": 13.498557090759277,
"learning_rate": 2.1976962542956945e-09,
"logits/chosen": -1.2890625,
"logits/rejected": -1.22265625,
"logps/chosen": -245.5,
"logps/rejected": -281.0,
"loss": 0.3036,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.935546875,
"rewards/margins": 2.9453125,
"rewards/rejected": -2.0,
"step": 868
},
{
"epoch": 0.9612831858407079,
"grad_norm": 12.500775337219238,
"learning_rate": 2.0806700251775055e-09,
"logits/chosen": -1.296875,
"logits/rejected": -1.171875,
"logps/chosen": -232.5,
"logps/rejected": -262.0,
"loss": 0.2973,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.724609375,
"rewards/margins": 2.953125,
"rewards/rejected": -2.2265625,
"step": 869
},
{
"epoch": 0.9623893805309734,
"grad_norm": 13.277873992919922,
"learning_rate": 1.966832406450708e-09,
"logits/chosen": -1.3828125,
"logits/rejected": -1.15234375,
"logps/chosen": -232.0,
"logps/rejected": -260.0,
"loss": 0.3434,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.5185546875,
"rewards/margins": 2.3984375,
"rewards/rejected": -1.87890625,
"step": 870
},
{
"epoch": 0.963495575221239,
"grad_norm": 14.751419067382812,
"learning_rate": 1.85618486223596e-09,
"logits/chosen": -1.234375,
"logits/rejected": -1.1484375,
"logps/chosen": -269.0,
"logps/rejected": -291.0,
"loss": 0.3578,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.4072265625,
"rewards/margins": 2.484375,
"rewards/rejected": -2.07421875,
"step": 871
},
{
"epoch": 0.9646017699115044,
"grad_norm": 16.29852294921875,
"learning_rate": 1.748728815624878e-09,
"logits/chosen": -1.328125,
"logits/rejected": -1.109375,
"logps/chosen": -273.0,
"logps/rejected": -264.0,
"loss": 0.3518,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.50927734375,
"rewards/margins": 2.734375,
"rewards/rejected": -2.2265625,
"step": 872
},
{
"epoch": 0.9657079646017699,
"grad_norm": 16.783334732055664,
"learning_rate": 1.6444656486615805e-09,
"logits/chosen": -1.1328125,
"logits/rejected": -1.07421875,
"logps/chosen": -287.0,
"logps/rejected": -307.0,
"loss": 0.3656,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.478515625,
"rewards/margins": 2.4765625,
"rewards/rejected": -2.00390625,
"step": 873
},
{
"epoch": 0.9668141592920354,
"grad_norm": 16.244199752807617,
"learning_rate": 1.5433967023250894e-09,
"logits/chosen": -1.37109375,
"logits/rejected": -1.1015625,
"logps/chosen": -275.0,
"logps/rejected": -317.0,
"loss": 0.3542,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.7353515625,
"rewards/margins": 2.8359375,
"rewards/rejected": -2.09375,
"step": 874
},
{
"epoch": 0.9679203539823009,
"grad_norm": 13.663660049438477,
"learning_rate": 1.4455232765120396e-09,
"logits/chosen": -1.3359375,
"logits/rejected": -1.22265625,
"logps/chosen": -244.5,
"logps/rejected": -268.0,
"loss": 0.3567,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.701171875,
"rewards/margins": 2.625,
"rewards/rejected": -1.921875,
"step": 875
},
{
"epoch": 0.9690265486725663,
"grad_norm": 12.790926933288574,
"learning_rate": 1.3508466300198306e-09,
"logits/chosen": -1.4296875,
"logits/rejected": -1.21875,
"logps/chosen": -232.5,
"logps/rejected": -262.0,
"loss": 0.3053,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.685546875,
"rewards/margins": 2.921875,
"rewards/rejected": -2.234375,
"step": 876
},
{
"epoch": 0.9701327433628318,
"grad_norm": 15.329063415527344,
"learning_rate": 1.2593679805306401e-09,
"logits/chosen": -1.20703125,
"logits/rejected": -1.20703125,
"logps/chosen": -254.5,
"logps/rejected": -278.0,
"loss": 0.3161,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.87890625,
"rewards/margins": 2.9609375,
"rewards/rejected": -2.07421875,
"step": 877
},
{
"epoch": 0.9712389380530974,
"grad_norm": 15.826077461242676,
"learning_rate": 1.1710885045956021e-09,
"logits/chosen": -1.41015625,
"logits/rejected": -1.26953125,
"logps/chosen": -257.5,
"logps/rejected": -281.0,
"loss": 0.3719,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.478515625,
"rewards/margins": 2.2734375,
"rewards/rejected": -1.79296875,
"step": 878
},
{
"epoch": 0.9723451327433629,
"grad_norm": 15.952284812927246,
"learning_rate": 1.0860093376197642e-09,
"logits/chosen": -1.28125,
"logits/rejected": -1.05078125,
"logps/chosen": -260.0,
"logps/rejected": -289.0,
"loss": 0.3437,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.73828125,
"rewards/margins": 2.859375,
"rewards/rejected": -2.1171875,
"step": 879
},
{
"epoch": 0.9734513274336283,
"grad_norm": 13.334358215332031,
"learning_rate": 1.0041315738474055e-09,
"logits/chosen": -1.203125,
"logits/rejected": -1.0859375,
"logps/chosen": -261.5,
"logps/rejected": -312.0,
"loss": 0.2845,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 0.869140625,
"rewards/margins": 3.125,
"rewards/rejected": -2.265625,
"step": 880
},
{
"epoch": 0.9745575221238938,
"grad_norm": 11.215107917785645,
"learning_rate": 9.254562663480458e-10,
"logits/chosen": -1.3125,
"logits/rejected": -1.2265625,
"logps/chosen": -241.5,
"logps/rejected": -287.0,
"loss": 0.2595,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 1.03125,
"rewards/margins": 3.1484375,
"rewards/rejected": -2.125,
"step": 881
},
{
"epoch": 0.9756637168141593,
"grad_norm": 13.879293441772461,
"learning_rate": 8.499844270028755e-10,
"logits/chosen": -1.3046875,
"logits/rejected": -1.10546875,
"logps/chosen": -250.5,
"logps/rejected": -267.5,
"loss": 0.3143,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.494140625,
"rewards/margins": 2.84375,
"rewards/rejected": -2.3515625,
"step": 882
},
{
"epoch": 0.9767699115044248,
"grad_norm": 425.59161376953125,
"learning_rate": 7.777170264917365e-10,
"logits/chosen": -1.2421875,
"logits/rejected": -1.0703125,
"logps/chosen": -260.0,
"logps/rejected": -347.0,
"loss": 0.4257,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.646484375,
"rewards/margins": 2.890625,
"rewards/rejected": -2.2421875,
"step": 883
},
{
"epoch": 0.9778761061946902,
"grad_norm": 25.028003692626953,
"learning_rate": 7.086549942805498e-10,
"logits/chosen": -1.19921875,
"logits/rejected": -1.11328125,
"logps/chosen": -285.0,
"logps/rejected": -282.0,
"loss": 0.3772,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.56640625,
"rewards/margins": 2.359375,
"rewards/rejected": -1.79296875,
"step": 884
},
{
"epoch": 0.9789823008849557,
"grad_norm": 15.059175491333008,
"learning_rate": 6.427992186095744e-10,
"logits/chosen": -1.28515625,
"logits/rejected": -1.19140625,
"logps/chosen": -228.0,
"logps/rejected": -271.0,
"loss": 0.3026,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.94921875,
"rewards/margins": 3.140625,
"rewards/rejected": -2.1953125,
"step": 885
},
{
"epoch": 0.9800884955752213,
"grad_norm": 12.722869873046875,
"learning_rate": 5.801505464817502e-10,
"logits/chosen": -1.171875,
"logits/rejected": -1.08984375,
"logps/chosen": -244.0,
"logps/rejected": -279.0,
"loss": 0.3066,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.61328125,
"rewards/margins": 2.6640625,
"rewards/rejected": -2.0546875,
"step": 886
},
{
"epoch": 0.9811946902654868,
"grad_norm": 13.964948654174805,
"learning_rate": 5.207097836519569e-10,
"logits/chosen": -1.2421875,
"logits/rejected": -1.1171875,
"logps/chosen": -249.5,
"logps/rejected": -287.0,
"loss": 0.3159,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.8046875,
"rewards/margins": 2.953125,
"rewards/rejected": -2.1484375,
"step": 887
},
{
"epoch": 0.9823008849557522,
"grad_norm": 13.418638229370117,
"learning_rate": 4.644776946165774e-10,
"logits/chosen": -1.2734375,
"logits/rejected": -1.1484375,
"logps/chosen": -246.0,
"logps/rejected": -253.5,
"loss": 0.3351,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.580078125,
"rewards/margins": 2.7734375,
"rewards/rejected": -2.1953125,
"step": 888
},
{
"epoch": 0.9834070796460177,
"grad_norm": 11.94414234161377,
"learning_rate": 4.114550026037278e-10,
"logits/chosen": -1.30078125,
"logits/rejected": -1.10546875,
"logps/chosen": -237.0,
"logps/rejected": -285.0,
"loss": 0.2559,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.701171875,
"rewards/margins": 3.1484375,
"rewards/rejected": -2.453125,
"step": 889
},
{
"epoch": 0.9845132743362832,
"grad_norm": 14.505678176879883,
"learning_rate": 3.6164238956384876e-10,
"logits/chosen": -1.21484375,
"logits/rejected": -1.26953125,
"logps/chosen": -248.5,
"logps/rejected": -281.0,
"loss": 0.2998,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.876953125,
"rewards/margins": 2.875,
"rewards/rejected": -2.00390625,
"step": 890
},
{
"epoch": 0.9856194690265486,
"grad_norm": 12.155240058898926,
"learning_rate": 3.150404961611008e-10,
"logits/chosen": -1.234375,
"logits/rejected": -1.140625,
"logps/chosen": -240.0,
"logps/rejected": -276.0,
"loss": 0.2918,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.685546875,
"rewards/margins": 3.0078125,
"rewards/rejected": -2.3203125,
"step": 891
},
{
"epoch": 0.9867256637168141,
"grad_norm": 13.752731323242188,
"learning_rate": 2.716499217649271e-10,
"logits/chosen": -1.2109375,
"logits/rejected": -1.1640625,
"logps/chosen": -241.5,
"logps/rejected": -277.0,
"loss": 0.3461,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.638671875,
"rewards/margins": 2.6015625,
"rewards/rejected": -1.9609375,
"step": 892
},
{
"epoch": 0.9878318584070797,
"grad_norm": 14.712821960449219,
"learning_rate": 2.3147122444250323e-10,
"logits/chosen": -1.2265625,
"logits/rejected": -1.15234375,
"logps/chosen": -242.0,
"logps/rejected": -274.0,
"loss": 0.3957,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.4921875,
"rewards/margins": 2.5234375,
"rewards/rejected": -2.03125,
"step": 893
},
{
"epoch": 0.9889380530973452,
"grad_norm": 13.806950569152832,
"learning_rate": 1.9450492095149373e-10,
"logits/chosen": -1.27734375,
"logits/rejected": -1.06640625,
"logps/chosen": -251.0,
"logps/rejected": -282.0,
"loss": 0.3152,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.84375,
"rewards/margins": 2.9921875,
"rewards/rejected": -2.1484375,
"step": 894
},
{
"epoch": 0.9900442477876106,
"grad_norm": 13.336440086364746,
"learning_rate": 1.607514867333626e-10,
"logits/chosen": -1.17578125,
"logits/rejected": -1.0546875,
"logps/chosen": -273.5,
"logps/rejected": -280.0,
"loss": 0.3012,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.646484375,
"rewards/margins": 2.7578125,
"rewards/rejected": -2.1015625,
"step": 895
},
{
"epoch": 0.9911504424778761,
"grad_norm": 14.591585159301758,
"learning_rate": 1.3021135590740583e-10,
"logits/chosen": -1.30078125,
"logits/rejected": -1.10546875,
"logps/chosen": -255.0,
"logps/rejected": -281.0,
"loss": 0.356,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.66015625,
"rewards/margins": 2.4921875,
"rewards/rejected": -1.83203125,
"step": 896
},
{
"epoch": 0.9922566371681416,
"grad_norm": 14.33768367767334,
"learning_rate": 1.028849212649785e-10,
"logits/chosen": -1.33203125,
"logits/rejected": -1.1875,
"logps/chosen": -272.0,
"logps/rejected": -264.0,
"loss": 0.3197,
"rewards/accuracies": 0.8046875,
"rewards/chosen": 0.716796875,
"rewards/margins": 2.765625,
"rewards/rejected": -2.046875,
"step": 897
},
{
"epoch": 0.9933628318584071,
"grad_norm": 14.789177894592285,
"learning_rate": 7.877253426458175e-11,
"logits/chosen": -1.2890625,
"logits/rejected": -1.125,
"logps/chosen": -253.5,
"logps/rejected": -296.0,
"loss": 0.3679,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.57421875,
"rewards/margins": 2.3984375,
"rewards/rejected": -1.82421875,
"step": 898
},
{
"epoch": 0.9944690265486725,
"grad_norm": 14.226619720458984,
"learning_rate": 5.7874505027283304e-11,
"logits/chosen": -1.265625,
"logits/rejected": -1.12890625,
"logps/chosen": -256.0,
"logps/rejected": -263.5,
"loss": 0.3177,
"rewards/accuracies": 0.8515625,
"rewards/chosen": 0.6015625,
"rewards/margins": 2.6796875,
"rewards/rejected": -2.0703125,
"step": 899
},
{
"epoch": 0.995575221238938,
"grad_norm": 14.491003036499023,
"learning_rate": 4.0191102332748364e-11,
"logits/chosen": -1.37109375,
"logits/rejected": -1.2421875,
"logps/chosen": -261.0,
"logps/rejected": -300.0,
"loss": 0.2955,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.892578125,
"rewards/margins": 2.9140625,
"rewards/rejected": -2.0234375,
"step": 900
},
{
"epoch": 0.995575221238938,
"eval_logits/chosen": -1.2664412260055542,
"eval_logits/rejected": -1.1579796075820923,
"eval_logps/chosen": -253.59701538085938,
"eval_logps/rejected": -280.3333435058594,
"eval_loss": 0.31904885172843933,
"eval_rewards/accuracies": 0.8163970708847046,
"eval_rewards/chosen": 0.7264896035194397,
"eval_rewards/margins": 2.841573476791382,
"eval_rewards/rejected": -2.1163711547851562,
"eval_runtime": 193.0253,
"eval_samples_per_second": 66.587,
"eval_steps_per_second": 1.041,
"step": 900
},
{
"epoch": 0.9966814159292036,
"grad_norm": 11.916271209716797,
"learning_rate": 2.5722553615770137e-11,
"logits/chosen": -1.31640625,
"logits/rejected": -1.11328125,
"logps/chosen": -246.0,
"logps/rejected": -269.5,
"loss": 0.2564,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.962890625,
"rewards/margins": 3.4609375,
"rewards/rejected": -2.4921875,
"step": 901
},
{
"epoch": 0.9977876106194691,
"grad_norm": 15.206621170043945,
"learning_rate": 1.4469044963355547e-11,
"logits/chosen": -1.2109375,
"logits/rejected": -1.08984375,
"logps/chosen": -250.5,
"logps/rejected": -298.0,
"loss": 0.3246,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 0.677734375,
"rewards/margins": 2.6953125,
"rewards/rejected": -2.015625,
"step": 902
},
{
"epoch": 0.9988938053097345,
"grad_norm": 16.438447952270508,
"learning_rate": 6.430721112282711e-12,
"logits/chosen": -1.265625,
"logits/rejected": -1.19921875,
"logps/chosen": -255.5,
"logps/rejected": -294.0,
"loss": 0.4007,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.6181640625,
"rewards/margins": 2.5703125,
"rewards/rejected": -1.953125,
"step": 903
},
{
"epoch": 1.0,
"grad_norm": 13.59802532196045,
"learning_rate": 1.6076854473801027e-12,
"logits/chosen": -1.359375,
"logits/rejected": -1.16796875,
"logps/chosen": -269.0,
"logps/rejected": -293.0,
"loss": 0.2951,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.6875,
"rewards/margins": 2.75,
"rewards/rejected": -2.0625,
"step": 904
}
],
"logging_steps": 1,
"max_steps": 904,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"train_dataloader_state_dict": null,
"trial_name": null,
"trial_params": null
}