Files
llama-3.1-8b-ultrafeedback-…/trainer_state.json
ModelHub XC 7a74f79f49 初始化项目,由ModelHub XC社区提供模型
Model: vukien2301/llama-3.1-8b-ultrafeedback-dpo-from-epoch1
Source: Original Platform
2026-06-05 12:06:16 +08:00

3885 lines
112 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 226,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004424778761061947,
"grad_norm": 2.562241554260254,
"learning_rate": 7e-07,
"logits/chosen": -0.2119140625,
"logits/rejected": -0.1328125,
"logps/chosen": -242.0,
"logps/rejected": -178.0,
"loss": 0.6914,
"loss/chosen-sft": 1.0,
"loss/dpo": 0.69140625,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.008849557522123894,
"grad_norm": 2.733123302459717,
"learning_rate": 7e-07,
"logits/chosen": -0.263671875,
"logits/rejected": -0.19140625,
"logps/chosen": -225.0,
"logps/rejected": -175.0,
"loss": 0.6929,
"loss/chosen-sft": 1.09375,
"loss/dpo": 0.69140625,
"rewards/accuracies": 0.3125,
"rewards/chosen": 0.00019550323486328125,
"rewards/margins": -0.00183868408203125,
"rewards/rejected": 0.0020294189453125,
"step": 2
},
{
"epoch": 0.01327433628318584,
"grad_norm": 5.071370601654053,
"learning_rate": 7e-07,
"logits/chosen": -0.287109375,
"logits/rejected": -0.1796875,
"logps/chosen": -258.0,
"logps/rejected": -195.0,
"loss": 0.6914,
"loss/chosen-sft": 1.1171875,
"loss/dpo": 0.69140625,
"rewards/accuracies": 0.21875,
"rewards/chosen": -0.000743865966796875,
"rewards/margins": -3.910064697265625e-05,
"rewards/rejected": -0.000701904296875,
"step": 3
},
{
"epoch": 0.017699115044247787,
"grad_norm": 9.871452331542969,
"learning_rate": 7e-07,
"logits/chosen": -0.294921875,
"logits/rejected": -0.310546875,
"logps/chosen": -270.0,
"logps/rejected": -236.0,
"loss": 0.6914,
"loss/chosen-sft": 1.09375,
"loss/dpo": 0.69140625,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.0017242431640625,
"rewards/margins": 0.002197265625,
"rewards/rejected": -0.00390625,
"step": 4
},
{
"epoch": 0.022123893805309734,
"grad_norm": 8.816597938537598,
"learning_rate": 7e-07,
"logits/chosen": -0.1328125,
"logits/rejected": -0.2421875,
"logps/chosen": -280.0,
"logps/rejected": -249.0,
"loss": 0.6914,
"loss/chosen-sft": 1.1484375,
"loss/dpo": 0.6953125,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.00156402587890625,
"rewards/margins": -0.0022735595703125,
"rewards/rejected": 0.000705718994140625,
"step": 5
},
{
"epoch": 0.02654867256637168,
"grad_norm": 8.637555122375488,
"learning_rate": 7e-07,
"logits/chosen": -0.12890625,
"logits/rejected": -0.1611328125,
"logps/chosen": -223.0,
"logps/rejected": -172.0,
"loss": 0.6895,
"loss/chosen-sft": 1.0,
"loss/dpo": 0.69140625,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.005889892578125,
"rewards/margins": 0.006195068359375,
"rewards/rejected": -0.00031280517578125,
"step": 6
},
{
"epoch": 0.030973451327433628,
"grad_norm": 4.954357147216797,
"learning_rate": 7e-07,
"logits/chosen": -0.27734375,
"logits/rejected": -0.125,
"logps/chosen": -312.0,
"logps/rejected": -228.0,
"loss": 0.6899,
"loss/chosen-sft": 1.078125,
"loss/dpo": 0.69140625,
"rewards/accuracies": 0.40625,
"rewards/chosen": 0.00113677978515625,
"rewards/margins": 0.0034027099609375,
"rewards/rejected": -0.0022735595703125,
"step": 7
},
{
"epoch": 0.035398230088495575,
"grad_norm": 7.5598249435424805,
"learning_rate": 7e-07,
"logits/chosen": -0.095703125,
"logits/rejected": -0.1416015625,
"logps/chosen": -240.0,
"logps/rejected": -233.0,
"loss": 0.6875,
"loss/chosen-sft": 1.0546875,
"loss/dpo": 0.6875,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.006988525390625,
"rewards/margins": 0.01007080078125,
"rewards/rejected": -0.003082275390625,
"step": 8
},
{
"epoch": 0.03982300884955752,
"grad_norm": 2.6033551692962646,
"learning_rate": 7e-07,
"logits/chosen": -0.294921875,
"logits/rejected": -0.28125,
"logps/chosen": -292.0,
"logps/rejected": -212.0,
"loss": 0.6885,
"loss/chosen-sft": 1.15625,
"loss/dpo": 0.6875,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.00665283203125,
"rewards/margins": 0.0135498046875,
"rewards/rejected": -0.00689697265625,
"step": 9
},
{
"epoch": 0.04424778761061947,
"grad_norm": 10.546786308288574,
"learning_rate": 7e-07,
"logits/chosen": -0.267578125,
"logits/rejected": -0.050537109375,
"logps/chosen": -308.0,
"logps/rejected": -210.0,
"loss": 0.687,
"loss/chosen-sft": 1.078125,
"loss/dpo": 0.68359375,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0142822265625,
"rewards/margins": 0.02099609375,
"rewards/rejected": -0.006683349609375,
"step": 10
},
{
"epoch": 0.048672566371681415,
"grad_norm": 9.844188690185547,
"learning_rate": 7e-07,
"logits/chosen": -0.3046875,
"logits/rejected": -0.177734375,
"logps/chosen": -243.0,
"logps/rejected": -234.0,
"loss": 0.687,
"loss/chosen-sft": 1.1796875,
"loss/dpo": 0.68359375,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.01025390625,
"rewards/margins": 0.016357421875,
"rewards/rejected": -0.006103515625,
"step": 11
},
{
"epoch": 0.05309734513274336,
"grad_norm": 12.557535171508789,
"learning_rate": 7e-07,
"logits/chosen": -0.2265625,
"logits/rejected": -0.2275390625,
"logps/chosen": -244.0,
"logps/rejected": -219.0,
"loss": 0.686,
"loss/chosen-sft": 1.078125,
"loss/dpo": 0.6875,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0025177001953125,
"rewards/margins": 0.0113525390625,
"rewards/rejected": -0.0087890625,
"step": 12
},
{
"epoch": 0.05752212389380531,
"grad_norm": 8.109821319580078,
"learning_rate": 7e-07,
"logits/chosen": -0.1953125,
"logits/rejected": -0.224609375,
"logps/chosen": -272.0,
"logps/rejected": -217.0,
"loss": 0.6836,
"loss/chosen-sft": 1.03125,
"loss/dpo": 0.68359375,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.00726318359375,
"rewards/margins": 0.0164794921875,
"rewards/rejected": -0.00921630859375,
"step": 13
},
{
"epoch": 0.061946902654867256,
"grad_norm": 8.9277982711792,
"learning_rate": 7e-07,
"logits/chosen": -0.19140625,
"logits/rejected": -0.2412109375,
"logps/chosen": -294.0,
"logps/rejected": -198.0,
"loss": 0.6816,
"loss/chosen-sft": 1.109375,
"loss/dpo": 0.6796875,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.007171630859375,
"rewards/margins": 0.025146484375,
"rewards/rejected": -0.0179443359375,
"step": 14
},
{
"epoch": 0.06637168141592921,
"grad_norm": 3.4456562995910645,
"learning_rate": 7e-07,
"logits/chosen": -0.07470703125,
"logits/rejected": -0.0712890625,
"logps/chosen": -239.0,
"logps/rejected": -227.0,
"loss": 0.6826,
"loss/chosen-sft": 0.93359375,
"loss/dpo": 0.6796875,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.00970458984375,
"rewards/margins": 0.0260009765625,
"rewards/rejected": -0.016357421875,
"step": 15
},
{
"epoch": 0.07079646017699115,
"grad_norm": 3.63268780708313,
"learning_rate": 7e-07,
"logits/chosen": -0.244140625,
"logits/rejected": -0.255859375,
"logps/chosen": -264.0,
"logps/rejected": -208.0,
"loss": 0.6826,
"loss/chosen-sft": 1.0,
"loss/dpo": 0.6796875,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.007537841796875,
"rewards/margins": 0.03173828125,
"rewards/rejected": -0.0242919921875,
"step": 16
},
{
"epoch": 0.0752212389380531,
"grad_norm": 14.086406707763672,
"learning_rate": 7e-07,
"logits/chosen": -0.30078125,
"logits/rejected": -0.271484375,
"logps/chosen": -214.0,
"logps/rejected": -197.0,
"loss": 0.6807,
"loss/chosen-sft": 0.9296875,
"loss/dpo": 0.6875,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.004791259765625,
"rewards/margins": 0.01422119140625,
"rewards/rejected": -0.0189208984375,
"step": 17
},
{
"epoch": 0.07964601769911504,
"grad_norm": 3.81648850440979,
"learning_rate": 7e-07,
"logits/chosen": -0.1435546875,
"logits/rejected": -0.1494140625,
"logps/chosen": -268.0,
"logps/rejected": -223.0,
"loss": 0.6797,
"loss/chosen-sft": 1.0,
"loss/dpo": 0.6796875,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.0057373046875,
"rewards/margins": 0.02734375,
"rewards/rejected": -0.0216064453125,
"step": 18
},
{
"epoch": 0.084070796460177,
"grad_norm": 3.1519317626953125,
"learning_rate": 7e-07,
"logits/chosen": -0.2109375,
"logits/rejected": -0.1220703125,
"logps/chosen": -247.0,
"logps/rejected": -236.0,
"loss": 0.6748,
"loss/chosen-sft": 1.0625,
"loss/dpo": 0.67578125,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.004730224609375,
"rewards/margins": 0.04052734375,
"rewards/rejected": -0.035888671875,
"step": 19
},
{
"epoch": 0.08849557522123894,
"grad_norm": 8.66562271118164,
"learning_rate": 7e-07,
"logits/chosen": -0.1982421875,
"logits/rejected": -0.1904296875,
"logps/chosen": -304.0,
"logps/rejected": -245.0,
"loss": 0.6733,
"loss/chosen-sft": 0.99609375,
"loss/dpo": 0.671875,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.0106201171875,
"rewards/margins": 0.0400390625,
"rewards/rejected": -0.029296875,
"step": 20
},
{
"epoch": 0.09292035398230089,
"grad_norm": 3.798952579498291,
"learning_rate": 7e-07,
"logits/chosen": -0.2578125,
"logits/rejected": -0.0281982421875,
"logps/chosen": -296.0,
"logps/rejected": -190.0,
"loss": 0.6782,
"loss/chosen-sft": 1.125,
"loss/dpo": 0.68359375,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.006195068359375,
"rewards/margins": 0.0238037109375,
"rewards/rejected": -0.030029296875,
"step": 21
},
{
"epoch": 0.09734513274336283,
"grad_norm": 6.044543743133545,
"learning_rate": 7e-07,
"logits/chosen": -0.1728515625,
"logits/rejected": -0.263671875,
"logps/chosen": -238.0,
"logps/rejected": -225.0,
"loss": 0.6743,
"loss/chosen-sft": 0.98828125,
"loss/dpo": 0.68359375,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.004638671875,
"rewards/margins": 0.01611328125,
"rewards/rejected": -0.020751953125,
"step": 22
},
{
"epoch": 0.10176991150442478,
"grad_norm": 6.223972797393799,
"learning_rate": 7e-07,
"logits/chosen": -0.36328125,
"logits/rejected": -0.37890625,
"logps/chosen": -256.0,
"logps/rejected": -235.0,
"loss": 0.6768,
"loss/chosen-sft": 1.09375,
"loss/dpo": 0.6796875,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0225830078125,
"rewards/margins": 0.03173828125,
"rewards/rejected": -0.05419921875,
"step": 23
},
{
"epoch": 0.10619469026548672,
"grad_norm": 2.817391872406006,
"learning_rate": 7e-07,
"logits/chosen": -0.2353515625,
"logits/rejected": -0.27734375,
"logps/chosen": -258.0,
"logps/rejected": -196.0,
"loss": 0.6699,
"loss/chosen-sft": 1.09375,
"loss/dpo": 0.67578125,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.00830078125,
"rewards/margins": 0.03515625,
"rewards/rejected": -0.043212890625,
"step": 24
},
{
"epoch": 0.11061946902654868,
"grad_norm": 5.743912220001221,
"learning_rate": 7e-07,
"logits/chosen": -0.12890625,
"logits/rejected": -0.07861328125,
"logps/chosen": -268.0,
"logps/rejected": -231.0,
"loss": 0.6685,
"loss/chosen-sft": 0.9140625,
"loss/dpo": 0.66796875,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.0189208984375,
"rewards/margins": 0.050537109375,
"rewards/rejected": -0.0693359375,
"step": 25
},
{
"epoch": 0.11504424778761062,
"grad_norm": 3.4631690979003906,
"learning_rate": 7e-07,
"logits/chosen": 0.046142578125,
"logits/rejected": 0.018798828125,
"logps/chosen": -213.0,
"logps/rejected": -252.0,
"loss": 0.6699,
"loss/chosen-sft": 0.87109375,
"loss/dpo": 0.65625,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.0076904296875,
"rewards/margins": 0.07470703125,
"rewards/rejected": -0.06689453125,
"step": 26
},
{
"epoch": 0.11946902654867257,
"grad_norm": 2.302494525909424,
"learning_rate": 7e-07,
"logits/chosen": -0.2734375,
"logits/rejected": -0.296875,
"logps/chosen": -294.0,
"logps/rejected": -219.0,
"loss": 0.6733,
"loss/chosen-sft": 1.1796875,
"loss/dpo": 0.6640625,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.0025177001953125,
"rewards/margins": 0.058837890625,
"rewards/rejected": -0.061279296875,
"step": 27
},
{
"epoch": 0.12389380530973451,
"grad_norm": 15.041751861572266,
"learning_rate": 7e-07,
"logits/chosen": -0.326171875,
"logits/rejected": -0.2255859375,
"logps/chosen": -324.0,
"logps/rejected": -215.0,
"loss": 0.6597,
"loss/chosen-sft": 1.1328125,
"loss/dpo": 0.65625,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.0016632080078125,
"rewards/margins": 0.076171875,
"rewards/rejected": -0.07470703125,
"step": 28
},
{
"epoch": 0.12831858407079647,
"grad_norm": 10.651082992553711,
"learning_rate": 7e-07,
"logits/chosen": -0.055908203125,
"logits/rejected": -0.04638671875,
"logps/chosen": -264.0,
"logps/rejected": -233.0,
"loss": 0.6704,
"loss/chosen-sft": 0.9140625,
"loss/dpo": 0.65625,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0001506805419921875,
"rewards/margins": 0.0751953125,
"rewards/rejected": -0.0751953125,
"step": 29
},
{
"epoch": 0.13274336283185842,
"grad_norm": 17.013574600219727,
"learning_rate": 7e-07,
"logits/chosen": -0.21484375,
"logits/rejected": -0.138671875,
"logps/chosen": -225.0,
"logps/rejected": -202.0,
"loss": 0.6685,
"loss/chosen-sft": 0.984375,
"loss/dpo": 0.67578125,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.0390625,
"rewards/margins": 0.038818359375,
"rewards/rejected": -0.07763671875,
"step": 30
},
{
"epoch": 0.13716814159292035,
"grad_norm": 3.365304708480835,
"learning_rate": 7e-07,
"logits/chosen": -0.267578125,
"logits/rejected": -0.25390625,
"logps/chosen": -296.0,
"logps/rejected": -262.0,
"loss": 0.6641,
"loss/chosen-sft": 1.1640625,
"loss/dpo": 0.6640625,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.0126953125,
"rewards/margins": 0.06689453125,
"rewards/rejected": -0.07958984375,
"step": 31
},
{
"epoch": 0.1415929203539823,
"grad_norm": 5.34974479675293,
"learning_rate": 7e-07,
"logits/chosen": -0.263671875,
"logits/rejected": -0.294921875,
"logps/chosen": -234.0,
"logps/rejected": -255.0,
"loss": 0.668,
"loss/chosen-sft": 1.0546875,
"loss/dpo": 0.65625,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.0081787109375,
"rewards/margins": 0.0830078125,
"rewards/rejected": -0.07470703125,
"step": 32
},
{
"epoch": 0.14601769911504425,
"grad_norm": 5.2689948081970215,
"learning_rate": 7e-07,
"logits/chosen": -0.255859375,
"logits/rejected": -0.38671875,
"logps/chosen": -221.0,
"logps/rejected": -318.0,
"loss": 0.6582,
"loss/chosen-sft": 1.0390625,
"loss/dpo": 0.6484375,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0267333984375,
"rewards/margins": 0.09814453125,
"rewards/rejected": -0.12451171875,
"step": 33
},
{
"epoch": 0.1504424778761062,
"grad_norm": 13.799201011657715,
"learning_rate": 7e-07,
"logits/chosen": -0.275390625,
"logits/rejected": -0.21875,
"logps/chosen": -238.0,
"logps/rejected": -208.0,
"loss": 0.6733,
"loss/chosen-sft": 1.046875,
"loss/dpo": 0.6484375,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.03662109375,
"rewards/margins": 0.09765625,
"rewards/rejected": -0.1337890625,
"step": 34
},
{
"epoch": 0.15486725663716813,
"grad_norm": 13.07458782196045,
"learning_rate": 7e-07,
"logits/chosen": -0.369140625,
"logits/rejected": -0.24609375,
"logps/chosen": -280.0,
"logps/rejected": -255.0,
"loss": 0.6631,
"loss/chosen-sft": 1.1015625,
"loss/dpo": 0.66015625,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.078125,
"rewards/margins": 0.07470703125,
"rewards/rejected": -0.15234375,
"step": 35
},
{
"epoch": 0.1592920353982301,
"grad_norm": 2.256340742111206,
"learning_rate": 7e-07,
"logits/chosen": -0.287109375,
"logits/rejected": -0.28125,
"logps/chosen": -226.0,
"logps/rejected": -240.0,
"loss": 0.6543,
"loss/chosen-sft": 1.015625,
"loss/dpo": 0.6484375,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.049072265625,
"rewards/margins": 0.095703125,
"rewards/rejected": -0.1455078125,
"step": 36
},
{
"epoch": 0.16371681415929204,
"grad_norm": 8.795002937316895,
"learning_rate": 7e-07,
"logits/chosen": -0.2294921875,
"logits/rejected": -0.21484375,
"logps/chosen": -225.0,
"logps/rejected": -216.0,
"loss": 0.6523,
"loss/chosen-sft": 0.9765625,
"loss/dpo": 0.671875,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.06787109375,
"rewards/margins": 0.047607421875,
"rewards/rejected": -0.115234375,
"step": 37
},
{
"epoch": 0.168141592920354,
"grad_norm": 2.735612154006958,
"learning_rate": 7e-07,
"logits/chosen": -0.271484375,
"logits/rejected": -0.26171875,
"logps/chosen": -322.0,
"logps/rejected": -236.0,
"loss": 0.6455,
"loss/chosen-sft": 1.234375,
"loss/dpo": 0.625,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.029052734375,
"rewards/margins": 0.1474609375,
"rewards/rejected": -0.1767578125,
"step": 38
},
{
"epoch": 0.17256637168141592,
"grad_norm": 17.598873138427734,
"learning_rate": 7e-07,
"logits/chosen": -0.23046875,
"logits/rejected": -0.357421875,
"logps/chosen": -237.0,
"logps/rejected": -253.0,
"loss": 0.6367,
"loss/chosen-sft": 1.0625,
"loss/dpo": 0.64453125,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.053955078125,
"rewards/margins": 0.10888671875,
"rewards/rejected": -0.1630859375,
"step": 39
},
{
"epoch": 0.17699115044247787,
"grad_norm": 4.538353443145752,
"learning_rate": 7e-07,
"logits/chosen": -0.1953125,
"logits/rejected": -0.263671875,
"logps/chosen": -240.0,
"logps/rejected": -290.0,
"loss": 0.6475,
"loss/chosen-sft": 0.9296875,
"loss/dpo": 0.6328125,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.0625,
"rewards/margins": 0.13671875,
"rewards/rejected": -0.19921875,
"step": 40
},
{
"epoch": 0.18141592920353983,
"grad_norm": 17.900062561035156,
"learning_rate": 7e-07,
"logits/chosen": -0.349609375,
"logits/rejected": -0.306640625,
"logps/chosen": -260.0,
"logps/rejected": -284.0,
"loss": 0.6538,
"loss/chosen-sft": 1.03125,
"loss/dpo": 0.65625,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.09912109375,
"rewards/margins": 0.08154296875,
"rewards/rejected": -0.1806640625,
"step": 41
},
{
"epoch": 0.18584070796460178,
"grad_norm": 2.5172457695007324,
"learning_rate": 7e-07,
"logits/chosen": -0.40625,
"logits/rejected": -0.38671875,
"logps/chosen": -272.0,
"logps/rejected": -216.0,
"loss": 0.6426,
"loss/chosen-sft": 1.234375,
"loss/dpo": 0.625,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.07080078125,
"rewards/margins": 0.1708984375,
"rewards/rejected": -0.2421875,
"step": 42
},
{
"epoch": 0.1902654867256637,
"grad_norm": 10.608353614807129,
"learning_rate": 7e-07,
"logits/chosen": -0.400390625,
"logits/rejected": -0.353515625,
"logps/chosen": -240.0,
"logps/rejected": -224.0,
"loss": 0.6455,
"loss/chosen-sft": 1.140625,
"loss/dpo": 0.6328125,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.0888671875,
"rewards/margins": 0.1357421875,
"rewards/rejected": -0.224609375,
"step": 43
},
{
"epoch": 0.19469026548672566,
"grad_norm": 17.277069091796875,
"learning_rate": 7e-07,
"logits/chosen": -0.33984375,
"logits/rejected": -0.37109375,
"logps/chosen": -300.0,
"logps/rejected": -215.0,
"loss": 0.6396,
"loss/chosen-sft": 1.1640625,
"loss/dpo": 0.60546875,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.06005859375,
"rewards/margins": 0.2041015625,
"rewards/rejected": -0.263671875,
"step": 44
},
{
"epoch": 0.19911504424778761,
"grad_norm": 18.799257278442383,
"learning_rate": 7e-07,
"logits/chosen": -0.0888671875,
"logits/rejected": -0.171875,
"logps/chosen": -234.0,
"logps/rejected": -260.0,
"loss": 0.6382,
"loss/chosen-sft": 0.890625,
"loss/dpo": 0.60546875,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.08203125,
"rewards/margins": 0.2197265625,
"rewards/rejected": -0.302734375,
"step": 45
},
{
"epoch": 0.20353982300884957,
"grad_norm": 34.8527717590332,
"learning_rate": 7e-07,
"logits/chosen": -0.3828125,
"logits/rejected": -0.46875,
"logps/chosen": -246.0,
"logps/rejected": -255.0,
"loss": 0.6538,
"loss/chosen-sft": 1.109375,
"loss/dpo": 0.71875,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.240234375,
"rewards/margins": -0.01708984375,
"rewards/rejected": -0.22265625,
"step": 46
},
{
"epoch": 0.2079646017699115,
"grad_norm": 6.215828895568848,
"learning_rate": 7e-07,
"logits/chosen": -0.578125,
"logits/rejected": -0.49609375,
"logps/chosen": -288.0,
"logps/rejected": -284.0,
"loss": 0.6431,
"loss/chosen-sft": 1.25,
"loss/dpo": 0.61328125,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0625,
"rewards/margins": 0.18359375,
"rewards/rejected": -0.2470703125,
"step": 47
},
{
"epoch": 0.21238938053097345,
"grad_norm": 2.940314531326294,
"learning_rate": 7e-07,
"logits/chosen": -0.361328125,
"logits/rejected": -0.400390625,
"logps/chosen": -288.0,
"logps/rejected": -214.0,
"loss": 0.627,
"loss/chosen-sft": 1.171875,
"loss/dpo": 0.62109375,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1357421875,
"rewards/margins": 0.1748046875,
"rewards/rejected": -0.310546875,
"step": 48
},
{
"epoch": 0.2168141592920354,
"grad_norm": 124.2203598022461,
"learning_rate": 7e-07,
"logits/chosen": -0.2255859375,
"logits/rejected": -0.27734375,
"logps/chosen": -276.0,
"logps/rejected": -280.0,
"loss": 0.6323,
"loss/chosen-sft": 1.015625,
"loss/dpo": 0.61328125,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.076171875,
"rewards/margins": 0.177734375,
"rewards/rejected": -0.25390625,
"step": 49
},
{
"epoch": 0.22123893805309736,
"grad_norm": 49.72818374633789,
"learning_rate": 7e-07,
"logits/chosen": -0.5859375,
"logits/rejected": -0.55859375,
"logps/chosen": -312.0,
"logps/rejected": -266.0,
"loss": 0.6343,
"loss/chosen-sft": 1.1796875,
"loss/dpo": 0.609375,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.10791015625,
"rewards/margins": 0.2021484375,
"rewards/rejected": -0.310546875,
"step": 50
},
{
"epoch": 0.22566371681415928,
"grad_norm": 110.1352310180664,
"learning_rate": 7e-07,
"logits/chosen": -0.44921875,
"logits/rejected": -0.431640625,
"logps/chosen": -274.0,
"logps/rejected": -264.0,
"loss": 0.6245,
"loss/chosen-sft": 1.171875,
"loss/dpo": 0.61328125,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.1162109375,
"rewards/margins": 0.208984375,
"rewards/rejected": -0.32421875,
"step": 51
},
{
"epoch": 0.23008849557522124,
"grad_norm": 14.234803199768066,
"learning_rate": 7e-07,
"logits/chosen": -0.40625,
"logits/rejected": -0.28515625,
"logps/chosen": -268.0,
"logps/rejected": -256.0,
"loss": 0.627,
"loss/chosen-sft": 1.046875,
"loss/dpo": 0.66796875,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.220703125,
"rewards/margins": 0.0849609375,
"rewards/rejected": -0.306640625,
"step": 52
},
{
"epoch": 0.2345132743362832,
"grad_norm": 25.839595794677734,
"learning_rate": 7e-07,
"logits/chosen": -0.61328125,
"logits/rejected": -0.5390625,
"logps/chosen": -302.0,
"logps/rejected": -251.0,
"loss": 0.6152,
"loss/chosen-sft": 1.3203125,
"loss/dpo": 0.58203125,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.0849609375,
"rewards/margins": 0.2578125,
"rewards/rejected": -0.34375,
"step": 53
},
{
"epoch": 0.23893805309734514,
"grad_norm": 17.5559024810791,
"learning_rate": 7e-07,
"logits/chosen": -0.32421875,
"logits/rejected": -0.4140625,
"logps/chosen": -241.0,
"logps/rejected": -272.0,
"loss": 0.6279,
"loss/chosen-sft": 1.109375,
"loss/dpo": 0.61328125,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.173828125,
"rewards/margins": 0.21875,
"rewards/rejected": -0.392578125,
"step": 54
},
{
"epoch": 0.24336283185840707,
"grad_norm": 4.360561847686768,
"learning_rate": 7e-07,
"logits/chosen": -0.2353515625,
"logits/rejected": -0.10107421875,
"logps/chosen": -284.0,
"logps/rejected": -237.0,
"loss": 0.605,
"loss/chosen-sft": 1.0546875,
"loss/dpo": 0.6015625,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.1689453125,
"rewards/margins": 0.2353515625,
"rewards/rejected": -0.404296875,
"step": 55
},
{
"epoch": 0.24778761061946902,
"grad_norm": 16.581727981567383,
"learning_rate": 7e-07,
"logits/chosen": -0.431640625,
"logits/rejected": -0.39453125,
"logps/chosen": -340.0,
"logps/rejected": -294.0,
"loss": 0.6133,
"loss/chosen-sft": 1.25,
"loss/dpo": 0.6484375,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.2177734375,
"rewards/margins": 0.1396484375,
"rewards/rejected": -0.357421875,
"step": 56
},
{
"epoch": 0.252212389380531,
"grad_norm": 49.88325119018555,
"learning_rate": 7e-07,
"logits/chosen": -0.396484375,
"logits/rejected": -0.173828125,
"logps/chosen": -226.0,
"logps/rejected": -191.0,
"loss": 0.6279,
"loss/chosen-sft": 1.0546875,
"loss/dpo": 0.68359375,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1884765625,
"rewards/margins": 0.036865234375,
"rewards/rejected": -0.2255859375,
"step": 57
},
{
"epoch": 0.25663716814159293,
"grad_norm": 24.68882179260254,
"learning_rate": 7e-07,
"logits/chosen": -0.37890625,
"logits/rejected": -0.28515625,
"logps/chosen": -244.0,
"logps/rejected": -260.0,
"loss": 0.6133,
"loss/chosen-sft": 1.078125,
"loss/dpo": 0.60546875,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.23828125,
"rewards/margins": 0.2255859375,
"rewards/rejected": -0.46484375,
"step": 58
},
{
"epoch": 0.2610619469026549,
"grad_norm": 15.78062915802002,
"learning_rate": 7e-07,
"logits/chosen": -0.294921875,
"logits/rejected": -0.375,
"logps/chosen": -300.0,
"logps/rejected": -255.0,
"loss": 0.6138,
"loss/chosen-sft": 1.1796875,
"loss/dpo": 0.59375,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.154296875,
"rewards/margins": 0.2421875,
"rewards/rejected": -0.396484375,
"step": 59
},
{
"epoch": 0.26548672566371684,
"grad_norm": 20.29654884338379,
"learning_rate": 7e-07,
"logits/chosen": -0.318359375,
"logits/rejected": -0.376953125,
"logps/chosen": -280.0,
"logps/rejected": -300.0,
"loss": 0.6206,
"loss/chosen-sft": 1.2265625,
"loss/dpo": 0.6484375,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2265625,
"rewards/margins": 0.1357421875,
"rewards/rejected": -0.361328125,
"step": 60
},
{
"epoch": 0.26991150442477874,
"grad_norm": 43.65996551513672,
"learning_rate": 7e-07,
"logits/chosen": -0.36328125,
"logits/rejected": -0.3046875,
"logps/chosen": -238.0,
"logps/rejected": -278.0,
"loss": 0.6255,
"loss/chosen-sft": 1.25,
"loss/dpo": 0.59375,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.1572265625,
"rewards/margins": 0.2578125,
"rewards/rejected": -0.416015625,
"step": 61
},
{
"epoch": 0.2743362831858407,
"grad_norm": 72.5498046875,
"learning_rate": 7e-07,
"logits/chosen": -0.359375,
"logits/rejected": -0.373046875,
"logps/chosen": -318.0,
"logps/rejected": -270.0,
"loss": 0.6245,
"loss/chosen-sft": 1.1015625,
"loss/dpo": 0.59375,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.1884765625,
"rewards/margins": 0.275390625,
"rewards/rejected": -0.46484375,
"step": 62
},
{
"epoch": 0.27876106194690264,
"grad_norm": 99.62593078613281,
"learning_rate": 7e-07,
"logits/chosen": -0.265625,
"logits/rejected": -0.33984375,
"logps/chosen": -294.0,
"logps/rejected": -258.0,
"loss": 0.627,
"loss/chosen-sft": 1.0703125,
"loss/dpo": 0.65234375,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.267578125,
"rewards/margins": 0.1328125,
"rewards/rejected": -0.400390625,
"step": 63
},
{
"epoch": 0.2831858407079646,
"grad_norm": 71.84874725341797,
"learning_rate": 7e-07,
"logits/chosen": -0.359375,
"logits/rejected": -0.392578125,
"logps/chosen": -260.0,
"logps/rejected": -219.0,
"loss": 0.6138,
"loss/chosen-sft": 1.171875,
"loss/dpo": 0.671875,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.248046875,
"rewards/margins": 0.087890625,
"rewards/rejected": -0.3359375,
"step": 64
},
{
"epoch": 0.28761061946902655,
"grad_norm": 43.867332458496094,
"learning_rate": 7e-07,
"logits/chosen": -0.494140625,
"logits/rejected": -0.38671875,
"logps/chosen": -298.0,
"logps/rejected": -276.0,
"loss": 0.6162,
"loss/chosen-sft": 1.234375,
"loss/dpo": 0.6171875,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.251953125,
"rewards/margins": 0.2021484375,
"rewards/rejected": -0.455078125,
"step": 65
},
{
"epoch": 0.2920353982300885,
"grad_norm": 15.392335891723633,
"learning_rate": 7e-07,
"logits/chosen": -0.275390625,
"logits/rejected": -0.1953125,
"logps/chosen": -241.0,
"logps/rejected": -234.0,
"loss": 0.6211,
"loss/chosen-sft": 1.1953125,
"loss/dpo": 0.703125,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.251953125,
"rewards/margins": 0.017578125,
"rewards/rejected": -0.26953125,
"step": 66
},
{
"epoch": 0.29646017699115046,
"grad_norm": 17.315120697021484,
"learning_rate": 7e-07,
"logits/chosen": -0.271484375,
"logits/rejected": -0.3828125,
"logps/chosen": -268.0,
"logps/rejected": -292.0,
"loss": 0.6226,
"loss/chosen-sft": 1.0078125,
"loss/dpo": 0.6171875,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.30078125,
"rewards/margins": 0.203125,
"rewards/rejected": -0.50390625,
"step": 67
},
{
"epoch": 0.3008849557522124,
"grad_norm": 17.68255615234375,
"learning_rate": 7e-07,
"logits/chosen": -0.369140625,
"logits/rejected": -0.50390625,
"logps/chosen": -266.0,
"logps/rejected": -248.0,
"loss": 0.6143,
"loss/chosen-sft": 1.109375,
"loss/dpo": 0.58203125,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.138671875,
"rewards/margins": 0.3046875,
"rewards/rejected": -0.443359375,
"step": 68
},
{
"epoch": 0.3053097345132743,
"grad_norm": 31.16883087158203,
"learning_rate": 7e-07,
"logits/chosen": -0.373046875,
"logits/rejected": -0.369140625,
"logps/chosen": -256.0,
"logps/rejected": -235.0,
"loss": 0.6191,
"loss/chosen-sft": 1.15625,
"loss/dpo": 0.59765625,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.1884765625,
"rewards/margins": 0.279296875,
"rewards/rejected": -0.46875,
"step": 69
},
{
"epoch": 0.30973451327433627,
"grad_norm": 65.01067352294922,
"learning_rate": 7e-07,
"logits/chosen": -0.423828125,
"logits/rejected": -0.5,
"logps/chosen": -272.0,
"logps/rejected": -244.0,
"loss": 0.5884,
"loss/chosen-sft": 1.21875,
"loss/dpo": 0.6015625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1787109375,
"rewards/margins": 0.251953125,
"rewards/rejected": -0.431640625,
"step": 70
},
{
"epoch": 0.3141592920353982,
"grad_norm": 9.247684478759766,
"learning_rate": 7e-07,
"logits/chosen": -0.412109375,
"logits/rejected": -0.421875,
"logps/chosen": -284.0,
"logps/rejected": -245.0,
"loss": 0.605,
"loss/chosen-sft": 1.203125,
"loss/dpo": 0.671875,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2216796875,
"rewards/margins": 0.1396484375,
"rewards/rejected": -0.361328125,
"step": 71
},
{
"epoch": 0.3185840707964602,
"grad_norm": 29.392963409423828,
"learning_rate": 7e-07,
"logits/chosen": -0.46875,
"logits/rejected": -0.462890625,
"logps/chosen": -346.0,
"logps/rejected": -292.0,
"loss": 0.5928,
"loss/chosen-sft": 1.25,
"loss/dpo": 0.64453125,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.30078125,
"rewards/margins": 0.1650390625,
"rewards/rejected": -0.46484375,
"step": 72
},
{
"epoch": 0.3230088495575221,
"grad_norm": 12.59981918334961,
"learning_rate": 7e-07,
"logits/chosen": -0.6171875,
"logits/rejected": -0.41796875,
"logps/chosen": -296.0,
"logps/rejected": -253.0,
"loss": 0.6255,
"loss/chosen-sft": 1.2734375,
"loss/dpo": 0.62109375,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.228515625,
"rewards/margins": 0.1884765625,
"rewards/rejected": -0.416015625,
"step": 73
},
{
"epoch": 0.3274336283185841,
"grad_norm": 27.949209213256836,
"learning_rate": 7e-07,
"logits/chosen": -0.5,
"logits/rejected": -0.4921875,
"logps/chosen": -272.0,
"logps/rejected": -218.0,
"loss": 0.5723,
"loss/chosen-sft": 1.1171875,
"loss/dpo": 0.62109375,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.25390625,
"rewards/margins": 0.1806640625,
"rewards/rejected": -0.435546875,
"step": 74
},
{
"epoch": 0.33185840707964603,
"grad_norm": 42.84572219848633,
"learning_rate": 7e-07,
"logits/chosen": -0.54296875,
"logits/rejected": -0.5546875,
"logps/chosen": -268.0,
"logps/rejected": -222.0,
"loss": 0.6099,
"loss/chosen-sft": 1.265625,
"loss/dpo": 0.6015625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1572265625,
"rewards/margins": 0.2392578125,
"rewards/rejected": -0.396484375,
"step": 75
},
{
"epoch": 0.336283185840708,
"grad_norm": 58.05986404418945,
"learning_rate": 7e-07,
"logits/chosen": -0.5390625,
"logits/rejected": -0.51171875,
"logps/chosen": -206.0,
"logps/rejected": -262.0,
"loss": 0.6021,
"loss/chosen-sft": 1.0546875,
"loss/dpo": 0.58984375,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1728515625,
"rewards/margins": 0.24609375,
"rewards/rejected": -0.41796875,
"step": 76
},
{
"epoch": 0.3407079646017699,
"grad_norm": 31.465473175048828,
"learning_rate": 7e-07,
"logits/chosen": -0.423828125,
"logits/rejected": -0.1796875,
"logps/chosen": -264.0,
"logps/rejected": -286.0,
"loss": 0.5967,
"loss/chosen-sft": 1.0703125,
"loss/dpo": 0.62890625,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.31640625,
"rewards/margins": 0.19921875,
"rewards/rejected": -0.515625,
"step": 77
},
{
"epoch": 0.34513274336283184,
"grad_norm": 57.08030700683594,
"learning_rate": 7e-07,
"logits/chosen": -0.578125,
"logits/rejected": -0.326171875,
"logps/chosen": -304.0,
"logps/rejected": -240.0,
"loss": 0.6187,
"loss/chosen-sft": 1.328125,
"loss/dpo": 0.59375,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.22265625,
"rewards/margins": 0.267578125,
"rewards/rejected": -0.490234375,
"step": 78
},
{
"epoch": 0.3495575221238938,
"grad_norm": 29.671070098876953,
"learning_rate": 7e-07,
"logits/chosen": -0.48046875,
"logits/rejected": -0.59765625,
"logps/chosen": -292.0,
"logps/rejected": -256.0,
"loss": 0.6147,
"loss/chosen-sft": 1.2265625,
"loss/dpo": 0.55859375,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.1416015625,
"rewards/margins": 0.33984375,
"rewards/rejected": -0.48046875,
"step": 79
},
{
"epoch": 0.35398230088495575,
"grad_norm": 48.64991760253906,
"learning_rate": 7e-07,
"logits/chosen": -0.59375,
"logits/rejected": -0.6328125,
"logps/chosen": -288.0,
"logps/rejected": -264.0,
"loss": 0.6309,
"loss/chosen-sft": 1.2578125,
"loss/dpo": 0.6484375,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.33984375,
"rewards/margins": 0.162109375,
"rewards/rejected": -0.50390625,
"step": 80
},
{
"epoch": 0.3584070796460177,
"grad_norm": 44.117034912109375,
"learning_rate": 7e-07,
"logits/chosen": -0.49609375,
"logits/rejected": -0.51953125,
"logps/chosen": -252.0,
"logps/rejected": -262.0,
"loss": 0.5845,
"loss/chosen-sft": 1.0859375,
"loss/dpo": 0.55078125,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.2412109375,
"rewards/margins": 0.3828125,
"rewards/rejected": -0.625,
"step": 81
},
{
"epoch": 0.36283185840707965,
"grad_norm": 45.00334167480469,
"learning_rate": 7e-07,
"logits/chosen": -0.486328125,
"logits/rejected": -0.5390625,
"logps/chosen": -294.0,
"logps/rejected": -288.0,
"loss": 0.6045,
"loss/chosen-sft": 1.21875,
"loss/dpo": 0.578125,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.287109375,
"rewards/margins": 0.328125,
"rewards/rejected": -0.6171875,
"step": 82
},
{
"epoch": 0.3672566371681416,
"grad_norm": 6.153012752532959,
"learning_rate": 7e-07,
"logits/chosen": -0.427734375,
"logits/rejected": -0.427734375,
"logps/chosen": -272.0,
"logps/rejected": -278.0,
"loss": 0.582,
"loss/chosen-sft": 1.0546875,
"loss/dpo": 0.5625,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.1689453125,
"rewards/margins": 0.357421875,
"rewards/rejected": -0.52734375,
"step": 83
},
{
"epoch": 0.37168141592920356,
"grad_norm": 4.050904273986816,
"learning_rate": 7e-07,
"logits/chosen": -0.349609375,
"logits/rejected": -0.3984375,
"logps/chosen": -288.0,
"logps/rejected": -274.0,
"loss": 0.603,
"loss/chosen-sft": 1.1796875,
"loss/dpo": 0.62109375,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.28515625,
"rewards/margins": 0.2333984375,
"rewards/rejected": -0.51953125,
"step": 84
},
{
"epoch": 0.37610619469026546,
"grad_norm": 53.353515625,
"learning_rate": 7e-07,
"logits/chosen": -0.455078125,
"logits/rejected": -0.462890625,
"logps/chosen": -278.0,
"logps/rejected": -227.0,
"loss": 0.6323,
"loss/chosen-sft": 1.109375,
"loss/dpo": 0.609375,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.24609375,
"rewards/margins": 0.2373046875,
"rewards/rejected": -0.482421875,
"step": 85
},
{
"epoch": 0.3805309734513274,
"grad_norm": 63.98937225341797,
"learning_rate": 7e-07,
"logits/chosen": -0.400390625,
"logits/rejected": -0.4609375,
"logps/chosen": -300.0,
"logps/rejected": -340.0,
"loss": 0.6182,
"loss/chosen-sft": 1.046875,
"loss/dpo": 0.62890625,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.349609375,
"rewards/margins": 0.1904296875,
"rewards/rejected": -0.5390625,
"step": 86
},
{
"epoch": 0.38495575221238937,
"grad_norm": 22.751070022583008,
"learning_rate": 7e-07,
"logits/chosen": -0.416015625,
"logits/rejected": -0.47265625,
"logps/chosen": -270.0,
"logps/rejected": -234.0,
"loss": 0.5938,
"loss/chosen-sft": 1.109375,
"loss/dpo": 0.6015625,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.203125,
"rewards/margins": 0.2734375,
"rewards/rejected": -0.4765625,
"step": 87
},
{
"epoch": 0.3893805309734513,
"grad_norm": 15.500909805297852,
"learning_rate": 7e-07,
"logits/chosen": -0.42578125,
"logits/rejected": -0.51171875,
"logps/chosen": -220.0,
"logps/rejected": -222.0,
"loss": 0.5884,
"loss/chosen-sft": 1.3828125,
"loss/dpo": 0.69140625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.33203125,
"rewards/margins": 0.0634765625,
"rewards/rejected": -0.396484375,
"step": 88
},
{
"epoch": 0.3938053097345133,
"grad_norm": 21.14480209350586,
"learning_rate": 7e-07,
"logits/chosen": -0.50390625,
"logits/rejected": -0.4921875,
"logps/chosen": -282.0,
"logps/rejected": -223.0,
"loss": 0.584,
"loss/chosen-sft": 1.296875,
"loss/dpo": 0.65234375,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.27734375,
"rewards/margins": 0.1875,
"rewards/rejected": -0.466796875,
"step": 89
},
{
"epoch": 0.39823008849557523,
"grad_norm": 14.2146635055542,
"learning_rate": 7e-07,
"logits/chosen": -0.447265625,
"logits/rejected": -0.392578125,
"logps/chosen": -268.0,
"logps/rejected": -215.0,
"loss": 0.6104,
"loss/chosen-sft": 1.09375,
"loss/dpo": 0.65625,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.359375,
"rewards/margins": 0.1923828125,
"rewards/rejected": -0.55078125,
"step": 90
},
{
"epoch": 0.4026548672566372,
"grad_norm": 76.13188171386719,
"learning_rate": 7e-07,
"logits/chosen": -0.478515625,
"logits/rejected": -0.462890625,
"logps/chosen": -270.0,
"logps/rejected": -280.0,
"loss": 0.5781,
"loss/chosen-sft": 1.2109375,
"loss/dpo": 0.59765625,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.2421875,
"rewards/margins": 0.2431640625,
"rewards/rejected": -0.486328125,
"step": 91
},
{
"epoch": 0.40707964601769914,
"grad_norm": 34.39772033691406,
"learning_rate": 7e-07,
"logits/chosen": -0.5625,
"logits/rejected": -0.4921875,
"logps/chosen": -308.0,
"logps/rejected": -294.0,
"loss": 0.5698,
"loss/chosen-sft": 1.3203125,
"loss/dpo": 0.52734375,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2275390625,
"rewards/margins": 0.49609375,
"rewards/rejected": -0.7265625,
"step": 92
},
{
"epoch": 0.41150442477876104,
"grad_norm": 36.51502227783203,
"learning_rate": 7e-07,
"logits/chosen": -0.4296875,
"logits/rejected": -0.51953125,
"logps/chosen": -278.0,
"logps/rejected": -336.0,
"loss": 0.5869,
"loss/chosen-sft": 1.15625,
"loss/dpo": 0.57421875,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.29296875,
"rewards/margins": 0.314453125,
"rewards/rejected": -0.60546875,
"step": 93
},
{
"epoch": 0.415929203539823,
"grad_norm": 41.33882141113281,
"learning_rate": 7e-07,
"logits/chosen": -0.291015625,
"logits/rejected": -0.26171875,
"logps/chosen": -255.0,
"logps/rejected": -251.0,
"loss": 0.5908,
"loss/chosen-sft": 1.171875,
"loss/dpo": 0.6171875,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.310546875,
"rewards/margins": 0.232421875,
"rewards/rejected": -0.54296875,
"step": 94
},
{
"epoch": 0.42035398230088494,
"grad_norm": 25.335350036621094,
"learning_rate": 7e-07,
"logits/chosen": -0.2578125,
"logits/rejected": -0.26953125,
"logps/chosen": -234.0,
"logps/rejected": -312.0,
"loss": 0.5811,
"loss/chosen-sft": 1.03125,
"loss/dpo": 0.5546875,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.298828125,
"rewards/margins": 0.41796875,
"rewards/rejected": -0.71875,
"step": 95
},
{
"epoch": 0.4247787610619469,
"grad_norm": 14.678760528564453,
"learning_rate": 7e-07,
"logits/chosen": -0.46484375,
"logits/rejected": -0.439453125,
"logps/chosen": -304.0,
"logps/rejected": -248.0,
"loss": 0.6094,
"loss/chosen-sft": 1.2265625,
"loss/dpo": 0.6484375,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.287109375,
"rewards/margins": 0.1328125,
"rewards/rejected": -0.41796875,
"step": 96
},
{
"epoch": 0.42920353982300885,
"grad_norm": 12.859259605407715,
"learning_rate": 7e-07,
"logits/chosen": -0.50390625,
"logits/rejected": -0.4765625,
"logps/chosen": -255.0,
"logps/rejected": -226.0,
"loss": 0.5938,
"loss/chosen-sft": 1.1953125,
"loss/dpo": 0.64453125,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.353515625,
"rewards/margins": 0.15625,
"rewards/rejected": -0.51171875,
"step": 97
},
{
"epoch": 0.4336283185840708,
"grad_norm": 9.326794624328613,
"learning_rate": 7e-07,
"logits/chosen": -0.4765625,
"logits/rejected": -0.447265625,
"logps/chosen": -272.0,
"logps/rejected": -268.0,
"loss": 0.5752,
"loss/chosen-sft": 1.1640625,
"loss/dpo": 0.54296875,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.291015625,
"rewards/margins": 0.4765625,
"rewards/rejected": -0.76953125,
"step": 98
},
{
"epoch": 0.43805309734513276,
"grad_norm": 10.561424255371094,
"learning_rate": 7e-07,
"logits/chosen": -0.5625,
"logits/rejected": -0.5234375,
"logps/chosen": -304.0,
"logps/rejected": -282.0,
"loss": 0.5786,
"loss/chosen-sft": 1.1640625,
"loss/dpo": 0.578125,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.390625,
"rewards/margins": 0.353515625,
"rewards/rejected": -0.7421875,
"step": 99
},
{
"epoch": 0.4424778761061947,
"grad_norm": 102.05797576904297,
"learning_rate": 7e-07,
"logits/chosen": -0.44921875,
"logits/rejected": -0.423828125,
"logps/chosen": -235.0,
"logps/rejected": -284.0,
"loss": 0.6079,
"loss/chosen-sft": 1.1171875,
"loss/dpo": 0.65234375,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.423828125,
"rewards/margins": 0.236328125,
"rewards/rejected": -0.66015625,
"step": 100
},
{
"epoch": 0.4469026548672566,
"grad_norm": 10.166366577148438,
"learning_rate": 7e-07,
"logits/chosen": -0.51953125,
"logits/rejected": -0.431640625,
"logps/chosen": -342.0,
"logps/rejected": -302.0,
"loss": 0.5864,
"loss/chosen-sft": 1.1953125,
"loss/dpo": 0.53515625,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.333984375,
"rewards/margins": 0.5,
"rewards/rejected": -0.8359375,
"step": 101
},
{
"epoch": 0.45132743362831856,
"grad_norm": 9.208560943603516,
"learning_rate": 7e-07,
"logits/chosen": -0.373046875,
"logits/rejected": -0.3359375,
"logps/chosen": -332.0,
"logps/rejected": -326.0,
"loss": 0.584,
"loss/chosen-sft": 1.1640625,
"loss/dpo": 0.5390625,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.30078125,
"rewards/margins": 0.435546875,
"rewards/rejected": -0.73828125,
"step": 102
},
{
"epoch": 0.4557522123893805,
"grad_norm": 58.12112045288086,
"learning_rate": 7e-07,
"logits/chosen": -0.46484375,
"logits/rejected": -0.486328125,
"logps/chosen": -298.0,
"logps/rejected": -300.0,
"loss": 0.6191,
"loss/chosen-sft": 1.1640625,
"loss/dpo": 0.58984375,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.365234375,
"rewards/margins": 0.40625,
"rewards/rejected": -0.76953125,
"step": 103
},
{
"epoch": 0.46017699115044247,
"grad_norm": 32.06050491333008,
"learning_rate": 7e-07,
"logits/chosen": -0.51953125,
"logits/rejected": -0.49609375,
"logps/chosen": -376.0,
"logps/rejected": -346.0,
"loss": 0.5581,
"loss/chosen-sft": 1.2265625,
"loss/dpo": 0.55078125,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.35546875,
"rewards/margins": 0.46875,
"rewards/rejected": -0.82421875,
"step": 104
},
{
"epoch": 0.4646017699115044,
"grad_norm": 66.27430725097656,
"learning_rate": 7e-07,
"logits/chosen": -0.578125,
"logits/rejected": -0.53125,
"logps/chosen": -340.0,
"logps/rejected": -284.0,
"loss": 0.5674,
"loss/chosen-sft": 1.2734375,
"loss/dpo": 0.5390625,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.37890625,
"rewards/margins": 0.427734375,
"rewards/rejected": -0.80859375,
"step": 105
},
{
"epoch": 0.4690265486725664,
"grad_norm": 43.450740814208984,
"learning_rate": 7e-07,
"logits/chosen": -0.369140625,
"logits/rejected": -0.443359375,
"logps/chosen": -328.0,
"logps/rejected": -310.0,
"loss": 0.5688,
"loss/chosen-sft": 1.1640625,
"loss/dpo": 0.59765625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.404296875,
"rewards/margins": 0.318359375,
"rewards/rejected": -0.72265625,
"step": 106
},
{
"epoch": 0.47345132743362833,
"grad_norm": 48.25244903564453,
"learning_rate": 7e-07,
"logits/chosen": -0.4765625,
"logits/rejected": -0.43359375,
"logps/chosen": -320.0,
"logps/rejected": -260.0,
"loss": 0.5781,
"loss/chosen-sft": 1.2421875,
"loss/dpo": 0.62890625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5234375,
"rewards/margins": 0.283203125,
"rewards/rejected": -0.8046875,
"step": 107
},
{
"epoch": 0.4778761061946903,
"grad_norm": 26.64389419555664,
"learning_rate": 7e-07,
"logits/chosen": -0.427734375,
"logits/rejected": -0.41796875,
"logps/chosen": -262.0,
"logps/rejected": -240.0,
"loss": 0.564,
"loss/chosen-sft": 1.1015625,
"loss/dpo": 0.625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.357421875,
"rewards/margins": 0.216796875,
"rewards/rejected": -0.57421875,
"step": 108
},
{
"epoch": 0.4823008849557522,
"grad_norm": 51.666202545166016,
"learning_rate": 7e-07,
"logits/chosen": -0.671875,
"logits/rejected": -0.734375,
"logps/chosen": -372.0,
"logps/rejected": -340.0,
"loss": 0.564,
"loss/chosen-sft": 1.3125,
"loss/dpo": 0.5234375,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.35546875,
"rewards/margins": 0.5859375,
"rewards/rejected": -0.94140625,
"step": 109
},
{
"epoch": 0.48672566371681414,
"grad_norm": 14.793038368225098,
"learning_rate": 7e-07,
"logits/chosen": -0.490234375,
"logits/rejected": -0.57421875,
"logps/chosen": -288.0,
"logps/rejected": -262.0,
"loss": 0.564,
"loss/chosen-sft": 1.2890625,
"loss/dpo": 0.63671875,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.53125,
"rewards/margins": 0.232421875,
"rewards/rejected": -0.765625,
"step": 110
},
{
"epoch": 0.4911504424778761,
"grad_norm": 54.62705993652344,
"learning_rate": 7e-07,
"logits/chosen": -0.51171875,
"logits/rejected": -0.59765625,
"logps/chosen": -312.0,
"logps/rejected": -286.0,
"loss": 0.5803,
"loss/chosen-sft": 1.21875,
"loss/dpo": 0.53125,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.466796875,
"rewards/margins": 0.515625,
"rewards/rejected": -0.98046875,
"step": 111
},
{
"epoch": 0.49557522123893805,
"grad_norm": 45.08600997924805,
"learning_rate": 7e-07,
"logits/chosen": -0.53125,
"logits/rejected": -0.65625,
"logps/chosen": -306.0,
"logps/rejected": -314.0,
"loss": 0.5442,
"loss/chosen-sft": 1.1953125,
"loss/dpo": 0.57421875,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.435546875,
"rewards/margins": 0.34375,
"rewards/rejected": -0.78125,
"step": 112
},
{
"epoch": 0.5,
"grad_norm": 29.89005470275879,
"learning_rate": 7e-07,
"logits/chosen": -0.5234375,
"logits/rejected": -0.5078125,
"logps/chosen": -282.0,
"logps/rejected": -242.0,
"loss": 0.5535,
"loss/chosen-sft": 1.1015625,
"loss/dpo": 0.63671875,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.43359375,
"rewards/margins": 0.21484375,
"rewards/rejected": -0.6484375,
"step": 113
},
{
"epoch": 0.504424778761062,
"grad_norm": 5.516697406768799,
"learning_rate": 7e-07,
"logits/chosen": -0.376953125,
"logits/rejected": -0.423828125,
"logps/chosen": -249.0,
"logps/rejected": -298.0,
"loss": 0.5713,
"loss/chosen-sft": 1.125,
"loss/dpo": 0.51953125,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.298828125,
"rewards/margins": 0.48046875,
"rewards/rejected": -0.77734375,
"step": 114
},
{
"epoch": 0.5088495575221239,
"grad_norm": 42.4152946472168,
"learning_rate": 7e-07,
"logits/chosen": -0.5625,
"logits/rejected": -0.48046875,
"logps/chosen": -316.0,
"logps/rejected": -312.0,
"loss": 0.5811,
"loss/chosen-sft": 1.2265625,
"loss/dpo": 0.60546875,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.59765625,
"rewards/margins": 0.29296875,
"rewards/rejected": -0.890625,
"step": 115
},
{
"epoch": 0.5132743362831859,
"grad_norm": 43.45073699951172,
"learning_rate": 7e-07,
"logits/chosen": -0.51171875,
"logits/rejected": -0.49609375,
"logps/chosen": -320.0,
"logps/rejected": -368.0,
"loss": 0.5562,
"loss/chosen-sft": 1.203125,
"loss/dpo": 0.58203125,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.51953125,
"rewards/margins": 0.431640625,
"rewards/rejected": -0.94921875,
"step": 116
},
{
"epoch": 0.5176991150442478,
"grad_norm": 13.291467666625977,
"learning_rate": 7e-07,
"logits/chosen": -0.392578125,
"logits/rejected": -0.375,
"logps/chosen": -300.0,
"logps/rejected": -316.0,
"loss": 0.5491,
"loss/chosen-sft": 1.2734375,
"loss/dpo": 0.58984375,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5078125,
"rewards/margins": 0.3515625,
"rewards/rejected": -0.859375,
"step": 117
},
{
"epoch": 0.5221238938053098,
"grad_norm": 16.5191707611084,
"learning_rate": 7e-07,
"logits/chosen": -0.330078125,
"logits/rejected": -0.240234375,
"logps/chosen": -292.0,
"logps/rejected": -292.0,
"loss": 0.5605,
"loss/chosen-sft": 1.2421875,
"loss/dpo": 0.57421875,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.484375,
"rewards/margins": 0.34375,
"rewards/rejected": -0.828125,
"step": 118
},
{
"epoch": 0.5265486725663717,
"grad_norm": 55.267738342285156,
"learning_rate": 7e-07,
"logits/chosen": -0.58984375,
"logits/rejected": -0.62890625,
"logps/chosen": -316.0,
"logps/rejected": -318.0,
"loss": 0.5439,
"loss/chosen-sft": 1.1484375,
"loss/dpo": 0.53515625,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.412109375,
"rewards/margins": 0.482421875,
"rewards/rejected": -0.89453125,
"step": 119
},
{
"epoch": 0.5309734513274337,
"grad_norm": 52.895042419433594,
"learning_rate": 7e-07,
"logits/chosen": -0.28515625,
"logits/rejected": -0.380859375,
"logps/chosen": -252.0,
"logps/rejected": -278.0,
"loss": 0.5623,
"loss/chosen-sft": 0.96484375,
"loss/dpo": 0.50390625,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.359375,
"rewards/margins": 0.609375,
"rewards/rejected": -0.96875,
"step": 120
},
{
"epoch": 0.5353982300884956,
"grad_norm": 33.5416374206543,
"learning_rate": 7e-07,
"logits/chosen": -0.365234375,
"logits/rejected": -0.45703125,
"logps/chosen": -282.0,
"logps/rejected": -262.0,
"loss": 0.5391,
"loss/chosen-sft": 1.28125,
"loss/dpo": 0.546875,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.4296875,
"rewards/margins": 0.4609375,
"rewards/rejected": -0.890625,
"step": 121
},
{
"epoch": 0.5398230088495575,
"grad_norm": 55.33546447753906,
"learning_rate": 7e-07,
"logits/chosen": -0.53515625,
"logits/rejected": -0.5703125,
"logps/chosen": -334.0,
"logps/rejected": -340.0,
"loss": 0.5278,
"loss/chosen-sft": 1.2578125,
"loss/dpo": 0.53515625,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.45703125,
"rewards/margins": 0.609375,
"rewards/rejected": -1.0703125,
"step": 122
},
{
"epoch": 0.5442477876106194,
"grad_norm": 46.70622253417969,
"learning_rate": 7e-07,
"logits/chosen": -0.53515625,
"logits/rejected": -0.59765625,
"logps/chosen": -308.0,
"logps/rejected": -274.0,
"loss": 0.55,
"loss/chosen-sft": 1.34375,
"loss/dpo": 0.5703125,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5390625,
"rewards/margins": 0.3671875,
"rewards/rejected": -0.90625,
"step": 123
},
{
"epoch": 0.5486725663716814,
"grad_norm": 48.83370590209961,
"learning_rate": 7e-07,
"logits/chosen": -0.5234375,
"logits/rejected": -0.60546875,
"logps/chosen": -322.0,
"logps/rejected": -318.0,
"loss": 0.5825,
"loss/chosen-sft": 1.2734375,
"loss/dpo": 0.59375,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.625,
"rewards/margins": 0.361328125,
"rewards/rejected": -0.984375,
"step": 124
},
{
"epoch": 0.5530973451327433,
"grad_norm": 25.2650089263916,
"learning_rate": 7e-07,
"logits/chosen": -0.3203125,
"logits/rejected": -0.25390625,
"logps/chosen": -244.0,
"logps/rejected": -274.0,
"loss": 0.5889,
"loss/chosen-sft": 1.1484375,
"loss/dpo": 0.55859375,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.439453125,
"rewards/margins": 0.41796875,
"rewards/rejected": -0.859375,
"step": 125
},
{
"epoch": 0.5575221238938053,
"grad_norm": 36.186500549316406,
"learning_rate": 7e-07,
"logits/chosen": -0.58984375,
"logits/rejected": -0.6640625,
"logps/chosen": -324.0,
"logps/rejected": -358.0,
"loss": 0.585,
"loss/chosen-sft": 1.2890625,
"loss/dpo": 0.54296875,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.6484375,
"rewards/margins": 0.50390625,
"rewards/rejected": -1.15625,
"step": 126
},
{
"epoch": 0.5619469026548672,
"grad_norm": 13.623043060302734,
"learning_rate": 7e-07,
"logits/chosen": -0.640625,
"logits/rejected": -0.66796875,
"logps/chosen": -312.0,
"logps/rejected": -332.0,
"loss": 0.5789,
"loss/chosen-sft": 1.2109375,
"loss/dpo": 0.5546875,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.38671875,
"rewards/margins": 0.42578125,
"rewards/rejected": -0.8125,
"step": 127
},
{
"epoch": 0.5663716814159292,
"grad_norm": 9.796255111694336,
"learning_rate": 7e-07,
"logits/chosen": -0.494140625,
"logits/rejected": -0.6015625,
"logps/chosen": -290.0,
"logps/rejected": -346.0,
"loss": 0.5703,
"loss/chosen-sft": 1.3203125,
"loss/dpo": 0.609375,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.65234375,
"rewards/margins": 0.359375,
"rewards/rejected": -1.015625,
"step": 128
},
{
"epoch": 0.5707964601769911,
"grad_norm": 70.11766052246094,
"learning_rate": 7e-07,
"logits/chosen": -0.345703125,
"logits/rejected": -0.353515625,
"logps/chosen": -255.0,
"logps/rejected": -296.0,
"loss": 0.5767,
"loss/chosen-sft": 1.15625,
"loss/dpo": 0.48046875,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.3046875,
"rewards/margins": 0.75,
"rewards/rejected": -1.0546875,
"step": 129
},
{
"epoch": 0.5752212389380531,
"grad_norm": 137.03662109375,
"learning_rate": 7e-07,
"logits/chosen": -0.095703125,
"logits/rejected": -0.38671875,
"logps/chosen": -212.0,
"logps/rejected": -262.0,
"loss": 0.5471,
"loss/chosen-sft": 1.125,
"loss/dpo": 0.515625,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.328125,
"rewards/margins": 0.65234375,
"rewards/rejected": -0.98046875,
"step": 130
},
{
"epoch": 0.5796460176991151,
"grad_norm": 148.0476531982422,
"learning_rate": 7e-07,
"logits/chosen": -0.5625,
"logits/rejected": -0.59375,
"logps/chosen": -270.0,
"logps/rejected": -284.0,
"loss": 0.5403,
"loss/chosen-sft": 1.3046875,
"loss/dpo": 0.6796875,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.54296875,
"rewards/margins": 0.134765625,
"rewards/rejected": -0.6796875,
"step": 131
},
{
"epoch": 0.584070796460177,
"grad_norm": 93.40387725830078,
"learning_rate": 7e-07,
"logits/chosen": -0.7109375,
"logits/rejected": -0.66015625,
"logps/chosen": -340.0,
"logps/rejected": -350.0,
"loss": 0.561,
"loss/chosen-sft": 1.390625,
"loss/dpo": 0.5078125,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.51953125,
"rewards/margins": 0.69921875,
"rewards/rejected": -1.21875,
"step": 132
},
{
"epoch": 0.588495575221239,
"grad_norm": 55.550758361816406,
"learning_rate": 7e-07,
"logits/chosen": -0.42578125,
"logits/rejected": -0.55859375,
"logps/chosen": -294.0,
"logps/rejected": -296.0,
"loss": 0.5627,
"loss/chosen-sft": 1.140625,
"loss/dpo": 0.6015625,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.453125,
"rewards/margins": 0.33203125,
"rewards/rejected": -0.78515625,
"step": 133
},
{
"epoch": 0.5929203539823009,
"grad_norm": 80.4654541015625,
"learning_rate": 7e-07,
"logits/chosen": -0.43359375,
"logits/rejected": -0.470703125,
"logps/chosen": -308.0,
"logps/rejected": -338.0,
"loss": 0.5481,
"loss/chosen-sft": 1.1640625,
"loss/dpo": 0.4765625,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.3984375,
"rewards/margins": 0.67578125,
"rewards/rejected": -1.0703125,
"step": 134
},
{
"epoch": 0.5973451327433629,
"grad_norm": 109.14212036132812,
"learning_rate": 7e-07,
"logits/chosen": -0.470703125,
"logits/rejected": -0.50390625,
"logps/chosen": -334.0,
"logps/rejected": -304.0,
"loss": 0.542,
"loss/chosen-sft": 1.171875,
"loss/dpo": 0.6171875,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.54296875,
"rewards/margins": 0.2578125,
"rewards/rejected": -0.80078125,
"step": 135
},
{
"epoch": 0.6017699115044248,
"grad_norm": 37.90422821044922,
"learning_rate": 7e-07,
"logits/chosen": -0.421875,
"logits/rejected": -0.427734375,
"logps/chosen": -372.0,
"logps/rejected": -298.0,
"loss": 0.5493,
"loss/chosen-sft": 1.3203125,
"loss/dpo": 0.546875,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.50390625,
"rewards/margins": 0.474609375,
"rewards/rejected": -0.9765625,
"step": 136
},
{
"epoch": 0.6061946902654868,
"grad_norm": 118.1666030883789,
"learning_rate": 7e-07,
"logits/chosen": -0.46875,
"logits/rejected": -0.515625,
"logps/chosen": -326.0,
"logps/rejected": -394.0,
"loss": 0.5112,
"loss/chosen-sft": 1.234375,
"loss/dpo": 0.5390625,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.419921875,
"rewards/margins": 0.5703125,
"rewards/rejected": -0.9921875,
"step": 137
},
{
"epoch": 0.6106194690265486,
"grad_norm": 99.32813262939453,
"learning_rate": 7e-07,
"logits/chosen": -0.427734375,
"logits/rejected": -0.50390625,
"logps/chosen": -288.0,
"logps/rejected": -268.0,
"loss": 0.5508,
"loss/chosen-sft": 1.3984375,
"loss/dpo": 0.61328125,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5625,
"rewards/margins": 0.353515625,
"rewards/rejected": -0.9140625,
"step": 138
},
{
"epoch": 0.6150442477876106,
"grad_norm": 17.352619171142578,
"learning_rate": 7e-07,
"logits/chosen": -0.48046875,
"logits/rejected": -0.546875,
"logps/chosen": -282.0,
"logps/rejected": -272.0,
"loss": 0.5352,
"loss/chosen-sft": 1.1640625,
"loss/dpo": 0.51953125,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.466796875,
"rewards/margins": 0.5078125,
"rewards/rejected": -0.97265625,
"step": 139
},
{
"epoch": 0.6194690265486725,
"grad_norm": 59.95145797729492,
"learning_rate": 7e-07,
"logits/chosen": -0.6171875,
"logits/rejected": -0.6875,
"logps/chosen": -302.0,
"logps/rejected": -336.0,
"loss": 0.5435,
"loss/chosen-sft": 1.265625,
"loss/dpo": 0.51953125,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.4765625,
"rewards/margins": 0.55078125,
"rewards/rejected": -1.0234375,
"step": 140
},
{
"epoch": 0.6238938053097345,
"grad_norm": 55.3637580871582,
"learning_rate": 7e-07,
"logits/chosen": -0.51953125,
"logits/rejected": -0.56640625,
"logps/chosen": -342.0,
"logps/rejected": -382.0,
"loss": 0.5469,
"loss/chosen-sft": 1.3046875,
"loss/dpo": 0.55078125,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.5546875,
"rewards/margins": 0.65625,
"rewards/rejected": -1.2109375,
"step": 141
},
{
"epoch": 0.6283185840707964,
"grad_norm": 146.2696075439453,
"learning_rate": 7e-07,
"logits/chosen": -0.55859375,
"logits/rejected": -0.62890625,
"logps/chosen": -352.0,
"logps/rejected": -318.0,
"loss": 0.5806,
"loss/chosen-sft": 1.3515625,
"loss/dpo": 0.7578125,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.703125,
"rewards/margins": 0.07275390625,
"rewards/rejected": -0.77734375,
"step": 142
},
{
"epoch": 0.6327433628318584,
"grad_norm": 46.21394729614258,
"learning_rate": 7e-07,
"logits/chosen": -0.578125,
"logits/rejected": -0.5859375,
"logps/chosen": -268.0,
"logps/rejected": -296.0,
"loss": 0.5393,
"loss/chosen-sft": 1.265625,
"loss/dpo": 0.54296875,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.341796875,
"rewards/margins": 0.48046875,
"rewards/rejected": -0.82421875,
"step": 143
},
{
"epoch": 0.6371681415929203,
"grad_norm": 38.909610748291016,
"learning_rate": 7e-07,
"logits/chosen": -0.60546875,
"logits/rejected": -0.73828125,
"logps/chosen": -320.0,
"logps/rejected": -334.0,
"loss": 0.543,
"loss/chosen-sft": 1.4375,
"loss/dpo": 0.55859375,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.6796875,
"rewards/margins": 0.5234375,
"rewards/rejected": -1.203125,
"step": 144
},
{
"epoch": 0.6415929203539823,
"grad_norm": 137.8043975830078,
"learning_rate": 7e-07,
"logits/chosen": -0.53515625,
"logits/rejected": -0.58203125,
"logps/chosen": -340.0,
"logps/rejected": -290.0,
"loss": 0.5903,
"loss/chosen-sft": 1.328125,
"loss/dpo": 0.65234375,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.7109375,
"rewards/margins": 0.333984375,
"rewards/rejected": -1.046875,
"step": 145
},
{
"epoch": 0.6460176991150443,
"grad_norm": 36.96350860595703,
"learning_rate": 7e-07,
"logits/chosen": -0.6953125,
"logits/rejected": -0.66796875,
"logps/chosen": -398.0,
"logps/rejected": -382.0,
"loss": 0.5288,
"loss/chosen-sft": 1.34375,
"loss/dpo": 0.546875,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.84375,
"rewards/margins": 0.486328125,
"rewards/rejected": -1.328125,
"step": 146
},
{
"epoch": 0.6504424778761062,
"grad_norm": 31.765138626098633,
"learning_rate": 7e-07,
"logits/chosen": -0.6796875,
"logits/rejected": -0.66015625,
"logps/chosen": -354.0,
"logps/rejected": -326.0,
"loss": 0.5535,
"loss/chosen-sft": 1.4375,
"loss/dpo": 0.5078125,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.63671875,
"rewards/margins": 0.5546875,
"rewards/rejected": -1.1953125,
"step": 147
},
{
"epoch": 0.6548672566371682,
"grad_norm": 73.44290924072266,
"learning_rate": 7e-07,
"logits/chosen": -0.609375,
"logits/rejected": -0.69921875,
"logps/chosen": -356.0,
"logps/rejected": -350.0,
"loss": 0.5137,
"loss/chosen-sft": 1.3046875,
"loss/dpo": 0.474609375,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5234375,
"rewards/margins": 0.67578125,
"rewards/rejected": -1.203125,
"step": 148
},
{
"epoch": 0.6592920353982301,
"grad_norm": 14.243433952331543,
"learning_rate": 7e-07,
"logits/chosen": -0.50390625,
"logits/rejected": -0.55078125,
"logps/chosen": -326.0,
"logps/rejected": -316.0,
"loss": 0.5225,
"loss/chosen-sft": 1.2578125,
"loss/dpo": 0.48046875,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.61328125,
"rewards/margins": 0.765625,
"rewards/rejected": -1.3828125,
"step": 149
},
{
"epoch": 0.6637168141592921,
"grad_norm": 21.770702362060547,
"learning_rate": 7e-07,
"logits/chosen": -0.55078125,
"logits/rejected": -0.546875,
"logps/chosen": -326.0,
"logps/rejected": -330.0,
"loss": 0.5234,
"loss/chosen-sft": 1.296875,
"loss/dpo": 0.5,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.4609375,
"rewards/margins": 0.60546875,
"rewards/rejected": -1.0703125,
"step": 150
},
{
"epoch": 0.668141592920354,
"grad_norm": 77.819091796875,
"learning_rate": 7e-07,
"logits/chosen": -0.7109375,
"logits/rejected": -0.6171875,
"logps/chosen": -404.0,
"logps/rejected": -442.0,
"loss": 0.5,
"loss/chosen-sft": 1.3125,
"loss/dpo": 0.5078125,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.7734375,
"rewards/margins": 0.6875,
"rewards/rejected": -1.4609375,
"step": 151
},
{
"epoch": 0.672566371681416,
"grad_norm": 37.56740951538086,
"learning_rate": 7e-07,
"logits/chosen": -0.3828125,
"logits/rejected": -0.2275390625,
"logps/chosen": -294.0,
"logps/rejected": -324.0,
"loss": 0.5325,
"loss/chosen-sft": 1.3984375,
"loss/dpo": 0.59375,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.73828125,
"rewards/margins": 0.52734375,
"rewards/rejected": -1.265625,
"step": 152
},
{
"epoch": 0.6769911504424779,
"grad_norm": 105.4240951538086,
"learning_rate": 7e-07,
"logits/chosen": -0.423828125,
"logits/rejected": -0.54296875,
"logps/chosen": -368.0,
"logps/rejected": -390.0,
"loss": 0.5544,
"loss/chosen-sft": 1.28125,
"loss/dpo": 0.46875,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.77734375,
"rewards/margins": 0.7734375,
"rewards/rejected": -1.546875,
"step": 153
},
{
"epoch": 0.6814159292035398,
"grad_norm": 56.64170837402344,
"learning_rate": 7e-07,
"logits/chosen": -0.5390625,
"logits/rejected": -0.6640625,
"logps/chosen": -358.0,
"logps/rejected": -320.0,
"loss": 0.5625,
"loss/chosen-sft": 1.3984375,
"loss/dpo": 0.5859375,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7578125,
"rewards/margins": 0.453125,
"rewards/rejected": -1.2109375,
"step": 154
},
{
"epoch": 0.6858407079646017,
"grad_norm": 22.23441505432129,
"learning_rate": 7e-07,
"logits/chosen": -0.61328125,
"logits/rejected": -0.5703125,
"logps/chosen": -336.0,
"logps/rejected": -314.0,
"loss": 0.5115,
"loss/chosen-sft": 1.453125,
"loss/dpo": 0.52734375,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.58984375,
"rewards/margins": 0.5625,
"rewards/rejected": -1.15625,
"step": 155
},
{
"epoch": 0.6902654867256637,
"grad_norm": 71.9916000366211,
"learning_rate": 7e-07,
"logits/chosen": -0.69921875,
"logits/rejected": -0.7109375,
"logps/chosen": -354.0,
"logps/rejected": -338.0,
"loss": 0.5227,
"loss/chosen-sft": 1.40625,
"loss/dpo": 0.5546875,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.59375,
"rewards/margins": 0.6171875,
"rewards/rejected": -1.2109375,
"step": 156
},
{
"epoch": 0.6946902654867256,
"grad_norm": 11.088499069213867,
"learning_rate": 7e-07,
"logits/chosen": -0.48046875,
"logits/rejected": -0.51171875,
"logps/chosen": -328.0,
"logps/rejected": -358.0,
"loss": 0.5259,
"loss/chosen-sft": 1.328125,
"loss/dpo": 0.60546875,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.84765625,
"rewards/margins": 0.4609375,
"rewards/rejected": -1.3125,
"step": 157
},
{
"epoch": 0.6991150442477876,
"grad_norm": 104.77384185791016,
"learning_rate": 7e-07,
"logits/chosen": -0.55859375,
"logits/rejected": -0.7265625,
"logps/chosen": -330.0,
"logps/rejected": -340.0,
"loss": 0.5566,
"loss/chosen-sft": 1.4296875,
"loss/dpo": 0.61328125,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.640625,
"rewards/margins": 0.5390625,
"rewards/rejected": -1.1796875,
"step": 158
},
{
"epoch": 0.7035398230088495,
"grad_norm": 87.36003875732422,
"learning_rate": 7e-07,
"logits/chosen": -0.53515625,
"logits/rejected": -0.55078125,
"logps/chosen": -296.0,
"logps/rejected": -342.0,
"loss": 0.5449,
"loss/chosen-sft": 1.359375,
"loss/dpo": 0.58203125,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8125,
"rewards/margins": 0.65625,
"rewards/rejected": -1.46875,
"step": 159
},
{
"epoch": 0.7079646017699115,
"grad_norm": 37.620750427246094,
"learning_rate": 7e-07,
"logits/chosen": -0.455078125,
"logits/rejected": -0.50390625,
"logps/chosen": -376.0,
"logps/rejected": -444.0,
"loss": 0.5303,
"loss/chosen-sft": 1.265625,
"loss/dpo": 0.54296875,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.87890625,
"rewards/margins": 0.62109375,
"rewards/rejected": -1.5,
"step": 160
},
{
"epoch": 0.7123893805309734,
"grad_norm": 75.54209899902344,
"learning_rate": 7e-07,
"logits/chosen": -0.546875,
"logits/rejected": -0.64453125,
"logps/chosen": -348.0,
"logps/rejected": -368.0,
"loss": 0.5203,
"loss/chosen-sft": 1.3125,
"loss/dpo": 0.54296875,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.86328125,
"rewards/margins": 0.4921875,
"rewards/rejected": -1.359375,
"step": 161
},
{
"epoch": 0.7168141592920354,
"grad_norm": 54.434139251708984,
"learning_rate": 7e-07,
"logits/chosen": -0.60546875,
"logits/rejected": -0.74609375,
"logps/chosen": -298.0,
"logps/rejected": -352.0,
"loss": 0.511,
"loss/chosen-sft": 1.2109375,
"loss/dpo": 0.466796875,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.58984375,
"rewards/margins": 0.67578125,
"rewards/rejected": -1.265625,
"step": 162
},
{
"epoch": 0.7212389380530974,
"grad_norm": 10.78385066986084,
"learning_rate": 7e-07,
"logits/chosen": -0.6875,
"logits/rejected": -0.5859375,
"logps/chosen": -288.0,
"logps/rejected": -306.0,
"loss": 0.5369,
"loss/chosen-sft": 1.2734375,
"loss/dpo": 0.51953125,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.72265625,
"rewards/margins": 0.65234375,
"rewards/rejected": -1.375,
"step": 163
},
{
"epoch": 0.7256637168141593,
"grad_norm": 46.15651321411133,
"learning_rate": 7e-07,
"logits/chosen": -0.70703125,
"logits/rejected": -0.765625,
"logps/chosen": -364.0,
"logps/rejected": -338.0,
"loss": 0.533,
"loss/chosen-sft": 1.3828125,
"loss/dpo": 0.50390625,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.5703125,
"rewards/margins": 0.6015625,
"rewards/rejected": -1.171875,
"step": 164
},
{
"epoch": 0.7300884955752213,
"grad_norm": 76.59629821777344,
"learning_rate": 7e-07,
"logits/chosen": -0.57421875,
"logits/rejected": -0.64453125,
"logps/chosen": -258.0,
"logps/rejected": -324.0,
"loss": 0.5183,
"loss/chosen-sft": 1.1953125,
"loss/dpo": 0.52734375,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.58203125,
"rewards/margins": 0.5390625,
"rewards/rejected": -1.1171875,
"step": 165
},
{
"epoch": 0.7345132743362832,
"grad_norm": 73.99260711669922,
"learning_rate": 7e-07,
"logits/chosen": -0.390625,
"logits/rejected": -0.578125,
"logps/chosen": -316.0,
"logps/rejected": -372.0,
"loss": 0.5208,
"loss/chosen-sft": 1.2109375,
"loss/dpo": 0.5,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.8359375,
"rewards/margins": 1.0,
"rewards/rejected": -1.8359375,
"step": 166
},
{
"epoch": 0.7389380530973452,
"grad_norm": 47.95753479003906,
"learning_rate": 7e-07,
"logits/chosen": -0.5625,
"logits/rejected": -0.72265625,
"logps/chosen": -356.0,
"logps/rejected": -354.0,
"loss": 0.4956,
"loss/chosen-sft": 1.2890625,
"loss/dpo": 0.6171875,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.66015625,
"rewards/margins": 0.455078125,
"rewards/rejected": -1.1171875,
"step": 167
},
{
"epoch": 0.7433628318584071,
"grad_norm": 27.858415603637695,
"learning_rate": 7e-07,
"logits/chosen": -0.7109375,
"logits/rejected": -0.64453125,
"logps/chosen": -336.0,
"logps/rejected": -332.0,
"loss": 0.4971,
"loss/chosen-sft": 1.4375,
"loss/dpo": 0.59375,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.74609375,
"rewards/margins": 0.3515625,
"rewards/rejected": -1.09375,
"step": 168
},
{
"epoch": 0.7477876106194691,
"grad_norm": 113.69762420654297,
"learning_rate": 7e-07,
"logits/chosen": -0.79296875,
"logits/rejected": -0.79296875,
"logps/chosen": -420.0,
"logps/rejected": -446.0,
"loss": 0.4646,
"loss/chosen-sft": 1.4453125,
"loss/dpo": 0.439453125,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.90625,
"rewards/margins": 0.92578125,
"rewards/rejected": -1.828125,
"step": 169
},
{
"epoch": 0.7522123893805309,
"grad_norm": 92.540283203125,
"learning_rate": 7e-07,
"logits/chosen": -0.53125,
"logits/rejected": -0.60546875,
"logps/chosen": -356.0,
"logps/rejected": -412.0,
"loss": 0.5298,
"loss/chosen-sft": 1.3828125,
"loss/dpo": 0.51171875,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9453125,
"rewards/margins": 0.64453125,
"rewards/rejected": -1.5859375,
"step": 170
},
{
"epoch": 0.7566371681415929,
"grad_norm": 34.24614334106445,
"learning_rate": 7e-07,
"logits/chosen": -0.515625,
"logits/rejected": -0.62109375,
"logps/chosen": -336.0,
"logps/rejected": -426.0,
"loss": 0.4983,
"loss/chosen-sft": 1.28125,
"loss/dpo": 0.40625,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.7734375,
"rewards/margins": 1.1328125,
"rewards/rejected": -1.90625,
"step": 171
},
{
"epoch": 0.7610619469026548,
"grad_norm": 26.35588264465332,
"learning_rate": 7e-07,
"logits/chosen": -0.7734375,
"logits/rejected": -0.79296875,
"logps/chosen": -358.0,
"logps/rejected": -358.0,
"loss": 0.52,
"loss/chosen-sft": 1.4453125,
"loss/dpo": 0.435546875,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.6875,
"rewards/margins": 0.92578125,
"rewards/rejected": -1.6171875,
"step": 172
},
{
"epoch": 0.7654867256637168,
"grad_norm": 70.59893798828125,
"learning_rate": 7e-07,
"logits/chosen": -0.578125,
"logits/rejected": -0.63671875,
"logps/chosen": -336.0,
"logps/rejected": -406.0,
"loss": 0.4802,
"loss/chosen-sft": 1.4609375,
"loss/dpo": 0.451171875,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.83984375,
"rewards/margins": 1.078125,
"rewards/rejected": -1.921875,
"step": 173
},
{
"epoch": 0.7699115044247787,
"grad_norm": 35.492210388183594,
"learning_rate": 7e-07,
"logits/chosen": -0.7109375,
"logits/rejected": -0.63671875,
"logps/chosen": -326.0,
"logps/rejected": -396.0,
"loss": 0.4731,
"loss/chosen-sft": 1.4140625,
"loss/dpo": 0.490234375,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.69140625,
"rewards/margins": 1.109375,
"rewards/rejected": -1.8046875,
"step": 174
},
{
"epoch": 0.7743362831858407,
"grad_norm": 52.300148010253906,
"learning_rate": 7e-07,
"logits/chosen": -0.294921875,
"logits/rejected": -0.431640625,
"logps/chosen": -298.0,
"logps/rejected": -330.0,
"loss": 0.5232,
"loss/chosen-sft": 1.3984375,
"loss/dpo": 0.6796875,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9453125,
"rewards/margins": 0.330078125,
"rewards/rejected": -1.2734375,
"step": 175
},
{
"epoch": 0.7787610619469026,
"grad_norm": 133.416748046875,
"learning_rate": 7e-07,
"logits/chosen": -0.59375,
"logits/rejected": -0.50390625,
"logps/chosen": -330.0,
"logps/rejected": -342.0,
"loss": 0.499,
"loss/chosen-sft": 1.6796875,
"loss/dpo": 0.609375,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8828125,
"rewards/margins": 0.33203125,
"rewards/rejected": -1.21875,
"step": 176
},
{
"epoch": 0.7831858407079646,
"grad_norm": 23.086772918701172,
"learning_rate": 7e-07,
"logits/chosen": -0.5625,
"logits/rejected": -0.50390625,
"logps/chosen": -376.0,
"logps/rejected": -506.0,
"loss": 0.4585,
"loss/chosen-sft": 1.421875,
"loss/dpo": 0.392578125,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.84375,
"rewards/margins": 1.34375,
"rewards/rejected": -2.1875,
"step": 177
},
{
"epoch": 0.7876106194690266,
"grad_norm": 206.20298767089844,
"learning_rate": 7e-07,
"logits/chosen": -0.53125,
"logits/rejected": -0.470703125,
"logps/chosen": -276.0,
"logps/rejected": -296.0,
"loss": 0.5459,
"loss/chosen-sft": 1.2890625,
"loss/dpo": 0.5625,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.62890625,
"rewards/margins": 0.58203125,
"rewards/rejected": -1.2109375,
"step": 178
},
{
"epoch": 0.7920353982300885,
"grad_norm": 138.48703002929688,
"learning_rate": 7e-07,
"logits/chosen": -0.4375,
"logits/rejected": -0.51171875,
"logps/chosen": -294.0,
"logps/rejected": -364.0,
"loss": 0.4871,
"loss/chosen-sft": 1.328125,
"loss/dpo": 0.470703125,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.76171875,
"rewards/margins": 0.7109375,
"rewards/rejected": -1.46875,
"step": 179
},
{
"epoch": 0.7964601769911505,
"grad_norm": 7.023651599884033,
"learning_rate": 7e-07,
"logits/chosen": -0.65625,
"logits/rejected": -0.8359375,
"logps/chosen": -338.0,
"logps/rejected": -370.0,
"loss": 0.4651,
"loss/chosen-sft": 1.453125,
"loss/dpo": 0.431640625,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.8203125,
"rewards/margins": 1.015625,
"rewards/rejected": -1.828125,
"step": 180
},
{
"epoch": 0.8008849557522124,
"grad_norm": 101.0399169921875,
"learning_rate": 7e-07,
"logits/chosen": -0.51953125,
"logits/rejected": -0.578125,
"logps/chosen": -418.0,
"logps/rejected": -480.0,
"loss": 0.4773,
"loss/chosen-sft": 1.5,
"loss/dpo": 0.431640625,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0625,
"rewards/margins": 1.21875,
"rewards/rejected": -2.28125,
"step": 181
},
{
"epoch": 0.8053097345132744,
"grad_norm": 42.745174407958984,
"learning_rate": 7e-07,
"logits/chosen": -0.6484375,
"logits/rejected": -0.5859375,
"logps/chosen": -376.0,
"logps/rejected": -418.0,
"loss": 0.4871,
"loss/chosen-sft": 1.5625,
"loss/dpo": 0.45703125,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.98828125,
"rewards/margins": 1.046875,
"rewards/rejected": -2.03125,
"step": 182
},
{
"epoch": 0.8097345132743363,
"grad_norm": 65.14553833007812,
"learning_rate": 7e-07,
"logits/chosen": -0.6484375,
"logits/rejected": -0.6953125,
"logps/chosen": -298.0,
"logps/rejected": -312.0,
"loss": 0.4883,
"loss/chosen-sft": 1.4921875,
"loss/dpo": 0.5234375,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.8828125,
"rewards/margins": 0.640625,
"rewards/rejected": -1.5234375,
"step": 183
},
{
"epoch": 0.8141592920353983,
"grad_norm": 306.6606750488281,
"learning_rate": 7e-07,
"logits/chosen": -0.451171875,
"logits/rejected": -0.5703125,
"logps/chosen": -320.0,
"logps/rejected": -420.0,
"loss": 0.5232,
"loss/chosen-sft": 1.3203125,
"loss/dpo": 0.482421875,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.79296875,
"rewards/margins": 1.2421875,
"rewards/rejected": -2.046875,
"step": 184
},
{
"epoch": 0.8185840707964602,
"grad_norm": 20.622095108032227,
"learning_rate": 7e-07,
"logits/chosen": -0.6171875,
"logits/rejected": -0.72265625,
"logps/chosen": -368.0,
"logps/rejected": -396.0,
"loss": 0.5178,
"loss/chosen-sft": 1.59375,
"loss/dpo": 0.490234375,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.76953125,
"rewards/margins": 0.90625,
"rewards/rejected": -1.671875,
"step": 185
},
{
"epoch": 0.8230088495575221,
"grad_norm": 15.8814115524292,
"learning_rate": 7e-07,
"logits/chosen": -0.640625,
"logits/rejected": -0.81640625,
"logps/chosen": -350.0,
"logps/rejected": -382.0,
"loss": 0.5017,
"loss/chosen-sft": 1.484375,
"loss/dpo": 0.53515625,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.83203125,
"rewards/margins": 0.8125,
"rewards/rejected": -1.6484375,
"step": 186
},
{
"epoch": 0.827433628318584,
"grad_norm": 45.224002838134766,
"learning_rate": 7e-07,
"logits/chosen": -0.478515625,
"logits/rejected": -0.404296875,
"logps/chosen": -294.0,
"logps/rejected": -386.0,
"loss": 0.5002,
"loss/chosen-sft": 1.3046875,
"loss/dpo": 0.453125,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.72265625,
"rewards/margins": 0.83984375,
"rewards/rejected": -1.5625,
"step": 187
},
{
"epoch": 0.831858407079646,
"grad_norm": 32.42481994628906,
"learning_rate": 7e-07,
"logits/chosen": -0.38671875,
"logits/rejected": -0.43359375,
"logps/chosen": -336.0,
"logps/rejected": -340.0,
"loss": 0.5051,
"loss/chosen-sft": 1.3359375,
"loss/dpo": 0.5390625,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.84765625,
"rewards/margins": 0.5703125,
"rewards/rejected": -1.4140625,
"step": 188
},
{
"epoch": 0.8362831858407079,
"grad_norm": 54.047035217285156,
"learning_rate": 7e-07,
"logits/chosen": -0.5234375,
"logits/rejected": -0.37890625,
"logps/chosen": -290.0,
"logps/rejected": -366.0,
"loss": 0.4817,
"loss/chosen-sft": 1.3671875,
"loss/dpo": 0.54296875,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.90234375,
"rewards/margins": 0.8125,
"rewards/rejected": -1.71875,
"step": 189
},
{
"epoch": 0.8407079646017699,
"grad_norm": 10.179818153381348,
"learning_rate": 7e-07,
"logits/chosen": -0.41796875,
"logits/rejected": -0.4296875,
"logps/chosen": -296.0,
"logps/rejected": -302.0,
"loss": 0.5049,
"loss/chosen-sft": 1.3984375,
"loss/dpo": 0.55859375,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.796875,
"rewards/margins": 0.482421875,
"rewards/rejected": -1.28125,
"step": 190
},
{
"epoch": 0.8451327433628318,
"grad_norm": 63.52503204345703,
"learning_rate": 7e-07,
"logits/chosen": -0.76953125,
"logits/rejected": -0.69921875,
"logps/chosen": -356.0,
"logps/rejected": -416.0,
"loss": 0.5083,
"loss/chosen-sft": 1.46875,
"loss/dpo": 0.486328125,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.93359375,
"rewards/margins": 0.859375,
"rewards/rejected": -1.7890625,
"step": 191
},
{
"epoch": 0.8495575221238938,
"grad_norm": 64.59293365478516,
"learning_rate": 7e-07,
"logits/chosen": -0.4609375,
"logits/rejected": -0.474609375,
"logps/chosen": -378.0,
"logps/rejected": -392.0,
"loss": 0.4697,
"loss/chosen-sft": 1.3203125,
"loss/dpo": 0.47265625,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.80078125,
"rewards/margins": 0.87109375,
"rewards/rejected": -1.671875,
"step": 192
},
{
"epoch": 0.8539823008849557,
"grad_norm": 48.203311920166016,
"learning_rate": 7e-07,
"logits/chosen": -0.3046875,
"logits/rejected": -0.31640625,
"logps/chosen": -308.0,
"logps/rejected": -390.0,
"loss": 0.5132,
"loss/chosen-sft": 1.171875,
"loss/dpo": 0.4375,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.6796875,
"rewards/margins": 0.9140625,
"rewards/rejected": -1.59375,
"step": 193
},
{
"epoch": 0.8584070796460177,
"grad_norm": 86.9441909790039,
"learning_rate": 7e-07,
"logits/chosen": -0.435546875,
"logits/rejected": -0.4609375,
"logps/chosen": -296.0,
"logps/rejected": -358.0,
"loss": 0.4749,
"loss/chosen-sft": 1.2734375,
"loss/dpo": 0.44140625,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.6484375,
"rewards/margins": 0.94921875,
"rewards/rejected": -1.59375,
"step": 194
},
{
"epoch": 0.8628318584070797,
"grad_norm": 108.06549835205078,
"learning_rate": 7e-07,
"logits/chosen": -0.4453125,
"logits/rejected": -0.5234375,
"logps/chosen": -342.0,
"logps/rejected": -404.0,
"loss": 0.4771,
"loss/chosen-sft": 1.4453125,
"loss/dpo": 0.65234375,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9921875,
"rewards/margins": 0.64453125,
"rewards/rejected": -1.640625,
"step": 195
},
{
"epoch": 0.8672566371681416,
"grad_norm": 173.70831298828125,
"learning_rate": 7e-07,
"logits/chosen": -0.58984375,
"logits/rejected": -0.62890625,
"logps/chosen": -368.0,
"logps/rejected": -400.0,
"loss": 0.4434,
"loss/chosen-sft": 1.46875,
"loss/dpo": 0.44140625,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9453125,
"rewards/margins": 1.046875,
"rewards/rejected": -1.984375,
"step": 196
},
{
"epoch": 0.8716814159292036,
"grad_norm": 41.173553466796875,
"learning_rate": 7e-07,
"logits/chosen": -0.60546875,
"logits/rejected": -0.671875,
"logps/chosen": -392.0,
"logps/rejected": -470.0,
"loss": 0.4729,
"loss/chosen-sft": 1.5,
"loss/dpo": 0.44140625,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.140625,
"rewards/margins": 1.1015625,
"rewards/rejected": -2.25,
"step": 197
},
{
"epoch": 0.8761061946902655,
"grad_norm": 14.614328384399414,
"learning_rate": 7e-07,
"logits/chosen": -0.68359375,
"logits/rejected": -0.76171875,
"logps/chosen": -360.0,
"logps/rejected": -384.0,
"loss": 0.5215,
"loss/chosen-sft": 1.484375,
"loss/dpo": 0.48828125,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.046875,
"rewards/margins": 0.91015625,
"rewards/rejected": -1.953125,
"step": 198
},
{
"epoch": 0.8805309734513275,
"grad_norm": 86.90143585205078,
"learning_rate": 7e-07,
"logits/chosen": -0.8046875,
"logits/rejected": -0.81640625,
"logps/chosen": -454.0,
"logps/rejected": -482.0,
"loss": 0.541,
"loss/chosen-sft": 1.7578125,
"loss/dpo": 0.55859375,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.3828125,
"rewards/margins": 1.0625,
"rewards/rejected": -2.4375,
"step": 199
},
{
"epoch": 0.8849557522123894,
"grad_norm": 24.392606735229492,
"learning_rate": 7e-07,
"logits/chosen": -0.6171875,
"logits/rejected": -0.73828125,
"logps/chosen": -326.0,
"logps/rejected": -384.0,
"loss": 0.4583,
"loss/chosen-sft": 1.59375,
"loss/dpo": 0.50390625,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.953125,
"rewards/margins": 0.99609375,
"rewards/rejected": -1.953125,
"step": 200
},
{
"epoch": 0.8893805309734514,
"grad_norm": 67.55127716064453,
"learning_rate": 7e-07,
"logits/chosen": -0.5078125,
"logits/rejected": -0.5546875,
"logps/chosen": -408.0,
"logps/rejected": -484.0,
"loss": 0.5017,
"loss/chosen-sft": 1.5625,
"loss/dpo": 0.494140625,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1328125,
"rewards/margins": 1.25,
"rewards/rejected": -2.375,
"step": 201
},
{
"epoch": 0.8938053097345132,
"grad_norm": 35.01057434082031,
"learning_rate": 7e-07,
"logits/chosen": -0.69921875,
"logits/rejected": -0.640625,
"logps/chosen": -336.0,
"logps/rejected": -458.0,
"loss": 0.5005,
"loss/chosen-sft": 1.4609375,
"loss/dpo": 0.478515625,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.75390625,
"rewards/margins": 1.3046875,
"rewards/rejected": -2.0625,
"step": 202
},
{
"epoch": 0.8982300884955752,
"grad_norm": 24.339378356933594,
"learning_rate": 7e-07,
"logits/chosen": -0.408203125,
"logits/rejected": -0.435546875,
"logps/chosen": -310.0,
"logps/rejected": -408.0,
"loss": 0.4685,
"loss/chosen-sft": 1.28125,
"loss/dpo": 0.455078125,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.73828125,
"rewards/margins": 1.234375,
"rewards/rejected": -1.9765625,
"step": 203
},
{
"epoch": 0.9026548672566371,
"grad_norm": 135.99609375,
"learning_rate": 7e-07,
"logits/chosen": -0.498046875,
"logits/rejected": -0.5625,
"logps/chosen": -314.0,
"logps/rejected": -396.0,
"loss": 0.4561,
"loss/chosen-sft": 1.3125,
"loss/dpo": 0.47265625,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.62109375,
"rewards/margins": 0.9140625,
"rewards/rejected": -1.53125,
"step": 204
},
{
"epoch": 0.9070796460176991,
"grad_norm": 137.96900939941406,
"learning_rate": 7e-07,
"logits/chosen": -0.50390625,
"logits/rejected": -0.7109375,
"logps/chosen": -330.0,
"logps/rejected": -432.0,
"loss": 0.5061,
"loss/chosen-sft": 1.3359375,
"loss/dpo": 0.470703125,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.7109375,
"rewards/margins": 0.953125,
"rewards/rejected": -1.6640625,
"step": 205
},
{
"epoch": 0.911504424778761,
"grad_norm": 165.43182373046875,
"learning_rate": 7e-07,
"logits/chosen": -0.4609375,
"logits/rejected": -0.3828125,
"logps/chosen": -260.0,
"logps/rejected": -288.0,
"loss": 0.5105,
"loss/chosen-sft": 1.265625,
"loss/dpo": 0.5625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.609375,
"rewards/margins": 0.494140625,
"rewards/rejected": -1.109375,
"step": 206
},
{
"epoch": 0.915929203539823,
"grad_norm": 216.99935913085938,
"learning_rate": 7e-07,
"logits/chosen": -0.43359375,
"logits/rejected": -0.59375,
"logps/chosen": -308.0,
"logps/rejected": -344.0,
"loss": 0.4929,
"loss/chosen-sft": 1.3125,
"loss/dpo": 0.466796875,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.57421875,
"rewards/margins": 0.75390625,
"rewards/rejected": -1.328125,
"step": 207
},
{
"epoch": 0.9203539823008849,
"grad_norm": 51.624351501464844,
"learning_rate": 7e-07,
"logits/chosen": -0.5234375,
"logits/rejected": -0.6328125,
"logps/chosen": -304.0,
"logps/rejected": -286.0,
"loss": 0.5669,
"loss/chosen-sft": 1.4140625,
"loss/dpo": 0.59375,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.60546875,
"rewards/margins": 0.314453125,
"rewards/rejected": -0.91796875,
"step": 208
},
{
"epoch": 0.9247787610619469,
"grad_norm": 29.490407943725586,
"learning_rate": 7e-07,
"logits/chosen": -0.55859375,
"logits/rejected": -0.5703125,
"logps/chosen": -366.0,
"logps/rejected": -350.0,
"loss": 0.5042,
"loss/chosen-sft": 1.234375,
"loss/dpo": 0.46484375,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.451171875,
"rewards/margins": 0.796875,
"rewards/rejected": -1.25,
"step": 209
},
{
"epoch": 0.9292035398230089,
"grad_norm": 159.65997314453125,
"learning_rate": 7e-07,
"logits/chosen": -0.67578125,
"logits/rejected": -0.55859375,
"logps/chosen": -340.0,
"logps/rejected": -346.0,
"loss": 0.5374,
"loss/chosen-sft": 1.3125,
"loss/dpo": 0.474609375,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6328125,
"rewards/margins": 0.83984375,
"rewards/rejected": -1.46875,
"step": 210
},
{
"epoch": 0.9336283185840708,
"grad_norm": 22.276451110839844,
"learning_rate": 7e-07,
"logits/chosen": -0.41015625,
"logits/rejected": -0.46875,
"logps/chosen": -308.0,
"logps/rejected": -324.0,
"loss": 0.5066,
"loss/chosen-sft": 1.359375,
"loss/dpo": 0.57421875,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.78515625,
"rewards/margins": 0.53515625,
"rewards/rejected": -1.3203125,
"step": 211
},
{
"epoch": 0.9380530973451328,
"grad_norm": 62.60710525512695,
"learning_rate": 7e-07,
"logits/chosen": -0.68359375,
"logits/rejected": -0.76171875,
"logps/chosen": -378.0,
"logps/rejected": -396.0,
"loss": 0.4917,
"loss/chosen-sft": 1.5,
"loss/dpo": 0.578125,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.94921875,
"rewards/margins": 0.8984375,
"rewards/rejected": -1.8515625,
"step": 212
},
{
"epoch": 0.9424778761061947,
"grad_norm": 183.3363037109375,
"learning_rate": 7e-07,
"logits/chosen": -0.62109375,
"logits/rejected": -0.71875,
"logps/chosen": -328.0,
"logps/rejected": -326.0,
"loss": 0.426,
"loss/chosen-sft": 1.375,
"loss/dpo": 0.47265625,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.80078125,
"rewards/margins": 0.8046875,
"rewards/rejected": -1.609375,
"step": 213
},
{
"epoch": 0.9469026548672567,
"grad_norm": 182.4455108642578,
"learning_rate": 7e-07,
"logits/chosen": -0.62890625,
"logits/rejected": -0.57421875,
"logps/chosen": -358.0,
"logps/rejected": -480.0,
"loss": 0.5305,
"loss/chosen-sft": 1.4765625,
"loss/dpo": 0.5234375,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.80078125,
"rewards/margins": 1.609375,
"rewards/rejected": -2.40625,
"step": 214
},
{
"epoch": 0.9513274336283186,
"grad_norm": 112.33076477050781,
"learning_rate": 7e-07,
"logits/chosen": -0.52734375,
"logits/rejected": -0.58203125,
"logps/chosen": -344.0,
"logps/rejected": -448.0,
"loss": 0.4475,
"loss/chosen-sft": 1.3359375,
"loss/dpo": 0.3359375,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.61328125,
"rewards/margins": 1.4453125,
"rewards/rejected": -2.0625,
"step": 215
},
{
"epoch": 0.9557522123893806,
"grad_norm": 125.87716674804688,
"learning_rate": 7e-07,
"logits/chosen": -0.4296875,
"logits/rejected": -0.47265625,
"logps/chosen": -314.0,
"logps/rejected": -462.0,
"loss": 0.4885,
"loss/chosen-sft": 1.5078125,
"loss/dpo": 0.365234375,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.84375,
"rewards/margins": 1.5546875,
"rewards/rejected": -2.390625,
"step": 216
},
{
"epoch": 0.9601769911504425,
"grad_norm": 159.3242950439453,
"learning_rate": 7e-07,
"logits/chosen": -0.53515625,
"logits/rejected": -0.53125,
"logps/chosen": -330.0,
"logps/rejected": -398.0,
"loss": 0.4597,
"loss/chosen-sft": 1.34375,
"loss/dpo": 0.4140625,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.765625,
"rewards/margins": 1.171875,
"rewards/rejected": -1.9453125,
"step": 217
},
{
"epoch": 0.9646017699115044,
"grad_norm": 36.168601989746094,
"learning_rate": 7e-07,
"logits/chosen": -0.451171875,
"logits/rejected": -0.44140625,
"logps/chosen": -326.0,
"logps/rejected": -366.0,
"loss": 0.4858,
"loss/chosen-sft": 1.3203125,
"loss/dpo": 0.52734375,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.953125,
"rewards/margins": 0.94921875,
"rewards/rejected": -1.8984375,
"step": 218
},
{
"epoch": 0.9690265486725663,
"grad_norm": 19.605365753173828,
"learning_rate": 7e-07,
"logits/chosen": -0.4921875,
"logits/rejected": -0.55859375,
"logps/chosen": -284.0,
"logps/rejected": -370.0,
"loss": 0.4822,
"loss/chosen-sft": 1.3046875,
"loss/dpo": 0.40234375,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.90234375,
"rewards/margins": 1.046875,
"rewards/rejected": -1.9453125,
"step": 219
},
{
"epoch": 0.9734513274336283,
"grad_norm": 143.3219451904297,
"learning_rate": 7e-07,
"logits/chosen": -0.37890625,
"logits/rejected": -0.4765625,
"logps/chosen": -366.0,
"logps/rejected": -510.0,
"loss": 0.509,
"loss/chosen-sft": 1.515625,
"loss/dpo": 0.54296875,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2734375,
"rewards/margins": 1.03125,
"rewards/rejected": -2.296875,
"step": 220
},
{
"epoch": 0.9778761061946902,
"grad_norm": 268.8928527832031,
"learning_rate": 7e-07,
"logits/chosen": -0.392578125,
"logits/rejected": -0.30859375,
"logps/chosen": -249.0,
"logps/rejected": -414.0,
"loss": 0.5212,
"loss/chosen-sft": 1.3984375,
"loss/dpo": 0.404296875,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.78125,
"rewards/margins": 1.53125,
"rewards/rejected": -2.3125,
"step": 221
},
{
"epoch": 0.9823008849557522,
"grad_norm": 295.1648864746094,
"learning_rate": 7e-07,
"logits/chosen": -0.56640625,
"logits/rejected": -0.6328125,
"logps/chosen": -368.0,
"logps/rejected": -496.0,
"loss": 0.5183,
"loss/chosen-sft": 1.5,
"loss/dpo": 0.5,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.265625,
"rewards/margins": 0.96875,
"rewards/rejected": -2.234375,
"step": 222
},
{
"epoch": 0.9867256637168141,
"grad_norm": 21.13129425048828,
"learning_rate": 7e-07,
"logits/chosen": -0.498046875,
"logits/rejected": -0.6328125,
"logps/chosen": -346.0,
"logps/rejected": -374.0,
"loss": 0.4485,
"loss/chosen-sft": 1.5234375,
"loss/dpo": 0.5234375,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0390625,
"rewards/margins": 0.984375,
"rewards/rejected": -2.03125,
"step": 223
},
{
"epoch": 0.9911504424778761,
"grad_norm": 199.18679809570312,
"learning_rate": 7e-07,
"logits/chosen": -0.62890625,
"logits/rejected": -0.625,
"logps/chosen": -338.0,
"logps/rejected": -406.0,
"loss": 0.4338,
"loss/chosen-sft": 1.484375,
"loss/dpo": 0.5,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.109375,
"rewards/margins": 0.94140625,
"rewards/rejected": -2.046875,
"step": 224
},
{
"epoch": 0.995575221238938,
"grad_norm": 238.7730255126953,
"learning_rate": 7e-07,
"logits/chosen": -0.478515625,
"logits/rejected": -0.30078125,
"logps/chosen": -300.0,
"logps/rejected": -416.0,
"loss": 0.4624,
"loss/chosen-sft": 1.4765625,
"loss/dpo": 0.451171875,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9609375,
"rewards/margins": 1.2890625,
"rewards/rejected": -2.25,
"step": 225
},
{
"epoch": 1.0,
"grad_norm": 94.32278442382812,
"learning_rate": 7e-07,
"logits/chosen": -0.482421875,
"logits/rejected": -0.55078125,
"logps/chosen": -312.0,
"logps/rejected": -424.0,
"loss": 0.4558,
"loss/chosen-sft": 1.359375,
"loss/dpo": 0.478515625,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.828125,
"rewards/margins": 1.109375,
"rewards/rejected": -1.9296875,
"step": 226
},
{
"epoch": 1.0,
"step": 226,
"total_flos": 0.0,
"train_loss": 0.5733826223727876,
"train_runtime": 2164.3688,
"train_samples_per_second": 26.647,
"train_steps_per_second": 0.104
}
],
"logging_steps": 1,
"max_steps": 226,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}