{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 226, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004424778761061947, "grad_norm": 2.562241554260254, "learning_rate": 7e-07, "logits/chosen": -0.2119140625, "logits/rejected": -0.1328125, "logps/chosen": -242.0, "logps/rejected": -178.0, "loss": 0.6914, "loss/chosen-sft": 1.0, "loss/dpo": 0.69140625, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.008849557522123894, "grad_norm": 2.733123302459717, "learning_rate": 7e-07, "logits/chosen": -0.263671875, "logits/rejected": -0.19140625, "logps/chosen": -225.0, "logps/rejected": -175.0, "loss": 0.6929, "loss/chosen-sft": 1.09375, "loss/dpo": 0.69140625, "rewards/accuracies": 0.3125, "rewards/chosen": 0.00019550323486328125, "rewards/margins": -0.00183868408203125, "rewards/rejected": 0.0020294189453125, "step": 2 }, { "epoch": 0.01327433628318584, "grad_norm": 5.071370601654053, "learning_rate": 7e-07, "logits/chosen": -0.287109375, "logits/rejected": -0.1796875, "logps/chosen": -258.0, "logps/rejected": -195.0, "loss": 0.6914, "loss/chosen-sft": 1.1171875, "loss/dpo": 0.69140625, "rewards/accuracies": 0.21875, "rewards/chosen": -0.000743865966796875, "rewards/margins": -3.910064697265625e-05, "rewards/rejected": -0.000701904296875, "step": 3 }, { "epoch": 0.017699115044247787, "grad_norm": 9.871452331542969, "learning_rate": 7e-07, "logits/chosen": -0.294921875, "logits/rejected": -0.310546875, "logps/chosen": -270.0, "logps/rejected": -236.0, "loss": 0.6914, "loss/chosen-sft": 1.09375, "loss/dpo": 0.69140625, "rewards/accuracies": 0.375, "rewards/chosen": -0.0017242431640625, "rewards/margins": 0.002197265625, "rewards/rejected": -0.00390625, "step": 4 }, { "epoch": 0.022123893805309734, "grad_norm": 8.816597938537598, "learning_rate": 7e-07, "logits/chosen": -0.1328125, "logits/rejected": -0.2421875, "logps/chosen": -280.0, "logps/rejected": -249.0, "loss": 0.6914, "loss/chosen-sft": 1.1484375, "loss/dpo": 0.6953125, "rewards/accuracies": 0.3125, "rewards/chosen": -0.00156402587890625, "rewards/margins": -0.0022735595703125, "rewards/rejected": 0.000705718994140625, "step": 5 }, { "epoch": 0.02654867256637168, "grad_norm": 8.637555122375488, "learning_rate": 7e-07, "logits/chosen": -0.12890625, "logits/rejected": -0.1611328125, "logps/chosen": -223.0, "logps/rejected": -172.0, "loss": 0.6895, "loss/chosen-sft": 1.0, "loss/dpo": 0.69140625, "rewards/accuracies": 0.625, "rewards/chosen": 0.005889892578125, "rewards/margins": 0.006195068359375, "rewards/rejected": -0.00031280517578125, "step": 6 }, { "epoch": 0.030973451327433628, "grad_norm": 4.954357147216797, "learning_rate": 7e-07, "logits/chosen": -0.27734375, "logits/rejected": -0.125, "logps/chosen": -312.0, "logps/rejected": -228.0, "loss": 0.6899, "loss/chosen-sft": 1.078125, "loss/dpo": 0.69140625, "rewards/accuracies": 0.40625, "rewards/chosen": 0.00113677978515625, "rewards/margins": 0.0034027099609375, "rewards/rejected": -0.0022735595703125, "step": 7 }, { "epoch": 0.035398230088495575, "grad_norm": 7.5598249435424805, "learning_rate": 7e-07, "logits/chosen": -0.095703125, "logits/rejected": -0.1416015625, "logps/chosen": -240.0, "logps/rejected": -233.0, "loss": 0.6875, "loss/chosen-sft": 1.0546875, "loss/dpo": 0.6875, "rewards/accuracies": 0.625, "rewards/chosen": 0.006988525390625, "rewards/margins": 0.01007080078125, "rewards/rejected": -0.003082275390625, "step": 8 }, { "epoch": 0.03982300884955752, "grad_norm": 2.6033551692962646, "learning_rate": 7e-07, "logits/chosen": -0.294921875, "logits/rejected": -0.28125, "logps/chosen": -292.0, "logps/rejected": -212.0, "loss": 0.6885, "loss/chosen-sft": 1.15625, "loss/dpo": 0.6875, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00665283203125, "rewards/margins": 0.0135498046875, "rewards/rejected": -0.00689697265625, "step": 9 }, { "epoch": 0.04424778761061947, "grad_norm": 10.546786308288574, "learning_rate": 7e-07, "logits/chosen": -0.267578125, "logits/rejected": -0.050537109375, "logps/chosen": -308.0, "logps/rejected": -210.0, "loss": 0.687, "loss/chosen-sft": 1.078125, "loss/dpo": 0.68359375, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0142822265625, "rewards/margins": 0.02099609375, "rewards/rejected": -0.006683349609375, "step": 10 }, { "epoch": 0.048672566371681415, "grad_norm": 9.844188690185547, "learning_rate": 7e-07, "logits/chosen": -0.3046875, "logits/rejected": -0.177734375, "logps/chosen": -243.0, "logps/rejected": -234.0, "loss": 0.687, "loss/chosen-sft": 1.1796875, "loss/dpo": 0.68359375, "rewards/accuracies": 0.75, "rewards/chosen": 0.01025390625, "rewards/margins": 0.016357421875, "rewards/rejected": -0.006103515625, "step": 11 }, { "epoch": 0.05309734513274336, "grad_norm": 12.557535171508789, "learning_rate": 7e-07, "logits/chosen": -0.2265625, "logits/rejected": -0.2275390625, "logps/chosen": -244.0, "logps/rejected": -219.0, "loss": 0.686, "loss/chosen-sft": 1.078125, "loss/dpo": 0.6875, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0025177001953125, "rewards/margins": 0.0113525390625, "rewards/rejected": -0.0087890625, "step": 12 }, { "epoch": 0.05752212389380531, "grad_norm": 8.109821319580078, "learning_rate": 7e-07, "logits/chosen": -0.1953125, "logits/rejected": -0.224609375, "logps/chosen": -272.0, "logps/rejected": -217.0, "loss": 0.6836, "loss/chosen-sft": 1.03125, "loss/dpo": 0.68359375, "rewards/accuracies": 0.65625, "rewards/chosen": 0.00726318359375, "rewards/margins": 0.0164794921875, "rewards/rejected": -0.00921630859375, "step": 13 }, { "epoch": 0.061946902654867256, "grad_norm": 8.9277982711792, "learning_rate": 7e-07, "logits/chosen": -0.19140625, "logits/rejected": -0.2412109375, "logps/chosen": -294.0, "logps/rejected": -198.0, "loss": 0.6816, "loss/chosen-sft": 1.109375, "loss/dpo": 0.6796875, "rewards/accuracies": 0.75, "rewards/chosen": 0.007171630859375, "rewards/margins": 0.025146484375, "rewards/rejected": -0.0179443359375, "step": 14 }, { "epoch": 0.06637168141592921, "grad_norm": 3.4456562995910645, "learning_rate": 7e-07, "logits/chosen": -0.07470703125, "logits/rejected": -0.0712890625, "logps/chosen": -239.0, "logps/rejected": -227.0, "loss": 0.6826, "loss/chosen-sft": 0.93359375, "loss/dpo": 0.6796875, "rewards/accuracies": 0.59375, "rewards/chosen": 0.00970458984375, "rewards/margins": 0.0260009765625, "rewards/rejected": -0.016357421875, "step": 15 }, { "epoch": 0.07079646017699115, "grad_norm": 3.63268780708313, "learning_rate": 7e-07, "logits/chosen": -0.244140625, "logits/rejected": -0.255859375, "logps/chosen": -264.0, "logps/rejected": -208.0, "loss": 0.6826, "loss/chosen-sft": 1.0, "loss/dpo": 0.6796875, "rewards/accuracies": 0.53125, "rewards/chosen": 0.007537841796875, "rewards/margins": 0.03173828125, "rewards/rejected": -0.0242919921875, "step": 16 }, { "epoch": 0.0752212389380531, "grad_norm": 14.086406707763672, "learning_rate": 7e-07, "logits/chosen": -0.30078125, "logits/rejected": -0.271484375, "logps/chosen": -214.0, "logps/rejected": -197.0, "loss": 0.6807, "loss/chosen-sft": 0.9296875, "loss/dpo": 0.6875, "rewards/accuracies": 0.53125, "rewards/chosen": -0.004791259765625, "rewards/margins": 0.01422119140625, "rewards/rejected": -0.0189208984375, "step": 17 }, { "epoch": 0.07964601769911504, "grad_norm": 3.81648850440979, "learning_rate": 7e-07, "logits/chosen": -0.1435546875, "logits/rejected": -0.1494140625, "logps/chosen": -268.0, "logps/rejected": -223.0, "loss": 0.6797, "loss/chosen-sft": 1.0, "loss/dpo": 0.6796875, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0057373046875, "rewards/margins": 0.02734375, "rewards/rejected": -0.0216064453125, "step": 18 }, { "epoch": 0.084070796460177, "grad_norm": 3.1519317626953125, "learning_rate": 7e-07, "logits/chosen": -0.2109375, "logits/rejected": -0.1220703125, "logps/chosen": -247.0, "logps/rejected": -236.0, "loss": 0.6748, "loss/chosen-sft": 1.0625, "loss/dpo": 0.67578125, "rewards/accuracies": 0.65625, "rewards/chosen": 0.004730224609375, "rewards/margins": 0.04052734375, "rewards/rejected": -0.035888671875, "step": 19 }, { "epoch": 0.08849557522123894, "grad_norm": 8.66562271118164, "learning_rate": 7e-07, "logits/chosen": -0.1982421875, "logits/rejected": -0.1904296875, "logps/chosen": -304.0, "logps/rejected": -245.0, "loss": 0.6733, "loss/chosen-sft": 0.99609375, "loss/dpo": 0.671875, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0106201171875, "rewards/margins": 0.0400390625, "rewards/rejected": -0.029296875, "step": 20 }, { "epoch": 0.09292035398230089, "grad_norm": 3.798952579498291, "learning_rate": 7e-07, "logits/chosen": -0.2578125, "logits/rejected": -0.0281982421875, "logps/chosen": -296.0, "logps/rejected": -190.0, "loss": 0.6782, "loss/chosen-sft": 1.125, "loss/dpo": 0.68359375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.006195068359375, "rewards/margins": 0.0238037109375, "rewards/rejected": -0.030029296875, "step": 21 }, { "epoch": 0.09734513274336283, "grad_norm": 6.044543743133545, "learning_rate": 7e-07, "logits/chosen": -0.1728515625, "logits/rejected": -0.263671875, "logps/chosen": -238.0, "logps/rejected": -225.0, "loss": 0.6743, "loss/chosen-sft": 0.98828125, "loss/dpo": 0.68359375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.004638671875, "rewards/margins": 0.01611328125, "rewards/rejected": -0.020751953125, "step": 22 }, { "epoch": 0.10176991150442478, "grad_norm": 6.223972797393799, "learning_rate": 7e-07, "logits/chosen": -0.36328125, "logits/rejected": -0.37890625, "logps/chosen": -256.0, "logps/rejected": -235.0, "loss": 0.6768, "loss/chosen-sft": 1.09375, "loss/dpo": 0.6796875, "rewards/accuracies": 0.625, "rewards/chosen": -0.0225830078125, "rewards/margins": 0.03173828125, "rewards/rejected": -0.05419921875, "step": 23 }, { "epoch": 0.10619469026548672, "grad_norm": 2.817391872406006, "learning_rate": 7e-07, "logits/chosen": -0.2353515625, "logits/rejected": -0.27734375, "logps/chosen": -258.0, "logps/rejected": -196.0, "loss": 0.6699, "loss/chosen-sft": 1.09375, "loss/dpo": 0.67578125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.00830078125, "rewards/margins": 0.03515625, "rewards/rejected": -0.043212890625, "step": 24 }, { "epoch": 0.11061946902654868, "grad_norm": 5.743912220001221, "learning_rate": 7e-07, "logits/chosen": -0.12890625, "logits/rejected": -0.07861328125, "logps/chosen": -268.0, "logps/rejected": -231.0, "loss": 0.6685, "loss/chosen-sft": 0.9140625, "loss/dpo": 0.66796875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0189208984375, "rewards/margins": 0.050537109375, "rewards/rejected": -0.0693359375, "step": 25 }, { "epoch": 0.11504424778761062, "grad_norm": 3.4631690979003906, "learning_rate": 7e-07, "logits/chosen": 0.046142578125, "logits/rejected": 0.018798828125, "logps/chosen": -213.0, "logps/rejected": -252.0, "loss": 0.6699, "loss/chosen-sft": 0.87109375, "loss/dpo": 0.65625, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0076904296875, "rewards/margins": 0.07470703125, "rewards/rejected": -0.06689453125, "step": 26 }, { "epoch": 0.11946902654867257, "grad_norm": 2.302494525909424, "learning_rate": 7e-07, "logits/chosen": -0.2734375, "logits/rejected": -0.296875, "logps/chosen": -294.0, "logps/rejected": -219.0, "loss": 0.6733, "loss/chosen-sft": 1.1796875, "loss/dpo": 0.6640625, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0025177001953125, "rewards/margins": 0.058837890625, "rewards/rejected": -0.061279296875, "step": 27 }, { "epoch": 0.12389380530973451, "grad_norm": 15.041751861572266, "learning_rate": 7e-07, "logits/chosen": -0.326171875, "logits/rejected": -0.2255859375, "logps/chosen": -324.0, "logps/rejected": -215.0, "loss": 0.6597, "loss/chosen-sft": 1.1328125, "loss/dpo": 0.65625, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0016632080078125, "rewards/margins": 0.076171875, "rewards/rejected": -0.07470703125, "step": 28 }, { "epoch": 0.12831858407079647, "grad_norm": 10.651082992553711, "learning_rate": 7e-07, "logits/chosen": -0.055908203125, "logits/rejected": -0.04638671875, "logps/chosen": -264.0, "logps/rejected": -233.0, "loss": 0.6704, "loss/chosen-sft": 0.9140625, "loss/dpo": 0.65625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0001506805419921875, "rewards/margins": 0.0751953125, "rewards/rejected": -0.0751953125, "step": 29 }, { "epoch": 0.13274336283185842, "grad_norm": 17.013574600219727, "learning_rate": 7e-07, "logits/chosen": -0.21484375, "logits/rejected": -0.138671875, "logps/chosen": -225.0, "logps/rejected": -202.0, "loss": 0.6685, "loss/chosen-sft": 0.984375, "loss/dpo": 0.67578125, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0390625, "rewards/margins": 0.038818359375, "rewards/rejected": -0.07763671875, "step": 30 }, { "epoch": 0.13716814159292035, "grad_norm": 3.365304708480835, "learning_rate": 7e-07, "logits/chosen": -0.267578125, "logits/rejected": -0.25390625, "logps/chosen": -296.0, "logps/rejected": -262.0, "loss": 0.6641, "loss/chosen-sft": 1.1640625, "loss/dpo": 0.6640625, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0126953125, "rewards/margins": 0.06689453125, "rewards/rejected": -0.07958984375, "step": 31 }, { "epoch": 0.1415929203539823, "grad_norm": 5.34974479675293, "learning_rate": 7e-07, "logits/chosen": -0.263671875, "logits/rejected": -0.294921875, "logps/chosen": -234.0, "logps/rejected": -255.0, "loss": 0.668, "loss/chosen-sft": 1.0546875, "loss/dpo": 0.65625, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0081787109375, "rewards/margins": 0.0830078125, "rewards/rejected": -0.07470703125, "step": 32 }, { "epoch": 0.14601769911504425, "grad_norm": 5.2689948081970215, "learning_rate": 7e-07, "logits/chosen": -0.255859375, "logits/rejected": -0.38671875, "logps/chosen": -221.0, "logps/rejected": -318.0, "loss": 0.6582, "loss/chosen-sft": 1.0390625, "loss/dpo": 0.6484375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0267333984375, "rewards/margins": 0.09814453125, "rewards/rejected": -0.12451171875, "step": 33 }, { "epoch": 0.1504424778761062, "grad_norm": 13.799201011657715, "learning_rate": 7e-07, "logits/chosen": -0.275390625, "logits/rejected": -0.21875, "logps/chosen": -238.0, "logps/rejected": -208.0, "loss": 0.6733, "loss/chosen-sft": 1.046875, "loss/dpo": 0.6484375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03662109375, "rewards/margins": 0.09765625, "rewards/rejected": -0.1337890625, "step": 34 }, { "epoch": 0.15486725663716813, "grad_norm": 13.07458782196045, "learning_rate": 7e-07, "logits/chosen": -0.369140625, "logits/rejected": -0.24609375, "logps/chosen": -280.0, "logps/rejected": -255.0, "loss": 0.6631, "loss/chosen-sft": 1.1015625, "loss/dpo": 0.66015625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.078125, "rewards/margins": 0.07470703125, "rewards/rejected": -0.15234375, "step": 35 }, { "epoch": 0.1592920353982301, "grad_norm": 2.256340742111206, "learning_rate": 7e-07, "logits/chosen": -0.287109375, "logits/rejected": -0.28125, "logps/chosen": -226.0, "logps/rejected": -240.0, "loss": 0.6543, "loss/chosen-sft": 1.015625, "loss/dpo": 0.6484375, "rewards/accuracies": 0.75, "rewards/chosen": -0.049072265625, "rewards/margins": 0.095703125, "rewards/rejected": -0.1455078125, "step": 36 }, { "epoch": 0.16371681415929204, "grad_norm": 8.795002937316895, "learning_rate": 7e-07, "logits/chosen": -0.2294921875, "logits/rejected": -0.21484375, "logps/chosen": -225.0, "logps/rejected": -216.0, "loss": 0.6523, "loss/chosen-sft": 0.9765625, "loss/dpo": 0.671875, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06787109375, "rewards/margins": 0.047607421875, "rewards/rejected": -0.115234375, "step": 37 }, { "epoch": 0.168141592920354, "grad_norm": 2.735612154006958, "learning_rate": 7e-07, "logits/chosen": -0.271484375, "logits/rejected": -0.26171875, "logps/chosen": -322.0, "logps/rejected": -236.0, "loss": 0.6455, "loss/chosen-sft": 1.234375, "loss/dpo": 0.625, "rewards/accuracies": 0.84375, "rewards/chosen": -0.029052734375, "rewards/margins": 0.1474609375, "rewards/rejected": -0.1767578125, "step": 38 }, { "epoch": 0.17256637168141592, "grad_norm": 17.598873138427734, "learning_rate": 7e-07, "logits/chosen": -0.23046875, "logits/rejected": -0.357421875, "logps/chosen": -237.0, "logps/rejected": -253.0, "loss": 0.6367, "loss/chosen-sft": 1.0625, "loss/dpo": 0.64453125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.053955078125, "rewards/margins": 0.10888671875, "rewards/rejected": -0.1630859375, "step": 39 }, { "epoch": 0.17699115044247787, "grad_norm": 4.538353443145752, "learning_rate": 7e-07, "logits/chosen": -0.1953125, "logits/rejected": -0.263671875, "logps/chosen": -240.0, "logps/rejected": -290.0, "loss": 0.6475, "loss/chosen-sft": 0.9296875, "loss/dpo": 0.6328125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0625, "rewards/margins": 0.13671875, "rewards/rejected": -0.19921875, "step": 40 }, { "epoch": 0.18141592920353983, "grad_norm": 17.900062561035156, "learning_rate": 7e-07, "logits/chosen": -0.349609375, "logits/rejected": -0.306640625, "logps/chosen": -260.0, "logps/rejected": -284.0, "loss": 0.6538, "loss/chosen-sft": 1.03125, "loss/dpo": 0.65625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09912109375, "rewards/margins": 0.08154296875, "rewards/rejected": -0.1806640625, "step": 41 }, { "epoch": 0.18584070796460178, "grad_norm": 2.5172457695007324, "learning_rate": 7e-07, "logits/chosen": -0.40625, "logits/rejected": -0.38671875, "logps/chosen": -272.0, "logps/rejected": -216.0, "loss": 0.6426, "loss/chosen-sft": 1.234375, "loss/dpo": 0.625, "rewards/accuracies": 0.75, "rewards/chosen": -0.07080078125, "rewards/margins": 0.1708984375, "rewards/rejected": -0.2421875, "step": 42 }, { "epoch": 0.1902654867256637, "grad_norm": 10.608353614807129, "learning_rate": 7e-07, "logits/chosen": -0.400390625, "logits/rejected": -0.353515625, "logps/chosen": -240.0, "logps/rejected": -224.0, "loss": 0.6455, "loss/chosen-sft": 1.140625, "loss/dpo": 0.6328125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0888671875, "rewards/margins": 0.1357421875, "rewards/rejected": -0.224609375, "step": 43 }, { "epoch": 0.19469026548672566, "grad_norm": 17.277069091796875, "learning_rate": 7e-07, "logits/chosen": -0.33984375, "logits/rejected": -0.37109375, "logps/chosen": -300.0, "logps/rejected": -215.0, "loss": 0.6396, "loss/chosen-sft": 1.1640625, "loss/dpo": 0.60546875, "rewards/accuracies": 0.75, "rewards/chosen": -0.06005859375, "rewards/margins": 0.2041015625, "rewards/rejected": -0.263671875, "step": 44 }, { "epoch": 0.19911504424778761, "grad_norm": 18.799257278442383, "learning_rate": 7e-07, "logits/chosen": -0.0888671875, "logits/rejected": -0.171875, "logps/chosen": -234.0, "logps/rejected": -260.0, "loss": 0.6382, "loss/chosen-sft": 0.890625, "loss/dpo": 0.60546875, "rewards/accuracies": 0.75, "rewards/chosen": -0.08203125, "rewards/margins": 0.2197265625, "rewards/rejected": -0.302734375, "step": 45 }, { "epoch": 0.20353982300884957, "grad_norm": 34.8527717590332, "learning_rate": 7e-07, "logits/chosen": -0.3828125, "logits/rejected": -0.46875, "logps/chosen": -246.0, "logps/rejected": -255.0, "loss": 0.6538, "loss/chosen-sft": 1.109375, "loss/dpo": 0.71875, "rewards/accuracies": 0.46875, "rewards/chosen": -0.240234375, "rewards/margins": -0.01708984375, "rewards/rejected": -0.22265625, "step": 46 }, { "epoch": 0.2079646017699115, "grad_norm": 6.215828895568848, "learning_rate": 7e-07, "logits/chosen": -0.578125, "logits/rejected": -0.49609375, "logps/chosen": -288.0, "logps/rejected": -284.0, "loss": 0.6431, "loss/chosen-sft": 1.25, "loss/dpo": 0.61328125, "rewards/accuracies": 0.75, "rewards/chosen": -0.0625, "rewards/margins": 0.18359375, "rewards/rejected": -0.2470703125, "step": 47 }, { "epoch": 0.21238938053097345, "grad_norm": 2.940314531326294, "learning_rate": 7e-07, "logits/chosen": -0.361328125, "logits/rejected": -0.400390625, "logps/chosen": -288.0, "logps/rejected": -214.0, "loss": 0.627, "loss/chosen-sft": 1.171875, "loss/dpo": 0.62109375, "rewards/accuracies": 0.625, "rewards/chosen": -0.1357421875, "rewards/margins": 0.1748046875, "rewards/rejected": -0.310546875, "step": 48 }, { "epoch": 0.2168141592920354, "grad_norm": 124.2203598022461, "learning_rate": 7e-07, "logits/chosen": -0.2255859375, "logits/rejected": -0.27734375, "logps/chosen": -276.0, "logps/rejected": -280.0, "loss": 0.6323, "loss/chosen-sft": 1.015625, "loss/dpo": 0.61328125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.076171875, "rewards/margins": 0.177734375, "rewards/rejected": -0.25390625, "step": 49 }, { "epoch": 0.22123893805309736, "grad_norm": 49.72818374633789, "learning_rate": 7e-07, "logits/chosen": -0.5859375, "logits/rejected": -0.55859375, "logps/chosen": -312.0, "logps/rejected": -266.0, "loss": 0.6343, "loss/chosen-sft": 1.1796875, "loss/dpo": 0.609375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10791015625, "rewards/margins": 0.2021484375, "rewards/rejected": -0.310546875, "step": 50 }, { "epoch": 0.22566371681415928, "grad_norm": 110.1352310180664, "learning_rate": 7e-07, "logits/chosen": -0.44921875, "logits/rejected": -0.431640625, "logps/chosen": -274.0, "logps/rejected": -264.0, "loss": 0.6245, "loss/chosen-sft": 1.171875, "loss/dpo": 0.61328125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1162109375, "rewards/margins": 0.208984375, "rewards/rejected": -0.32421875, "step": 51 }, { "epoch": 0.23008849557522124, "grad_norm": 14.234803199768066, "learning_rate": 7e-07, "logits/chosen": -0.40625, "logits/rejected": -0.28515625, "logps/chosen": -268.0, "logps/rejected": -256.0, "loss": 0.627, "loss/chosen-sft": 1.046875, "loss/dpo": 0.66796875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.220703125, "rewards/margins": 0.0849609375, "rewards/rejected": -0.306640625, "step": 52 }, { "epoch": 0.2345132743362832, "grad_norm": 25.839595794677734, "learning_rate": 7e-07, "logits/chosen": -0.61328125, "logits/rejected": -0.5390625, "logps/chosen": -302.0, "logps/rejected": -251.0, "loss": 0.6152, "loss/chosen-sft": 1.3203125, "loss/dpo": 0.58203125, "rewards/accuracies": 0.875, "rewards/chosen": -0.0849609375, "rewards/margins": 0.2578125, "rewards/rejected": -0.34375, "step": 53 }, { "epoch": 0.23893805309734514, "grad_norm": 17.5559024810791, "learning_rate": 7e-07, "logits/chosen": -0.32421875, "logits/rejected": -0.4140625, "logps/chosen": -241.0, "logps/rejected": -272.0, "loss": 0.6279, "loss/chosen-sft": 1.109375, "loss/dpo": 0.61328125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.173828125, "rewards/margins": 0.21875, "rewards/rejected": -0.392578125, "step": 54 }, { "epoch": 0.24336283185840707, "grad_norm": 4.360561847686768, "learning_rate": 7e-07, "logits/chosen": -0.2353515625, "logits/rejected": -0.10107421875, "logps/chosen": -284.0, "logps/rejected": -237.0, "loss": 0.605, "loss/chosen-sft": 1.0546875, "loss/dpo": 0.6015625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1689453125, "rewards/margins": 0.2353515625, "rewards/rejected": -0.404296875, "step": 55 }, { "epoch": 0.24778761061946902, "grad_norm": 16.581727981567383, "learning_rate": 7e-07, "logits/chosen": -0.431640625, "logits/rejected": -0.39453125, "logps/chosen": -340.0, "logps/rejected": -294.0, "loss": 0.6133, "loss/chosen-sft": 1.25, "loss/dpo": 0.6484375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2177734375, "rewards/margins": 0.1396484375, "rewards/rejected": -0.357421875, "step": 56 }, { "epoch": 0.252212389380531, "grad_norm": 49.88325119018555, "learning_rate": 7e-07, "logits/chosen": -0.396484375, "logits/rejected": -0.173828125, "logps/chosen": -226.0, "logps/rejected": -191.0, "loss": 0.6279, "loss/chosen-sft": 1.0546875, "loss/dpo": 0.68359375, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1884765625, "rewards/margins": 0.036865234375, "rewards/rejected": -0.2255859375, "step": 57 }, { "epoch": 0.25663716814159293, "grad_norm": 24.68882179260254, "learning_rate": 7e-07, "logits/chosen": -0.37890625, "logits/rejected": -0.28515625, "logps/chosen": -244.0, "logps/rejected": -260.0, "loss": 0.6133, "loss/chosen-sft": 1.078125, "loss/dpo": 0.60546875, "rewards/accuracies": 0.84375, "rewards/chosen": -0.23828125, "rewards/margins": 0.2255859375, "rewards/rejected": -0.46484375, "step": 58 }, { "epoch": 0.2610619469026549, "grad_norm": 15.78062915802002, "learning_rate": 7e-07, "logits/chosen": -0.294921875, "logits/rejected": -0.375, "logps/chosen": -300.0, "logps/rejected": -255.0, "loss": 0.6138, "loss/chosen-sft": 1.1796875, "loss/dpo": 0.59375, "rewards/accuracies": 0.75, "rewards/chosen": -0.154296875, "rewards/margins": 0.2421875, "rewards/rejected": -0.396484375, "step": 59 }, { "epoch": 0.26548672566371684, "grad_norm": 20.29654884338379, "learning_rate": 7e-07, "logits/chosen": -0.318359375, "logits/rejected": -0.376953125, "logps/chosen": -280.0, "logps/rejected": -300.0, "loss": 0.6206, "loss/chosen-sft": 1.2265625, "loss/dpo": 0.6484375, "rewards/accuracies": 0.625, "rewards/chosen": -0.2265625, "rewards/margins": 0.1357421875, "rewards/rejected": -0.361328125, "step": 60 }, { "epoch": 0.26991150442477874, "grad_norm": 43.65996551513672, "learning_rate": 7e-07, "logits/chosen": -0.36328125, "logits/rejected": -0.3046875, "logps/chosen": -238.0, "logps/rejected": -278.0, "loss": 0.6255, "loss/chosen-sft": 1.25, "loss/dpo": 0.59375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1572265625, "rewards/margins": 0.2578125, "rewards/rejected": -0.416015625, "step": 61 }, { "epoch": 0.2743362831858407, "grad_norm": 72.5498046875, "learning_rate": 7e-07, "logits/chosen": -0.359375, "logits/rejected": -0.373046875, "logps/chosen": -318.0, "logps/rejected": -270.0, "loss": 0.6245, "loss/chosen-sft": 1.1015625, "loss/dpo": 0.59375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1884765625, "rewards/margins": 0.275390625, "rewards/rejected": -0.46484375, "step": 62 }, { "epoch": 0.27876106194690264, "grad_norm": 99.62593078613281, "learning_rate": 7e-07, "logits/chosen": -0.265625, "logits/rejected": -0.33984375, "logps/chosen": -294.0, "logps/rejected": -258.0, "loss": 0.627, "loss/chosen-sft": 1.0703125, "loss/dpo": 0.65234375, "rewards/accuracies": 0.53125, "rewards/chosen": -0.267578125, "rewards/margins": 0.1328125, "rewards/rejected": -0.400390625, "step": 63 }, { "epoch": 0.2831858407079646, "grad_norm": 71.84874725341797, "learning_rate": 7e-07, "logits/chosen": -0.359375, "logits/rejected": -0.392578125, "logps/chosen": -260.0, "logps/rejected": -219.0, "loss": 0.6138, "loss/chosen-sft": 1.171875, "loss/dpo": 0.671875, "rewards/accuracies": 0.5, "rewards/chosen": -0.248046875, "rewards/margins": 0.087890625, "rewards/rejected": -0.3359375, "step": 64 }, { "epoch": 0.28761061946902655, "grad_norm": 43.867332458496094, "learning_rate": 7e-07, "logits/chosen": -0.494140625, "logits/rejected": -0.38671875, "logps/chosen": -298.0, "logps/rejected": -276.0, "loss": 0.6162, "loss/chosen-sft": 1.234375, "loss/dpo": 0.6171875, "rewards/accuracies": 0.65625, "rewards/chosen": -0.251953125, "rewards/margins": 0.2021484375, "rewards/rejected": -0.455078125, "step": 65 }, { "epoch": 0.2920353982300885, "grad_norm": 15.392335891723633, "learning_rate": 7e-07, "logits/chosen": -0.275390625, "logits/rejected": -0.1953125, "logps/chosen": -241.0, "logps/rejected": -234.0, "loss": 0.6211, "loss/chosen-sft": 1.1953125, "loss/dpo": 0.703125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.251953125, "rewards/margins": 0.017578125, "rewards/rejected": -0.26953125, "step": 66 }, { "epoch": 0.29646017699115046, "grad_norm": 17.315120697021484, "learning_rate": 7e-07, "logits/chosen": -0.271484375, "logits/rejected": -0.3828125, "logps/chosen": -268.0, "logps/rejected": -292.0, "loss": 0.6226, "loss/chosen-sft": 1.0078125, "loss/dpo": 0.6171875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.30078125, "rewards/margins": 0.203125, "rewards/rejected": -0.50390625, "step": 67 }, { "epoch": 0.3008849557522124, "grad_norm": 17.68255615234375, "learning_rate": 7e-07, "logits/chosen": -0.369140625, "logits/rejected": -0.50390625, "logps/chosen": -266.0, "logps/rejected": -248.0, "loss": 0.6143, "loss/chosen-sft": 1.109375, "loss/dpo": 0.58203125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.138671875, "rewards/margins": 0.3046875, "rewards/rejected": -0.443359375, "step": 68 }, { "epoch": 0.3053097345132743, "grad_norm": 31.16883087158203, "learning_rate": 7e-07, "logits/chosen": -0.373046875, "logits/rejected": -0.369140625, "logps/chosen": -256.0, "logps/rejected": -235.0, "loss": 0.6191, "loss/chosen-sft": 1.15625, "loss/dpo": 0.59765625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1884765625, "rewards/margins": 0.279296875, "rewards/rejected": -0.46875, "step": 69 }, { "epoch": 0.30973451327433627, "grad_norm": 65.01067352294922, "learning_rate": 7e-07, "logits/chosen": -0.423828125, "logits/rejected": -0.5, "logps/chosen": -272.0, "logps/rejected": -244.0, "loss": 0.5884, "loss/chosen-sft": 1.21875, "loss/dpo": 0.6015625, "rewards/accuracies": 0.625, "rewards/chosen": -0.1787109375, "rewards/margins": 0.251953125, "rewards/rejected": -0.431640625, "step": 70 }, { "epoch": 0.3141592920353982, "grad_norm": 9.247684478759766, "learning_rate": 7e-07, "logits/chosen": -0.412109375, "logits/rejected": -0.421875, "logps/chosen": -284.0, "logps/rejected": -245.0, "loss": 0.605, "loss/chosen-sft": 1.203125, "loss/dpo": 0.671875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2216796875, "rewards/margins": 0.1396484375, "rewards/rejected": -0.361328125, "step": 71 }, { "epoch": 0.3185840707964602, "grad_norm": 29.392963409423828, "learning_rate": 7e-07, "logits/chosen": -0.46875, "logits/rejected": -0.462890625, "logps/chosen": -346.0, "logps/rejected": -292.0, "loss": 0.5928, "loss/chosen-sft": 1.25, "loss/dpo": 0.64453125, "rewards/accuracies": 0.65625, "rewards/chosen": -0.30078125, "rewards/margins": 0.1650390625, "rewards/rejected": -0.46484375, "step": 72 }, { "epoch": 0.3230088495575221, "grad_norm": 12.59981918334961, "learning_rate": 7e-07, "logits/chosen": -0.6171875, "logits/rejected": -0.41796875, "logps/chosen": -296.0, "logps/rejected": -253.0, "loss": 0.6255, "loss/chosen-sft": 1.2734375, "loss/dpo": 0.62109375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.228515625, "rewards/margins": 0.1884765625, "rewards/rejected": -0.416015625, "step": 73 }, { "epoch": 0.3274336283185841, "grad_norm": 27.949209213256836, "learning_rate": 7e-07, "logits/chosen": -0.5, "logits/rejected": -0.4921875, "logps/chosen": -272.0, "logps/rejected": -218.0, "loss": 0.5723, "loss/chosen-sft": 1.1171875, "loss/dpo": 0.62109375, "rewards/accuracies": 0.75, "rewards/chosen": -0.25390625, "rewards/margins": 0.1806640625, "rewards/rejected": -0.435546875, "step": 74 }, { "epoch": 0.33185840707964603, "grad_norm": 42.84572219848633, "learning_rate": 7e-07, "logits/chosen": -0.54296875, "logits/rejected": -0.5546875, "logps/chosen": -268.0, "logps/rejected": -222.0, "loss": 0.6099, "loss/chosen-sft": 1.265625, "loss/dpo": 0.6015625, "rewards/accuracies": 0.625, "rewards/chosen": -0.1572265625, "rewards/margins": 0.2392578125, "rewards/rejected": -0.396484375, "step": 75 }, { "epoch": 0.336283185840708, "grad_norm": 58.05986404418945, "learning_rate": 7e-07, "logits/chosen": -0.5390625, "logits/rejected": -0.51171875, "logps/chosen": -206.0, "logps/rejected": -262.0, "loss": 0.6021, "loss/chosen-sft": 1.0546875, "loss/dpo": 0.58984375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1728515625, "rewards/margins": 0.24609375, "rewards/rejected": -0.41796875, "step": 76 }, { "epoch": 0.3407079646017699, "grad_norm": 31.465473175048828, "learning_rate": 7e-07, "logits/chosen": -0.423828125, "logits/rejected": -0.1796875, "logps/chosen": -264.0, "logps/rejected": -286.0, "loss": 0.5967, "loss/chosen-sft": 1.0703125, "loss/dpo": 0.62890625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.31640625, "rewards/margins": 0.19921875, "rewards/rejected": -0.515625, "step": 77 }, { "epoch": 0.34513274336283184, "grad_norm": 57.08030700683594, "learning_rate": 7e-07, "logits/chosen": -0.578125, "logits/rejected": -0.326171875, "logps/chosen": -304.0, "logps/rejected": -240.0, "loss": 0.6187, "loss/chosen-sft": 1.328125, "loss/dpo": 0.59375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22265625, "rewards/margins": 0.267578125, "rewards/rejected": -0.490234375, "step": 78 }, { "epoch": 0.3495575221238938, "grad_norm": 29.671070098876953, "learning_rate": 7e-07, "logits/chosen": -0.48046875, "logits/rejected": -0.59765625, "logps/chosen": -292.0, "logps/rejected": -256.0, "loss": 0.6147, "loss/chosen-sft": 1.2265625, "loss/dpo": 0.55859375, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1416015625, "rewards/margins": 0.33984375, "rewards/rejected": -0.48046875, "step": 79 }, { "epoch": 0.35398230088495575, "grad_norm": 48.64991760253906, "learning_rate": 7e-07, "logits/chosen": -0.59375, "logits/rejected": -0.6328125, "logps/chosen": -288.0, "logps/rejected": -264.0, "loss": 0.6309, "loss/chosen-sft": 1.2578125, "loss/dpo": 0.6484375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.33984375, "rewards/margins": 0.162109375, "rewards/rejected": -0.50390625, "step": 80 }, { "epoch": 0.3584070796460177, "grad_norm": 44.117034912109375, "learning_rate": 7e-07, "logits/chosen": -0.49609375, "logits/rejected": -0.51953125, "logps/chosen": -252.0, "logps/rejected": -262.0, "loss": 0.5845, "loss/chosen-sft": 1.0859375, "loss/dpo": 0.55078125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2412109375, "rewards/margins": 0.3828125, "rewards/rejected": -0.625, "step": 81 }, { "epoch": 0.36283185840707965, "grad_norm": 45.00334167480469, "learning_rate": 7e-07, "logits/chosen": -0.486328125, "logits/rejected": -0.5390625, "logps/chosen": -294.0, "logps/rejected": -288.0, "loss": 0.6045, "loss/chosen-sft": 1.21875, "loss/dpo": 0.578125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.287109375, "rewards/margins": 0.328125, "rewards/rejected": -0.6171875, "step": 82 }, { "epoch": 0.3672566371681416, "grad_norm": 6.153012752532959, "learning_rate": 7e-07, "logits/chosen": -0.427734375, "logits/rejected": -0.427734375, "logps/chosen": -272.0, "logps/rejected": -278.0, "loss": 0.582, "loss/chosen-sft": 1.0546875, "loss/dpo": 0.5625, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1689453125, "rewards/margins": 0.357421875, "rewards/rejected": -0.52734375, "step": 83 }, { "epoch": 0.37168141592920356, "grad_norm": 4.050904273986816, "learning_rate": 7e-07, "logits/chosen": -0.349609375, "logits/rejected": -0.3984375, "logps/chosen": -288.0, "logps/rejected": -274.0, "loss": 0.603, "loss/chosen-sft": 1.1796875, "loss/dpo": 0.62109375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.28515625, "rewards/margins": 0.2333984375, "rewards/rejected": -0.51953125, "step": 84 }, { "epoch": 0.37610619469026546, "grad_norm": 53.353515625, "learning_rate": 7e-07, "logits/chosen": -0.455078125, "logits/rejected": -0.462890625, "logps/chosen": -278.0, "logps/rejected": -227.0, "loss": 0.6323, "loss/chosen-sft": 1.109375, "loss/dpo": 0.609375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24609375, "rewards/margins": 0.2373046875, "rewards/rejected": -0.482421875, "step": 85 }, { "epoch": 0.3805309734513274, "grad_norm": 63.98937225341797, "learning_rate": 7e-07, "logits/chosen": -0.400390625, "logits/rejected": -0.4609375, "logps/chosen": -300.0, "logps/rejected": -340.0, "loss": 0.6182, "loss/chosen-sft": 1.046875, "loss/dpo": 0.62890625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.349609375, "rewards/margins": 0.1904296875, "rewards/rejected": -0.5390625, "step": 86 }, { "epoch": 0.38495575221238937, "grad_norm": 22.751070022583008, "learning_rate": 7e-07, "logits/chosen": -0.416015625, "logits/rejected": -0.47265625, "logps/chosen": -270.0, "logps/rejected": -234.0, "loss": 0.5938, "loss/chosen-sft": 1.109375, "loss/dpo": 0.6015625, "rewards/accuracies": 0.78125, "rewards/chosen": -0.203125, "rewards/margins": 0.2734375, "rewards/rejected": -0.4765625, "step": 87 }, { "epoch": 0.3893805309734513, "grad_norm": 15.500909805297852, "learning_rate": 7e-07, "logits/chosen": -0.42578125, "logits/rejected": -0.51171875, "logps/chosen": -220.0, "logps/rejected": -222.0, "loss": 0.5884, "loss/chosen-sft": 1.3828125, "loss/dpo": 0.69140625, "rewards/accuracies": 0.625, "rewards/chosen": -0.33203125, "rewards/margins": 0.0634765625, "rewards/rejected": -0.396484375, "step": 88 }, { "epoch": 0.3938053097345133, "grad_norm": 21.14480209350586, "learning_rate": 7e-07, "logits/chosen": -0.50390625, "logits/rejected": -0.4921875, "logps/chosen": -282.0, "logps/rejected": -223.0, "loss": 0.584, "loss/chosen-sft": 1.296875, "loss/dpo": 0.65234375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.27734375, "rewards/margins": 0.1875, "rewards/rejected": -0.466796875, "step": 89 }, { "epoch": 0.39823008849557523, "grad_norm": 14.2146635055542, "learning_rate": 7e-07, "logits/chosen": -0.447265625, "logits/rejected": -0.392578125, "logps/chosen": -268.0, "logps/rejected": -215.0, "loss": 0.6104, "loss/chosen-sft": 1.09375, "loss/dpo": 0.65625, "rewards/accuracies": 0.53125, "rewards/chosen": -0.359375, "rewards/margins": 0.1923828125, "rewards/rejected": -0.55078125, "step": 90 }, { "epoch": 0.4026548672566372, "grad_norm": 76.13188171386719, "learning_rate": 7e-07, "logits/chosen": -0.478515625, "logits/rejected": -0.462890625, "logps/chosen": -270.0, "logps/rejected": -280.0, "loss": 0.5781, "loss/chosen-sft": 1.2109375, "loss/dpo": 0.59765625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2421875, "rewards/margins": 0.2431640625, "rewards/rejected": -0.486328125, "step": 91 }, { "epoch": 0.40707964601769914, "grad_norm": 34.39772033691406, "learning_rate": 7e-07, "logits/chosen": -0.5625, "logits/rejected": -0.4921875, "logps/chosen": -308.0, "logps/rejected": -294.0, "loss": 0.5698, "loss/chosen-sft": 1.3203125, "loss/dpo": 0.52734375, "rewards/accuracies": 0.75, "rewards/chosen": -0.2275390625, "rewards/margins": 0.49609375, "rewards/rejected": -0.7265625, "step": 92 }, { "epoch": 0.41150442477876104, "grad_norm": 36.51502227783203, "learning_rate": 7e-07, "logits/chosen": -0.4296875, "logits/rejected": -0.51953125, "logps/chosen": -278.0, "logps/rejected": -336.0, "loss": 0.5869, "loss/chosen-sft": 1.15625, "loss/dpo": 0.57421875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.29296875, "rewards/margins": 0.314453125, "rewards/rejected": -0.60546875, "step": 93 }, { "epoch": 0.415929203539823, "grad_norm": 41.33882141113281, "learning_rate": 7e-07, "logits/chosen": -0.291015625, "logits/rejected": -0.26171875, "logps/chosen": -255.0, "logps/rejected": -251.0, "loss": 0.5908, "loss/chosen-sft": 1.171875, "loss/dpo": 0.6171875, "rewards/accuracies": 0.65625, "rewards/chosen": -0.310546875, "rewards/margins": 0.232421875, "rewards/rejected": -0.54296875, "step": 94 }, { "epoch": 0.42035398230088494, "grad_norm": 25.335350036621094, "learning_rate": 7e-07, "logits/chosen": -0.2578125, "logits/rejected": -0.26953125, "logps/chosen": -234.0, "logps/rejected": -312.0, "loss": 0.5811, "loss/chosen-sft": 1.03125, "loss/dpo": 0.5546875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.298828125, "rewards/margins": 0.41796875, "rewards/rejected": -0.71875, "step": 95 }, { "epoch": 0.4247787610619469, "grad_norm": 14.678760528564453, "learning_rate": 7e-07, "logits/chosen": -0.46484375, "logits/rejected": -0.439453125, "logps/chosen": -304.0, "logps/rejected": -248.0, "loss": 0.6094, "loss/chosen-sft": 1.2265625, "loss/dpo": 0.6484375, "rewards/accuracies": 0.625, "rewards/chosen": -0.287109375, "rewards/margins": 0.1328125, "rewards/rejected": -0.41796875, "step": 96 }, { "epoch": 0.42920353982300885, "grad_norm": 12.859259605407715, "learning_rate": 7e-07, "logits/chosen": -0.50390625, "logits/rejected": -0.4765625, "logps/chosen": -255.0, "logps/rejected": -226.0, "loss": 0.5938, "loss/chosen-sft": 1.1953125, "loss/dpo": 0.64453125, "rewards/accuracies": 0.625, "rewards/chosen": -0.353515625, "rewards/margins": 0.15625, "rewards/rejected": -0.51171875, "step": 97 }, { "epoch": 0.4336283185840708, "grad_norm": 9.326794624328613, "learning_rate": 7e-07, "logits/chosen": -0.4765625, "logits/rejected": -0.447265625, "logps/chosen": -272.0, "logps/rejected": -268.0, "loss": 0.5752, "loss/chosen-sft": 1.1640625, "loss/dpo": 0.54296875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.291015625, "rewards/margins": 0.4765625, "rewards/rejected": -0.76953125, "step": 98 }, { "epoch": 0.43805309734513276, "grad_norm": 10.561424255371094, "learning_rate": 7e-07, "logits/chosen": -0.5625, "logits/rejected": -0.5234375, "logps/chosen": -304.0, "logps/rejected": -282.0, "loss": 0.5786, "loss/chosen-sft": 1.1640625, "loss/dpo": 0.578125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.390625, "rewards/margins": 0.353515625, "rewards/rejected": -0.7421875, "step": 99 }, { "epoch": 0.4424778761061947, "grad_norm": 102.05797576904297, "learning_rate": 7e-07, "logits/chosen": -0.44921875, "logits/rejected": -0.423828125, "logps/chosen": -235.0, "logps/rejected": -284.0, "loss": 0.6079, "loss/chosen-sft": 1.1171875, "loss/dpo": 0.65234375, "rewards/accuracies": 0.75, "rewards/chosen": -0.423828125, "rewards/margins": 0.236328125, "rewards/rejected": -0.66015625, "step": 100 }, { "epoch": 0.4469026548672566, "grad_norm": 10.166366577148438, "learning_rate": 7e-07, "logits/chosen": -0.51953125, "logits/rejected": -0.431640625, "logps/chosen": -342.0, "logps/rejected": -302.0, "loss": 0.5864, "loss/chosen-sft": 1.1953125, "loss/dpo": 0.53515625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.333984375, "rewards/margins": 0.5, "rewards/rejected": -0.8359375, "step": 101 }, { "epoch": 0.45132743362831856, "grad_norm": 9.208560943603516, "learning_rate": 7e-07, "logits/chosen": -0.373046875, "logits/rejected": -0.3359375, "logps/chosen": -332.0, "logps/rejected": -326.0, "loss": 0.584, "loss/chosen-sft": 1.1640625, "loss/dpo": 0.5390625, "rewards/accuracies": 0.75, "rewards/chosen": -0.30078125, "rewards/margins": 0.435546875, "rewards/rejected": -0.73828125, "step": 102 }, { "epoch": 0.4557522123893805, "grad_norm": 58.12112045288086, "learning_rate": 7e-07, "logits/chosen": -0.46484375, "logits/rejected": -0.486328125, "logps/chosen": -298.0, "logps/rejected": -300.0, "loss": 0.6191, "loss/chosen-sft": 1.1640625, "loss/dpo": 0.58984375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.365234375, "rewards/margins": 0.40625, "rewards/rejected": -0.76953125, "step": 103 }, { "epoch": 0.46017699115044247, "grad_norm": 32.06050491333008, "learning_rate": 7e-07, "logits/chosen": -0.51953125, "logits/rejected": -0.49609375, "logps/chosen": -376.0, "logps/rejected": -346.0, "loss": 0.5581, "loss/chosen-sft": 1.2265625, "loss/dpo": 0.55078125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.35546875, "rewards/margins": 0.46875, "rewards/rejected": -0.82421875, "step": 104 }, { "epoch": 0.4646017699115044, "grad_norm": 66.27430725097656, "learning_rate": 7e-07, "logits/chosen": -0.578125, "logits/rejected": -0.53125, "logps/chosen": -340.0, "logps/rejected": -284.0, "loss": 0.5674, "loss/chosen-sft": 1.2734375, "loss/dpo": 0.5390625, "rewards/accuracies": 0.78125, "rewards/chosen": -0.37890625, "rewards/margins": 0.427734375, "rewards/rejected": -0.80859375, "step": 105 }, { "epoch": 0.4690265486725664, "grad_norm": 43.450740814208984, "learning_rate": 7e-07, "logits/chosen": -0.369140625, "logits/rejected": -0.443359375, "logps/chosen": -328.0, "logps/rejected": -310.0, "loss": 0.5688, "loss/chosen-sft": 1.1640625, "loss/dpo": 0.59765625, "rewards/accuracies": 0.625, "rewards/chosen": -0.404296875, "rewards/margins": 0.318359375, "rewards/rejected": -0.72265625, "step": 106 }, { "epoch": 0.47345132743362833, "grad_norm": 48.25244903564453, "learning_rate": 7e-07, "logits/chosen": -0.4765625, "logits/rejected": -0.43359375, "logps/chosen": -320.0, "logps/rejected": -260.0, "loss": 0.5781, "loss/chosen-sft": 1.2421875, "loss/dpo": 0.62890625, "rewards/accuracies": 0.625, "rewards/chosen": -0.5234375, "rewards/margins": 0.283203125, "rewards/rejected": -0.8046875, "step": 107 }, { "epoch": 0.4778761061946903, "grad_norm": 26.64389419555664, "learning_rate": 7e-07, "logits/chosen": -0.427734375, "logits/rejected": -0.41796875, "logps/chosen": -262.0, "logps/rejected": -240.0, "loss": 0.564, "loss/chosen-sft": 1.1015625, "loss/dpo": 0.625, "rewards/accuracies": 0.625, "rewards/chosen": -0.357421875, "rewards/margins": 0.216796875, "rewards/rejected": -0.57421875, "step": 108 }, { "epoch": 0.4823008849557522, "grad_norm": 51.666202545166016, "learning_rate": 7e-07, "logits/chosen": -0.671875, "logits/rejected": -0.734375, "logps/chosen": -372.0, "logps/rejected": -340.0, "loss": 0.564, "loss/chosen-sft": 1.3125, "loss/dpo": 0.5234375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.35546875, "rewards/margins": 0.5859375, "rewards/rejected": -0.94140625, "step": 109 }, { "epoch": 0.48672566371681414, "grad_norm": 14.793038368225098, "learning_rate": 7e-07, "logits/chosen": -0.490234375, "logits/rejected": -0.57421875, "logps/chosen": -288.0, "logps/rejected": -262.0, "loss": 0.564, "loss/chosen-sft": 1.2890625, "loss/dpo": 0.63671875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.53125, "rewards/margins": 0.232421875, "rewards/rejected": -0.765625, "step": 110 }, { "epoch": 0.4911504424778761, "grad_norm": 54.62705993652344, "learning_rate": 7e-07, "logits/chosen": -0.51171875, "logits/rejected": -0.59765625, "logps/chosen": -312.0, "logps/rejected": -286.0, "loss": 0.5803, "loss/chosen-sft": 1.21875, "loss/dpo": 0.53125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.466796875, "rewards/margins": 0.515625, "rewards/rejected": -0.98046875, "step": 111 }, { "epoch": 0.49557522123893805, "grad_norm": 45.08600997924805, "learning_rate": 7e-07, "logits/chosen": -0.53125, "logits/rejected": -0.65625, "logps/chosen": -306.0, "logps/rejected": -314.0, "loss": 0.5442, "loss/chosen-sft": 1.1953125, "loss/dpo": 0.57421875, "rewards/accuracies": 0.75, "rewards/chosen": -0.435546875, "rewards/margins": 0.34375, "rewards/rejected": -0.78125, "step": 112 }, { "epoch": 0.5, "grad_norm": 29.89005470275879, "learning_rate": 7e-07, "logits/chosen": -0.5234375, "logits/rejected": -0.5078125, "logps/chosen": -282.0, "logps/rejected": -242.0, "loss": 0.5535, "loss/chosen-sft": 1.1015625, "loss/dpo": 0.63671875, "rewards/accuracies": 0.59375, "rewards/chosen": -0.43359375, "rewards/margins": 0.21484375, "rewards/rejected": -0.6484375, "step": 113 }, { "epoch": 0.504424778761062, "grad_norm": 5.516697406768799, "learning_rate": 7e-07, "logits/chosen": -0.376953125, "logits/rejected": -0.423828125, "logps/chosen": -249.0, "logps/rejected": -298.0, "loss": 0.5713, "loss/chosen-sft": 1.125, "loss/dpo": 0.51953125, "rewards/accuracies": 0.75, "rewards/chosen": -0.298828125, "rewards/margins": 0.48046875, "rewards/rejected": -0.77734375, "step": 114 }, { "epoch": 0.5088495575221239, "grad_norm": 42.4152946472168, "learning_rate": 7e-07, "logits/chosen": -0.5625, "logits/rejected": -0.48046875, "logps/chosen": -316.0, "logps/rejected": -312.0, "loss": 0.5811, "loss/chosen-sft": 1.2265625, "loss/dpo": 0.60546875, "rewards/accuracies": 0.59375, "rewards/chosen": -0.59765625, "rewards/margins": 0.29296875, "rewards/rejected": -0.890625, "step": 115 }, { "epoch": 0.5132743362831859, "grad_norm": 43.45073699951172, "learning_rate": 7e-07, "logits/chosen": -0.51171875, "logits/rejected": -0.49609375, "logps/chosen": -320.0, "logps/rejected": -368.0, "loss": 0.5562, "loss/chosen-sft": 1.203125, "loss/dpo": 0.58203125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.51953125, "rewards/margins": 0.431640625, "rewards/rejected": -0.94921875, "step": 116 }, { "epoch": 0.5176991150442478, "grad_norm": 13.291467666625977, "learning_rate": 7e-07, "logits/chosen": -0.392578125, "logits/rejected": -0.375, "logps/chosen": -300.0, "logps/rejected": -316.0, "loss": 0.5491, "loss/chosen-sft": 1.2734375, "loss/dpo": 0.58984375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5078125, "rewards/margins": 0.3515625, "rewards/rejected": -0.859375, "step": 117 }, { "epoch": 0.5221238938053098, "grad_norm": 16.5191707611084, "learning_rate": 7e-07, "logits/chosen": -0.330078125, "logits/rejected": -0.240234375, "logps/chosen": -292.0, "logps/rejected": -292.0, "loss": 0.5605, "loss/chosen-sft": 1.2421875, "loss/dpo": 0.57421875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.484375, "rewards/margins": 0.34375, "rewards/rejected": -0.828125, "step": 118 }, { "epoch": 0.5265486725663717, "grad_norm": 55.267738342285156, "learning_rate": 7e-07, "logits/chosen": -0.58984375, "logits/rejected": -0.62890625, "logps/chosen": -316.0, "logps/rejected": -318.0, "loss": 0.5439, "loss/chosen-sft": 1.1484375, "loss/dpo": 0.53515625, "rewards/accuracies": 0.71875, "rewards/chosen": -0.412109375, "rewards/margins": 0.482421875, "rewards/rejected": -0.89453125, "step": 119 }, { "epoch": 0.5309734513274337, "grad_norm": 52.895042419433594, "learning_rate": 7e-07, "logits/chosen": -0.28515625, "logits/rejected": -0.380859375, "logps/chosen": -252.0, "logps/rejected": -278.0, "loss": 0.5623, "loss/chosen-sft": 0.96484375, "loss/dpo": 0.50390625, "rewards/accuracies": 0.71875, "rewards/chosen": -0.359375, "rewards/margins": 0.609375, "rewards/rejected": -0.96875, "step": 120 }, { "epoch": 0.5353982300884956, "grad_norm": 33.5416374206543, "learning_rate": 7e-07, "logits/chosen": -0.365234375, "logits/rejected": -0.45703125, "logps/chosen": -282.0, "logps/rejected": -262.0, "loss": 0.5391, "loss/chosen-sft": 1.28125, "loss/dpo": 0.546875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4296875, "rewards/margins": 0.4609375, "rewards/rejected": -0.890625, "step": 121 }, { "epoch": 0.5398230088495575, "grad_norm": 55.33546447753906, "learning_rate": 7e-07, "logits/chosen": -0.53515625, "logits/rejected": -0.5703125, "logps/chosen": -334.0, "logps/rejected": -340.0, "loss": 0.5278, "loss/chosen-sft": 1.2578125, "loss/dpo": 0.53515625, "rewards/accuracies": 0.71875, "rewards/chosen": -0.45703125, "rewards/margins": 0.609375, "rewards/rejected": -1.0703125, "step": 122 }, { "epoch": 0.5442477876106194, "grad_norm": 46.70622253417969, "learning_rate": 7e-07, "logits/chosen": -0.53515625, "logits/rejected": -0.59765625, "logps/chosen": -308.0, "logps/rejected": -274.0, "loss": 0.55, "loss/chosen-sft": 1.34375, "loss/dpo": 0.5703125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5390625, "rewards/margins": 0.3671875, "rewards/rejected": -0.90625, "step": 123 }, { "epoch": 0.5486725663716814, "grad_norm": 48.83370590209961, "learning_rate": 7e-07, "logits/chosen": -0.5234375, "logits/rejected": -0.60546875, "logps/chosen": -322.0, "logps/rejected": -318.0, "loss": 0.5825, "loss/chosen-sft": 1.2734375, "loss/dpo": 0.59375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.625, "rewards/margins": 0.361328125, "rewards/rejected": -0.984375, "step": 124 }, { "epoch": 0.5530973451327433, "grad_norm": 25.2650089263916, "learning_rate": 7e-07, "logits/chosen": -0.3203125, "logits/rejected": -0.25390625, "logps/chosen": -244.0, "logps/rejected": -274.0, "loss": 0.5889, "loss/chosen-sft": 1.1484375, "loss/dpo": 0.55859375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.439453125, "rewards/margins": 0.41796875, "rewards/rejected": -0.859375, "step": 125 }, { "epoch": 0.5575221238938053, "grad_norm": 36.186500549316406, "learning_rate": 7e-07, "logits/chosen": -0.58984375, "logits/rejected": -0.6640625, "logps/chosen": -324.0, "logps/rejected": -358.0, "loss": 0.585, "loss/chosen-sft": 1.2890625, "loss/dpo": 0.54296875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6484375, "rewards/margins": 0.50390625, "rewards/rejected": -1.15625, "step": 126 }, { "epoch": 0.5619469026548672, "grad_norm": 13.623043060302734, "learning_rate": 7e-07, "logits/chosen": -0.640625, "logits/rejected": -0.66796875, "logps/chosen": -312.0, "logps/rejected": -332.0, "loss": 0.5789, "loss/chosen-sft": 1.2109375, "loss/dpo": 0.5546875, "rewards/accuracies": 0.75, "rewards/chosen": -0.38671875, "rewards/margins": 0.42578125, "rewards/rejected": -0.8125, "step": 127 }, { "epoch": 0.5663716814159292, "grad_norm": 9.796255111694336, "learning_rate": 7e-07, "logits/chosen": -0.494140625, "logits/rejected": -0.6015625, "logps/chosen": -290.0, "logps/rejected": -346.0, "loss": 0.5703, "loss/chosen-sft": 1.3203125, "loss/dpo": 0.609375, "rewards/accuracies": 0.625, "rewards/chosen": -0.65234375, "rewards/margins": 0.359375, "rewards/rejected": -1.015625, "step": 128 }, { "epoch": 0.5707964601769911, "grad_norm": 70.11766052246094, "learning_rate": 7e-07, "logits/chosen": -0.345703125, "logits/rejected": -0.353515625, "logps/chosen": -255.0, "logps/rejected": -296.0, "loss": 0.5767, "loss/chosen-sft": 1.15625, "loss/dpo": 0.48046875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3046875, "rewards/margins": 0.75, "rewards/rejected": -1.0546875, "step": 129 }, { "epoch": 0.5752212389380531, "grad_norm": 137.03662109375, "learning_rate": 7e-07, "logits/chosen": -0.095703125, "logits/rejected": -0.38671875, "logps/chosen": -212.0, "logps/rejected": -262.0, "loss": 0.5471, "loss/chosen-sft": 1.125, "loss/dpo": 0.515625, "rewards/accuracies": 0.75, "rewards/chosen": -0.328125, "rewards/margins": 0.65234375, "rewards/rejected": -0.98046875, "step": 130 }, { "epoch": 0.5796460176991151, "grad_norm": 148.0476531982422, "learning_rate": 7e-07, "logits/chosen": -0.5625, "logits/rejected": -0.59375, "logps/chosen": -270.0, "logps/rejected": -284.0, "loss": 0.5403, "loss/chosen-sft": 1.3046875, "loss/dpo": 0.6796875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.54296875, "rewards/margins": 0.134765625, "rewards/rejected": -0.6796875, "step": 131 }, { "epoch": 0.584070796460177, "grad_norm": 93.40387725830078, "learning_rate": 7e-07, "logits/chosen": -0.7109375, "logits/rejected": -0.66015625, "logps/chosen": -340.0, "logps/rejected": -350.0, "loss": 0.561, "loss/chosen-sft": 1.390625, "loss/dpo": 0.5078125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.51953125, "rewards/margins": 0.69921875, "rewards/rejected": -1.21875, "step": 132 }, { "epoch": 0.588495575221239, "grad_norm": 55.550758361816406, "learning_rate": 7e-07, "logits/chosen": -0.42578125, "logits/rejected": -0.55859375, "logps/chosen": -294.0, "logps/rejected": -296.0, "loss": 0.5627, "loss/chosen-sft": 1.140625, "loss/dpo": 0.6015625, "rewards/accuracies": 0.75, "rewards/chosen": -0.453125, "rewards/margins": 0.33203125, "rewards/rejected": -0.78515625, "step": 133 }, { "epoch": 0.5929203539823009, "grad_norm": 80.4654541015625, "learning_rate": 7e-07, "logits/chosen": -0.43359375, "logits/rejected": -0.470703125, "logps/chosen": -308.0, "logps/rejected": -338.0, "loss": 0.5481, "loss/chosen-sft": 1.1640625, "loss/dpo": 0.4765625, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3984375, "rewards/margins": 0.67578125, "rewards/rejected": -1.0703125, "step": 134 }, { "epoch": 0.5973451327433629, "grad_norm": 109.14212036132812, "learning_rate": 7e-07, "logits/chosen": -0.470703125, "logits/rejected": -0.50390625, "logps/chosen": -334.0, "logps/rejected": -304.0, "loss": 0.542, "loss/chosen-sft": 1.171875, "loss/dpo": 0.6171875, "rewards/accuracies": 0.59375, "rewards/chosen": -0.54296875, "rewards/margins": 0.2578125, "rewards/rejected": -0.80078125, "step": 135 }, { "epoch": 0.6017699115044248, "grad_norm": 37.90422821044922, "learning_rate": 7e-07, "logits/chosen": -0.421875, "logits/rejected": -0.427734375, "logps/chosen": -372.0, "logps/rejected": -298.0, "loss": 0.5493, "loss/chosen-sft": 1.3203125, "loss/dpo": 0.546875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.50390625, "rewards/margins": 0.474609375, "rewards/rejected": -0.9765625, "step": 136 }, { "epoch": 0.6061946902654868, "grad_norm": 118.1666030883789, "learning_rate": 7e-07, "logits/chosen": -0.46875, "logits/rejected": -0.515625, "logps/chosen": -326.0, "logps/rejected": -394.0, "loss": 0.5112, "loss/chosen-sft": 1.234375, "loss/dpo": 0.5390625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.419921875, "rewards/margins": 0.5703125, "rewards/rejected": -0.9921875, "step": 137 }, { "epoch": 0.6106194690265486, "grad_norm": 99.32813262939453, "learning_rate": 7e-07, "logits/chosen": -0.427734375, "logits/rejected": -0.50390625, "logps/chosen": -288.0, "logps/rejected": -268.0, "loss": 0.5508, "loss/chosen-sft": 1.3984375, "loss/dpo": 0.61328125, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5625, "rewards/margins": 0.353515625, "rewards/rejected": -0.9140625, "step": 138 }, { "epoch": 0.6150442477876106, "grad_norm": 17.352619171142578, "learning_rate": 7e-07, "logits/chosen": -0.48046875, "logits/rejected": -0.546875, "logps/chosen": -282.0, "logps/rejected": -272.0, "loss": 0.5352, "loss/chosen-sft": 1.1640625, "loss/dpo": 0.51953125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.466796875, "rewards/margins": 0.5078125, "rewards/rejected": -0.97265625, "step": 139 }, { "epoch": 0.6194690265486725, "grad_norm": 59.95145797729492, "learning_rate": 7e-07, "logits/chosen": -0.6171875, "logits/rejected": -0.6875, "logps/chosen": -302.0, "logps/rejected": -336.0, "loss": 0.5435, "loss/chosen-sft": 1.265625, "loss/dpo": 0.51953125, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4765625, "rewards/margins": 0.55078125, "rewards/rejected": -1.0234375, "step": 140 }, { "epoch": 0.6238938053097345, "grad_norm": 55.3637580871582, "learning_rate": 7e-07, "logits/chosen": -0.51953125, "logits/rejected": -0.56640625, "logps/chosen": -342.0, "logps/rejected": -382.0, "loss": 0.5469, "loss/chosen-sft": 1.3046875, "loss/dpo": 0.55078125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5546875, "rewards/margins": 0.65625, "rewards/rejected": -1.2109375, "step": 141 }, { "epoch": 0.6283185840707964, "grad_norm": 146.2696075439453, "learning_rate": 7e-07, "logits/chosen": -0.55859375, "logits/rejected": -0.62890625, "logps/chosen": -352.0, "logps/rejected": -318.0, "loss": 0.5806, "loss/chosen-sft": 1.3515625, "loss/dpo": 0.7578125, "rewards/accuracies": 0.59375, "rewards/chosen": -0.703125, "rewards/margins": 0.07275390625, "rewards/rejected": -0.77734375, "step": 142 }, { "epoch": 0.6327433628318584, "grad_norm": 46.21394729614258, "learning_rate": 7e-07, "logits/chosen": -0.578125, "logits/rejected": -0.5859375, "logps/chosen": -268.0, "logps/rejected": -296.0, "loss": 0.5393, "loss/chosen-sft": 1.265625, "loss/dpo": 0.54296875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.341796875, "rewards/margins": 0.48046875, "rewards/rejected": -0.82421875, "step": 143 }, { "epoch": 0.6371681415929203, "grad_norm": 38.909610748291016, "learning_rate": 7e-07, "logits/chosen": -0.60546875, "logits/rejected": -0.73828125, "logps/chosen": -320.0, "logps/rejected": -334.0, "loss": 0.543, "loss/chosen-sft": 1.4375, "loss/dpo": 0.55859375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6796875, "rewards/margins": 0.5234375, "rewards/rejected": -1.203125, "step": 144 }, { "epoch": 0.6415929203539823, "grad_norm": 137.8043975830078, "learning_rate": 7e-07, "logits/chosen": -0.53515625, "logits/rejected": -0.58203125, "logps/chosen": -340.0, "logps/rejected": -290.0, "loss": 0.5903, "loss/chosen-sft": 1.328125, "loss/dpo": 0.65234375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7109375, "rewards/margins": 0.333984375, "rewards/rejected": -1.046875, "step": 145 }, { "epoch": 0.6460176991150443, "grad_norm": 36.96350860595703, "learning_rate": 7e-07, "logits/chosen": -0.6953125, "logits/rejected": -0.66796875, "logps/chosen": -398.0, "logps/rejected": -382.0, "loss": 0.5288, "loss/chosen-sft": 1.34375, "loss/dpo": 0.546875, "rewards/accuracies": 0.75, "rewards/chosen": -0.84375, "rewards/margins": 0.486328125, "rewards/rejected": -1.328125, "step": 146 }, { "epoch": 0.6504424778761062, "grad_norm": 31.765138626098633, "learning_rate": 7e-07, "logits/chosen": -0.6796875, "logits/rejected": -0.66015625, "logps/chosen": -354.0, "logps/rejected": -326.0, "loss": 0.5535, "loss/chosen-sft": 1.4375, "loss/dpo": 0.5078125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.63671875, "rewards/margins": 0.5546875, "rewards/rejected": -1.1953125, "step": 147 }, { "epoch": 0.6548672566371682, "grad_norm": 73.44290924072266, "learning_rate": 7e-07, "logits/chosen": -0.609375, "logits/rejected": -0.69921875, "logps/chosen": -356.0, "logps/rejected": -350.0, "loss": 0.5137, "loss/chosen-sft": 1.3046875, "loss/dpo": 0.474609375, "rewards/accuracies": 0.75, "rewards/chosen": -0.5234375, "rewards/margins": 0.67578125, "rewards/rejected": -1.203125, "step": 148 }, { "epoch": 0.6592920353982301, "grad_norm": 14.243433952331543, "learning_rate": 7e-07, "logits/chosen": -0.50390625, "logits/rejected": -0.55078125, "logps/chosen": -326.0, "logps/rejected": -316.0, "loss": 0.5225, "loss/chosen-sft": 1.2578125, "loss/dpo": 0.48046875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.61328125, "rewards/margins": 0.765625, "rewards/rejected": -1.3828125, "step": 149 }, { "epoch": 0.6637168141592921, "grad_norm": 21.770702362060547, "learning_rate": 7e-07, "logits/chosen": -0.55078125, "logits/rejected": -0.546875, "logps/chosen": -326.0, "logps/rejected": -330.0, "loss": 0.5234, "loss/chosen-sft": 1.296875, "loss/dpo": 0.5, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4609375, "rewards/margins": 0.60546875, "rewards/rejected": -1.0703125, "step": 150 }, { "epoch": 0.668141592920354, "grad_norm": 77.819091796875, "learning_rate": 7e-07, "logits/chosen": -0.7109375, "logits/rejected": -0.6171875, "logps/chosen": -404.0, "logps/rejected": -442.0, "loss": 0.5, "loss/chosen-sft": 1.3125, "loss/dpo": 0.5078125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7734375, "rewards/margins": 0.6875, "rewards/rejected": -1.4609375, "step": 151 }, { "epoch": 0.672566371681416, "grad_norm": 37.56740951538086, "learning_rate": 7e-07, "logits/chosen": -0.3828125, "logits/rejected": -0.2275390625, "logps/chosen": -294.0, "logps/rejected": -324.0, "loss": 0.5325, "loss/chosen-sft": 1.3984375, "loss/dpo": 0.59375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.73828125, "rewards/margins": 0.52734375, "rewards/rejected": -1.265625, "step": 152 }, { "epoch": 0.6769911504424779, "grad_norm": 105.4240951538086, "learning_rate": 7e-07, "logits/chosen": -0.423828125, "logits/rejected": -0.54296875, "logps/chosen": -368.0, "logps/rejected": -390.0, "loss": 0.5544, "loss/chosen-sft": 1.28125, "loss/dpo": 0.46875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.77734375, "rewards/margins": 0.7734375, "rewards/rejected": -1.546875, "step": 153 }, { "epoch": 0.6814159292035398, "grad_norm": 56.64170837402344, "learning_rate": 7e-07, "logits/chosen": -0.5390625, "logits/rejected": -0.6640625, "logps/chosen": -358.0, "logps/rejected": -320.0, "loss": 0.5625, "loss/chosen-sft": 1.3984375, "loss/dpo": 0.5859375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7578125, "rewards/margins": 0.453125, "rewards/rejected": -1.2109375, "step": 154 }, { "epoch": 0.6858407079646017, "grad_norm": 22.23441505432129, "learning_rate": 7e-07, "logits/chosen": -0.61328125, "logits/rejected": -0.5703125, "logps/chosen": -336.0, "logps/rejected": -314.0, "loss": 0.5115, "loss/chosen-sft": 1.453125, "loss/dpo": 0.52734375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.58984375, "rewards/margins": 0.5625, "rewards/rejected": -1.15625, "step": 155 }, { "epoch": 0.6902654867256637, "grad_norm": 71.9916000366211, "learning_rate": 7e-07, "logits/chosen": -0.69921875, "logits/rejected": -0.7109375, "logps/chosen": -354.0, "logps/rejected": -338.0, "loss": 0.5227, "loss/chosen-sft": 1.40625, "loss/dpo": 0.5546875, "rewards/accuracies": 0.65625, "rewards/chosen": -0.59375, "rewards/margins": 0.6171875, "rewards/rejected": -1.2109375, "step": 156 }, { "epoch": 0.6946902654867256, "grad_norm": 11.088499069213867, "learning_rate": 7e-07, "logits/chosen": -0.48046875, "logits/rejected": -0.51171875, "logps/chosen": -328.0, "logps/rejected": -358.0, "loss": 0.5259, "loss/chosen-sft": 1.328125, "loss/dpo": 0.60546875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.84765625, "rewards/margins": 0.4609375, "rewards/rejected": -1.3125, "step": 157 }, { "epoch": 0.6991150442477876, "grad_norm": 104.77384185791016, "learning_rate": 7e-07, "logits/chosen": -0.55859375, "logits/rejected": -0.7265625, "logps/chosen": -330.0, "logps/rejected": -340.0, "loss": 0.5566, "loss/chosen-sft": 1.4296875, "loss/dpo": 0.61328125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.640625, "rewards/margins": 0.5390625, "rewards/rejected": -1.1796875, "step": 158 }, { "epoch": 0.7035398230088495, "grad_norm": 87.36003875732422, "learning_rate": 7e-07, "logits/chosen": -0.53515625, "logits/rejected": -0.55078125, "logps/chosen": -296.0, "logps/rejected": -342.0, "loss": 0.5449, "loss/chosen-sft": 1.359375, "loss/dpo": 0.58203125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8125, "rewards/margins": 0.65625, "rewards/rejected": -1.46875, "step": 159 }, { "epoch": 0.7079646017699115, "grad_norm": 37.620750427246094, "learning_rate": 7e-07, "logits/chosen": -0.455078125, "logits/rejected": -0.50390625, "logps/chosen": -376.0, "logps/rejected": -444.0, "loss": 0.5303, "loss/chosen-sft": 1.265625, "loss/dpo": 0.54296875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.87890625, "rewards/margins": 0.62109375, "rewards/rejected": -1.5, "step": 160 }, { "epoch": 0.7123893805309734, "grad_norm": 75.54209899902344, "learning_rate": 7e-07, "logits/chosen": -0.546875, "logits/rejected": -0.64453125, "logps/chosen": -348.0, "logps/rejected": -368.0, "loss": 0.5203, "loss/chosen-sft": 1.3125, "loss/dpo": 0.54296875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.86328125, "rewards/margins": 0.4921875, "rewards/rejected": -1.359375, "step": 161 }, { "epoch": 0.7168141592920354, "grad_norm": 54.434139251708984, "learning_rate": 7e-07, "logits/chosen": -0.60546875, "logits/rejected": -0.74609375, "logps/chosen": -298.0, "logps/rejected": -352.0, "loss": 0.511, "loss/chosen-sft": 1.2109375, "loss/dpo": 0.466796875, "rewards/accuracies": 0.90625, "rewards/chosen": -0.58984375, "rewards/margins": 0.67578125, "rewards/rejected": -1.265625, "step": 162 }, { "epoch": 0.7212389380530974, "grad_norm": 10.78385066986084, "learning_rate": 7e-07, "logits/chosen": -0.6875, "logits/rejected": -0.5859375, "logps/chosen": -288.0, "logps/rejected": -306.0, "loss": 0.5369, "loss/chosen-sft": 1.2734375, "loss/dpo": 0.51953125, "rewards/accuracies": 0.75, "rewards/chosen": -0.72265625, "rewards/margins": 0.65234375, "rewards/rejected": -1.375, "step": 163 }, { "epoch": 0.7256637168141593, "grad_norm": 46.15651321411133, "learning_rate": 7e-07, "logits/chosen": -0.70703125, "logits/rejected": -0.765625, "logps/chosen": -364.0, "logps/rejected": -338.0, "loss": 0.533, "loss/chosen-sft": 1.3828125, "loss/dpo": 0.50390625, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5703125, "rewards/margins": 0.6015625, "rewards/rejected": -1.171875, "step": 164 }, { "epoch": 0.7300884955752213, "grad_norm": 76.59629821777344, "learning_rate": 7e-07, "logits/chosen": -0.57421875, "logits/rejected": -0.64453125, "logps/chosen": -258.0, "logps/rejected": -324.0, "loss": 0.5183, "loss/chosen-sft": 1.1953125, "loss/dpo": 0.52734375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.58203125, "rewards/margins": 0.5390625, "rewards/rejected": -1.1171875, "step": 165 }, { "epoch": 0.7345132743362832, "grad_norm": 73.99260711669922, "learning_rate": 7e-07, "logits/chosen": -0.390625, "logits/rejected": -0.578125, "logps/chosen": -316.0, "logps/rejected": -372.0, "loss": 0.5208, "loss/chosen-sft": 1.2109375, "loss/dpo": 0.5, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8359375, "rewards/margins": 1.0, "rewards/rejected": -1.8359375, "step": 166 }, { "epoch": 0.7389380530973452, "grad_norm": 47.95753479003906, "learning_rate": 7e-07, "logits/chosen": -0.5625, "logits/rejected": -0.72265625, "logps/chosen": -356.0, "logps/rejected": -354.0, "loss": 0.4956, "loss/chosen-sft": 1.2890625, "loss/dpo": 0.6171875, "rewards/accuracies": 0.65625, "rewards/chosen": -0.66015625, "rewards/margins": 0.455078125, "rewards/rejected": -1.1171875, "step": 167 }, { "epoch": 0.7433628318584071, "grad_norm": 27.858415603637695, "learning_rate": 7e-07, "logits/chosen": -0.7109375, "logits/rejected": -0.64453125, "logps/chosen": -336.0, "logps/rejected": -332.0, "loss": 0.4971, "loss/chosen-sft": 1.4375, "loss/dpo": 0.59375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.74609375, "rewards/margins": 0.3515625, "rewards/rejected": -1.09375, "step": 168 }, { "epoch": 0.7477876106194691, "grad_norm": 113.69762420654297, "learning_rate": 7e-07, "logits/chosen": -0.79296875, "logits/rejected": -0.79296875, "logps/chosen": -420.0, "logps/rejected": -446.0, "loss": 0.4646, "loss/chosen-sft": 1.4453125, "loss/dpo": 0.439453125, "rewards/accuracies": 0.75, "rewards/chosen": -0.90625, "rewards/margins": 0.92578125, "rewards/rejected": -1.828125, "step": 169 }, { "epoch": 0.7522123893805309, "grad_norm": 92.540283203125, "learning_rate": 7e-07, "logits/chosen": -0.53125, "logits/rejected": -0.60546875, "logps/chosen": -356.0, "logps/rejected": -412.0, "loss": 0.5298, "loss/chosen-sft": 1.3828125, "loss/dpo": 0.51171875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9453125, "rewards/margins": 0.64453125, "rewards/rejected": -1.5859375, "step": 170 }, { "epoch": 0.7566371681415929, "grad_norm": 34.24614334106445, "learning_rate": 7e-07, "logits/chosen": -0.515625, "logits/rejected": -0.62109375, "logps/chosen": -336.0, "logps/rejected": -426.0, "loss": 0.4983, "loss/chosen-sft": 1.28125, "loss/dpo": 0.40625, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7734375, "rewards/margins": 1.1328125, "rewards/rejected": -1.90625, "step": 171 }, { "epoch": 0.7610619469026548, "grad_norm": 26.35588264465332, "learning_rate": 7e-07, "logits/chosen": -0.7734375, "logits/rejected": -0.79296875, "logps/chosen": -358.0, "logps/rejected": -358.0, "loss": 0.52, "loss/chosen-sft": 1.4453125, "loss/dpo": 0.435546875, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6875, "rewards/margins": 0.92578125, "rewards/rejected": -1.6171875, "step": 172 }, { "epoch": 0.7654867256637168, "grad_norm": 70.59893798828125, "learning_rate": 7e-07, "logits/chosen": -0.578125, "logits/rejected": -0.63671875, "logps/chosen": -336.0, "logps/rejected": -406.0, "loss": 0.4802, "loss/chosen-sft": 1.4609375, "loss/dpo": 0.451171875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.83984375, "rewards/margins": 1.078125, "rewards/rejected": -1.921875, "step": 173 }, { "epoch": 0.7699115044247787, "grad_norm": 35.492210388183594, "learning_rate": 7e-07, "logits/chosen": -0.7109375, "logits/rejected": -0.63671875, "logps/chosen": -326.0, "logps/rejected": -396.0, "loss": 0.4731, "loss/chosen-sft": 1.4140625, "loss/dpo": 0.490234375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.69140625, "rewards/margins": 1.109375, "rewards/rejected": -1.8046875, "step": 174 }, { "epoch": 0.7743362831858407, "grad_norm": 52.300148010253906, "learning_rate": 7e-07, "logits/chosen": -0.294921875, "logits/rejected": -0.431640625, "logps/chosen": -298.0, "logps/rejected": -330.0, "loss": 0.5232, "loss/chosen-sft": 1.3984375, "loss/dpo": 0.6796875, "rewards/accuracies": 0.625, "rewards/chosen": -0.9453125, "rewards/margins": 0.330078125, "rewards/rejected": -1.2734375, "step": 175 }, { "epoch": 0.7787610619469026, "grad_norm": 133.416748046875, "learning_rate": 7e-07, "logits/chosen": -0.59375, "logits/rejected": -0.50390625, "logps/chosen": -330.0, "logps/rejected": -342.0, "loss": 0.499, "loss/chosen-sft": 1.6796875, "loss/dpo": 0.609375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8828125, "rewards/margins": 0.33203125, "rewards/rejected": -1.21875, "step": 176 }, { "epoch": 0.7831858407079646, "grad_norm": 23.086772918701172, "learning_rate": 7e-07, "logits/chosen": -0.5625, "logits/rejected": -0.50390625, "logps/chosen": -376.0, "logps/rejected": -506.0, "loss": 0.4585, "loss/chosen-sft": 1.421875, "loss/dpo": 0.392578125, "rewards/accuracies": 0.90625, "rewards/chosen": -0.84375, "rewards/margins": 1.34375, "rewards/rejected": -2.1875, "step": 177 }, { "epoch": 0.7876106194690266, "grad_norm": 206.20298767089844, "learning_rate": 7e-07, "logits/chosen": -0.53125, "logits/rejected": -0.470703125, "logps/chosen": -276.0, "logps/rejected": -296.0, "loss": 0.5459, "loss/chosen-sft": 1.2890625, "loss/dpo": 0.5625, "rewards/accuracies": 0.65625, "rewards/chosen": -0.62890625, "rewards/margins": 0.58203125, "rewards/rejected": -1.2109375, "step": 178 }, { "epoch": 0.7920353982300885, "grad_norm": 138.48703002929688, "learning_rate": 7e-07, "logits/chosen": -0.4375, "logits/rejected": -0.51171875, "logps/chosen": -294.0, "logps/rejected": -364.0, "loss": 0.4871, "loss/chosen-sft": 1.328125, "loss/dpo": 0.470703125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.76171875, "rewards/margins": 0.7109375, "rewards/rejected": -1.46875, "step": 179 }, { "epoch": 0.7964601769911505, "grad_norm": 7.023651599884033, "learning_rate": 7e-07, "logits/chosen": -0.65625, "logits/rejected": -0.8359375, "logps/chosen": -338.0, "logps/rejected": -370.0, "loss": 0.4651, "loss/chosen-sft": 1.453125, "loss/dpo": 0.431640625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8203125, "rewards/margins": 1.015625, "rewards/rejected": -1.828125, "step": 180 }, { "epoch": 0.8008849557522124, "grad_norm": 101.0399169921875, "learning_rate": 7e-07, "logits/chosen": -0.51953125, "logits/rejected": -0.578125, "logps/chosen": -418.0, "logps/rejected": -480.0, "loss": 0.4773, "loss/chosen-sft": 1.5, "loss/dpo": 0.431640625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0625, "rewards/margins": 1.21875, "rewards/rejected": -2.28125, "step": 181 }, { "epoch": 0.8053097345132744, "grad_norm": 42.745174407958984, "learning_rate": 7e-07, "logits/chosen": -0.6484375, "logits/rejected": -0.5859375, "logps/chosen": -376.0, "logps/rejected": -418.0, "loss": 0.4871, "loss/chosen-sft": 1.5625, "loss/dpo": 0.45703125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.98828125, "rewards/margins": 1.046875, "rewards/rejected": -2.03125, "step": 182 }, { "epoch": 0.8097345132743363, "grad_norm": 65.14553833007812, "learning_rate": 7e-07, "logits/chosen": -0.6484375, "logits/rejected": -0.6953125, "logps/chosen": -298.0, "logps/rejected": -312.0, "loss": 0.4883, "loss/chosen-sft": 1.4921875, "loss/dpo": 0.5234375, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8828125, "rewards/margins": 0.640625, "rewards/rejected": -1.5234375, "step": 183 }, { "epoch": 0.8141592920353983, "grad_norm": 306.6606750488281, "learning_rate": 7e-07, "logits/chosen": -0.451171875, "logits/rejected": -0.5703125, "logps/chosen": -320.0, "logps/rejected": -420.0, "loss": 0.5232, "loss/chosen-sft": 1.3203125, "loss/dpo": 0.482421875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.79296875, "rewards/margins": 1.2421875, "rewards/rejected": -2.046875, "step": 184 }, { "epoch": 0.8185840707964602, "grad_norm": 20.622095108032227, "learning_rate": 7e-07, "logits/chosen": -0.6171875, "logits/rejected": -0.72265625, "logps/chosen": -368.0, "logps/rejected": -396.0, "loss": 0.5178, "loss/chosen-sft": 1.59375, "loss/dpo": 0.490234375, "rewards/accuracies": 0.75, "rewards/chosen": -0.76953125, "rewards/margins": 0.90625, "rewards/rejected": -1.671875, "step": 185 }, { "epoch": 0.8230088495575221, "grad_norm": 15.8814115524292, "learning_rate": 7e-07, "logits/chosen": -0.640625, "logits/rejected": -0.81640625, "logps/chosen": -350.0, "logps/rejected": -382.0, "loss": 0.5017, "loss/chosen-sft": 1.484375, "loss/dpo": 0.53515625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.83203125, "rewards/margins": 0.8125, "rewards/rejected": -1.6484375, "step": 186 }, { "epoch": 0.827433628318584, "grad_norm": 45.224002838134766, "learning_rate": 7e-07, "logits/chosen": -0.478515625, "logits/rejected": -0.404296875, "logps/chosen": -294.0, "logps/rejected": -386.0, "loss": 0.5002, "loss/chosen-sft": 1.3046875, "loss/dpo": 0.453125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.72265625, "rewards/margins": 0.83984375, "rewards/rejected": -1.5625, "step": 187 }, { "epoch": 0.831858407079646, "grad_norm": 32.42481994628906, "learning_rate": 7e-07, "logits/chosen": -0.38671875, "logits/rejected": -0.43359375, "logps/chosen": -336.0, "logps/rejected": -340.0, "loss": 0.5051, "loss/chosen-sft": 1.3359375, "loss/dpo": 0.5390625, "rewards/accuracies": 0.75, "rewards/chosen": -0.84765625, "rewards/margins": 0.5703125, "rewards/rejected": -1.4140625, "step": 188 }, { "epoch": 0.8362831858407079, "grad_norm": 54.047035217285156, "learning_rate": 7e-07, "logits/chosen": -0.5234375, "logits/rejected": -0.37890625, "logps/chosen": -290.0, "logps/rejected": -366.0, "loss": 0.4817, "loss/chosen-sft": 1.3671875, "loss/dpo": 0.54296875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.90234375, "rewards/margins": 0.8125, "rewards/rejected": -1.71875, "step": 189 }, { "epoch": 0.8407079646017699, "grad_norm": 10.179818153381348, "learning_rate": 7e-07, "logits/chosen": -0.41796875, "logits/rejected": -0.4296875, "logps/chosen": -296.0, "logps/rejected": -302.0, "loss": 0.5049, "loss/chosen-sft": 1.3984375, "loss/dpo": 0.55859375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.796875, "rewards/margins": 0.482421875, "rewards/rejected": -1.28125, "step": 190 }, { "epoch": 0.8451327433628318, "grad_norm": 63.52503204345703, "learning_rate": 7e-07, "logits/chosen": -0.76953125, "logits/rejected": -0.69921875, "logps/chosen": -356.0, "logps/rejected": -416.0, "loss": 0.5083, "loss/chosen-sft": 1.46875, "loss/dpo": 0.486328125, "rewards/accuracies": 0.84375, "rewards/chosen": -0.93359375, "rewards/margins": 0.859375, "rewards/rejected": -1.7890625, "step": 191 }, { "epoch": 0.8495575221238938, "grad_norm": 64.59293365478516, "learning_rate": 7e-07, "logits/chosen": -0.4609375, "logits/rejected": -0.474609375, "logps/chosen": -378.0, "logps/rejected": -392.0, "loss": 0.4697, "loss/chosen-sft": 1.3203125, "loss/dpo": 0.47265625, "rewards/accuracies": 0.84375, "rewards/chosen": -0.80078125, "rewards/margins": 0.87109375, "rewards/rejected": -1.671875, "step": 192 }, { "epoch": 0.8539823008849557, "grad_norm": 48.203311920166016, "learning_rate": 7e-07, "logits/chosen": -0.3046875, "logits/rejected": -0.31640625, "logps/chosen": -308.0, "logps/rejected": -390.0, "loss": 0.5132, "loss/chosen-sft": 1.171875, "loss/dpo": 0.4375, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6796875, "rewards/margins": 0.9140625, "rewards/rejected": -1.59375, "step": 193 }, { "epoch": 0.8584070796460177, "grad_norm": 86.9441909790039, "learning_rate": 7e-07, "logits/chosen": -0.435546875, "logits/rejected": -0.4609375, "logps/chosen": -296.0, "logps/rejected": -358.0, "loss": 0.4749, "loss/chosen-sft": 1.2734375, "loss/dpo": 0.44140625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6484375, "rewards/margins": 0.94921875, "rewards/rejected": -1.59375, "step": 194 }, { "epoch": 0.8628318584070797, "grad_norm": 108.06549835205078, "learning_rate": 7e-07, "logits/chosen": -0.4453125, "logits/rejected": -0.5234375, "logps/chosen": -342.0, "logps/rejected": -404.0, "loss": 0.4771, "loss/chosen-sft": 1.4453125, "loss/dpo": 0.65234375, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9921875, "rewards/margins": 0.64453125, "rewards/rejected": -1.640625, "step": 195 }, { "epoch": 0.8672566371681416, "grad_norm": 173.70831298828125, "learning_rate": 7e-07, "logits/chosen": -0.58984375, "logits/rejected": -0.62890625, "logps/chosen": -368.0, "logps/rejected": -400.0, "loss": 0.4434, "loss/chosen-sft": 1.46875, "loss/dpo": 0.44140625, "rewards/accuracies": 0.75, "rewards/chosen": -0.9453125, "rewards/margins": 1.046875, "rewards/rejected": -1.984375, "step": 196 }, { "epoch": 0.8716814159292036, "grad_norm": 41.173553466796875, "learning_rate": 7e-07, "logits/chosen": -0.60546875, "logits/rejected": -0.671875, "logps/chosen": -392.0, "logps/rejected": -470.0, "loss": 0.4729, "loss/chosen-sft": 1.5, "loss/dpo": 0.44140625, "rewards/accuracies": 0.84375, "rewards/chosen": -1.140625, "rewards/margins": 1.1015625, "rewards/rejected": -2.25, "step": 197 }, { "epoch": 0.8761061946902655, "grad_norm": 14.614328384399414, "learning_rate": 7e-07, "logits/chosen": -0.68359375, "logits/rejected": -0.76171875, "logps/chosen": -360.0, "logps/rejected": -384.0, "loss": 0.5215, "loss/chosen-sft": 1.484375, "loss/dpo": 0.48828125, "rewards/accuracies": 0.78125, "rewards/chosen": -1.046875, "rewards/margins": 0.91015625, "rewards/rejected": -1.953125, "step": 198 }, { "epoch": 0.8805309734513275, "grad_norm": 86.90143585205078, "learning_rate": 7e-07, "logits/chosen": -0.8046875, "logits/rejected": -0.81640625, "logps/chosen": -454.0, "logps/rejected": -482.0, "loss": 0.541, "loss/chosen-sft": 1.7578125, "loss/dpo": 0.55859375, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3828125, "rewards/margins": 1.0625, "rewards/rejected": -2.4375, "step": 199 }, { "epoch": 0.8849557522123894, "grad_norm": 24.392606735229492, "learning_rate": 7e-07, "logits/chosen": -0.6171875, "logits/rejected": -0.73828125, "logps/chosen": -326.0, "logps/rejected": -384.0, "loss": 0.4583, "loss/chosen-sft": 1.59375, "loss/dpo": 0.50390625, "rewards/accuracies": 0.71875, "rewards/chosen": -0.953125, "rewards/margins": 0.99609375, "rewards/rejected": -1.953125, "step": 200 }, { "epoch": 0.8893805309734514, "grad_norm": 67.55127716064453, "learning_rate": 7e-07, "logits/chosen": -0.5078125, "logits/rejected": -0.5546875, "logps/chosen": -408.0, "logps/rejected": -484.0, "loss": 0.5017, "loss/chosen-sft": 1.5625, "loss/dpo": 0.494140625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1328125, "rewards/margins": 1.25, "rewards/rejected": -2.375, "step": 201 }, { "epoch": 0.8938053097345132, "grad_norm": 35.01057434082031, "learning_rate": 7e-07, "logits/chosen": -0.69921875, "logits/rejected": -0.640625, "logps/chosen": -336.0, "logps/rejected": -458.0, "loss": 0.5005, "loss/chosen-sft": 1.4609375, "loss/dpo": 0.478515625, "rewards/accuracies": 0.875, "rewards/chosen": -0.75390625, "rewards/margins": 1.3046875, "rewards/rejected": -2.0625, "step": 202 }, { "epoch": 0.8982300884955752, "grad_norm": 24.339378356933594, "learning_rate": 7e-07, "logits/chosen": -0.408203125, "logits/rejected": -0.435546875, "logps/chosen": -310.0, "logps/rejected": -408.0, "loss": 0.4685, "loss/chosen-sft": 1.28125, "loss/dpo": 0.455078125, "rewards/accuracies": 0.75, "rewards/chosen": -0.73828125, "rewards/margins": 1.234375, "rewards/rejected": -1.9765625, "step": 203 }, { "epoch": 0.9026548672566371, "grad_norm": 135.99609375, "learning_rate": 7e-07, "logits/chosen": -0.498046875, "logits/rejected": -0.5625, "logps/chosen": -314.0, "logps/rejected": -396.0, "loss": 0.4561, "loss/chosen-sft": 1.3125, "loss/dpo": 0.47265625, "rewards/accuracies": 0.78125, "rewards/chosen": -0.62109375, "rewards/margins": 0.9140625, "rewards/rejected": -1.53125, "step": 204 }, { "epoch": 0.9070796460176991, "grad_norm": 137.96900939941406, "learning_rate": 7e-07, "logits/chosen": -0.50390625, "logits/rejected": -0.7109375, "logps/chosen": -330.0, "logps/rejected": -432.0, "loss": 0.5061, "loss/chosen-sft": 1.3359375, "loss/dpo": 0.470703125, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7109375, "rewards/margins": 0.953125, "rewards/rejected": -1.6640625, "step": 205 }, { "epoch": 0.911504424778761, "grad_norm": 165.43182373046875, "learning_rate": 7e-07, "logits/chosen": -0.4609375, "logits/rejected": -0.3828125, "logps/chosen": -260.0, "logps/rejected": -288.0, "loss": 0.5105, "loss/chosen-sft": 1.265625, "loss/dpo": 0.5625, "rewards/accuracies": 0.625, "rewards/chosen": -0.609375, "rewards/margins": 0.494140625, "rewards/rejected": -1.109375, "step": 206 }, { "epoch": 0.915929203539823, "grad_norm": 216.99935913085938, "learning_rate": 7e-07, "logits/chosen": -0.43359375, "logits/rejected": -0.59375, "logps/chosen": -308.0, "logps/rejected": -344.0, "loss": 0.4929, "loss/chosen-sft": 1.3125, "loss/dpo": 0.466796875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.57421875, "rewards/margins": 0.75390625, "rewards/rejected": -1.328125, "step": 207 }, { "epoch": 0.9203539823008849, "grad_norm": 51.624351501464844, "learning_rate": 7e-07, "logits/chosen": -0.5234375, "logits/rejected": -0.6328125, "logps/chosen": -304.0, "logps/rejected": -286.0, "loss": 0.5669, "loss/chosen-sft": 1.4140625, "loss/dpo": 0.59375, "rewards/accuracies": 0.65625, "rewards/chosen": -0.60546875, "rewards/margins": 0.314453125, "rewards/rejected": -0.91796875, "step": 208 }, { "epoch": 0.9247787610619469, "grad_norm": 29.490407943725586, "learning_rate": 7e-07, "logits/chosen": -0.55859375, "logits/rejected": -0.5703125, "logps/chosen": -366.0, "logps/rejected": -350.0, "loss": 0.5042, "loss/chosen-sft": 1.234375, "loss/dpo": 0.46484375, "rewards/accuracies": 0.875, "rewards/chosen": -0.451171875, "rewards/margins": 0.796875, "rewards/rejected": -1.25, "step": 209 }, { "epoch": 0.9292035398230089, "grad_norm": 159.65997314453125, "learning_rate": 7e-07, "logits/chosen": -0.67578125, "logits/rejected": -0.55859375, "logps/chosen": -340.0, "logps/rejected": -346.0, "loss": 0.5374, "loss/chosen-sft": 1.3125, "loss/dpo": 0.474609375, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6328125, "rewards/margins": 0.83984375, "rewards/rejected": -1.46875, "step": 210 }, { "epoch": 0.9336283185840708, "grad_norm": 22.276451110839844, "learning_rate": 7e-07, "logits/chosen": -0.41015625, "logits/rejected": -0.46875, "logps/chosen": -308.0, "logps/rejected": -324.0, "loss": 0.5066, "loss/chosen-sft": 1.359375, "loss/dpo": 0.57421875, "rewards/accuracies": 0.75, "rewards/chosen": -0.78515625, "rewards/margins": 0.53515625, "rewards/rejected": -1.3203125, "step": 211 }, { "epoch": 0.9380530973451328, "grad_norm": 62.60710525512695, "learning_rate": 7e-07, "logits/chosen": -0.68359375, "logits/rejected": -0.76171875, "logps/chosen": -378.0, "logps/rejected": -396.0, "loss": 0.4917, "loss/chosen-sft": 1.5, "loss/dpo": 0.578125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.94921875, "rewards/margins": 0.8984375, "rewards/rejected": -1.8515625, "step": 212 }, { "epoch": 0.9424778761061947, "grad_norm": 183.3363037109375, "learning_rate": 7e-07, "logits/chosen": -0.62109375, "logits/rejected": -0.71875, "logps/chosen": -328.0, "logps/rejected": -326.0, "loss": 0.426, "loss/chosen-sft": 1.375, "loss/dpo": 0.47265625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.80078125, "rewards/margins": 0.8046875, "rewards/rejected": -1.609375, "step": 213 }, { "epoch": 0.9469026548672567, "grad_norm": 182.4455108642578, "learning_rate": 7e-07, "logits/chosen": -0.62890625, "logits/rejected": -0.57421875, "logps/chosen": -358.0, "logps/rejected": -480.0, "loss": 0.5305, "loss/chosen-sft": 1.4765625, "loss/dpo": 0.5234375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.80078125, "rewards/margins": 1.609375, "rewards/rejected": -2.40625, "step": 214 }, { "epoch": 0.9513274336283186, "grad_norm": 112.33076477050781, "learning_rate": 7e-07, "logits/chosen": -0.52734375, "logits/rejected": -0.58203125, "logps/chosen": -344.0, "logps/rejected": -448.0, "loss": 0.4475, "loss/chosen-sft": 1.3359375, "loss/dpo": 0.3359375, "rewards/accuracies": 0.90625, "rewards/chosen": -0.61328125, "rewards/margins": 1.4453125, "rewards/rejected": -2.0625, "step": 215 }, { "epoch": 0.9557522123893806, "grad_norm": 125.87716674804688, "learning_rate": 7e-07, "logits/chosen": -0.4296875, "logits/rejected": -0.47265625, "logps/chosen": -314.0, "logps/rejected": -462.0, "loss": 0.4885, "loss/chosen-sft": 1.5078125, "loss/dpo": 0.365234375, "rewards/accuracies": 0.875, "rewards/chosen": -0.84375, "rewards/margins": 1.5546875, "rewards/rejected": -2.390625, "step": 216 }, { "epoch": 0.9601769911504425, "grad_norm": 159.3242950439453, "learning_rate": 7e-07, "logits/chosen": -0.53515625, "logits/rejected": -0.53125, "logps/chosen": -330.0, "logps/rejected": -398.0, "loss": 0.4597, "loss/chosen-sft": 1.34375, "loss/dpo": 0.4140625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.765625, "rewards/margins": 1.171875, "rewards/rejected": -1.9453125, "step": 217 }, { "epoch": 0.9646017699115044, "grad_norm": 36.168601989746094, "learning_rate": 7e-07, "logits/chosen": -0.451171875, "logits/rejected": -0.44140625, "logps/chosen": -326.0, "logps/rejected": -366.0, "loss": 0.4858, "loss/chosen-sft": 1.3203125, "loss/dpo": 0.52734375, "rewards/accuracies": 0.78125, "rewards/chosen": -0.953125, "rewards/margins": 0.94921875, "rewards/rejected": -1.8984375, "step": 218 }, { "epoch": 0.9690265486725663, "grad_norm": 19.605365753173828, "learning_rate": 7e-07, "logits/chosen": -0.4921875, "logits/rejected": -0.55859375, "logps/chosen": -284.0, "logps/rejected": -370.0, "loss": 0.4822, "loss/chosen-sft": 1.3046875, "loss/dpo": 0.40234375, "rewards/accuracies": 0.84375, "rewards/chosen": -0.90234375, "rewards/margins": 1.046875, "rewards/rejected": -1.9453125, "step": 219 }, { "epoch": 0.9734513274336283, "grad_norm": 143.3219451904297, "learning_rate": 7e-07, "logits/chosen": -0.37890625, "logits/rejected": -0.4765625, "logps/chosen": -366.0, "logps/rejected": -510.0, "loss": 0.509, "loss/chosen-sft": 1.515625, "loss/dpo": 0.54296875, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2734375, "rewards/margins": 1.03125, "rewards/rejected": -2.296875, "step": 220 }, { "epoch": 0.9778761061946902, "grad_norm": 268.8928527832031, "learning_rate": 7e-07, "logits/chosen": -0.392578125, "logits/rejected": -0.30859375, "logps/chosen": -249.0, "logps/rejected": -414.0, "loss": 0.5212, "loss/chosen-sft": 1.3984375, "loss/dpo": 0.404296875, "rewards/accuracies": 0.875, "rewards/chosen": -0.78125, "rewards/margins": 1.53125, "rewards/rejected": -2.3125, "step": 221 }, { "epoch": 0.9823008849557522, "grad_norm": 295.1648864746094, "learning_rate": 7e-07, "logits/chosen": -0.56640625, "logits/rejected": -0.6328125, "logps/chosen": -368.0, "logps/rejected": -496.0, "loss": 0.5183, "loss/chosen-sft": 1.5, "loss/dpo": 0.5, "rewards/accuracies": 0.75, "rewards/chosen": -1.265625, "rewards/margins": 0.96875, "rewards/rejected": -2.234375, "step": 222 }, { "epoch": 0.9867256637168141, "grad_norm": 21.13129425048828, "learning_rate": 7e-07, "logits/chosen": -0.498046875, "logits/rejected": -0.6328125, "logps/chosen": -346.0, "logps/rejected": -374.0, "loss": 0.4485, "loss/chosen-sft": 1.5234375, "loss/dpo": 0.5234375, "rewards/accuracies": 0.75, "rewards/chosen": -1.0390625, "rewards/margins": 0.984375, "rewards/rejected": -2.03125, "step": 223 }, { "epoch": 0.9911504424778761, "grad_norm": 199.18679809570312, "learning_rate": 7e-07, "logits/chosen": -0.62890625, "logits/rejected": -0.625, "logps/chosen": -338.0, "logps/rejected": -406.0, "loss": 0.4338, "loss/chosen-sft": 1.484375, "loss/dpo": 0.5, "rewards/accuracies": 0.75, "rewards/chosen": -1.109375, "rewards/margins": 0.94140625, "rewards/rejected": -2.046875, "step": 224 }, { "epoch": 0.995575221238938, "grad_norm": 238.7730255126953, "learning_rate": 7e-07, "logits/chosen": -0.478515625, "logits/rejected": -0.30078125, "logps/chosen": -300.0, "logps/rejected": -416.0, "loss": 0.4624, "loss/chosen-sft": 1.4765625, "loss/dpo": 0.451171875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9609375, "rewards/margins": 1.2890625, "rewards/rejected": -2.25, "step": 225 }, { "epoch": 1.0, "grad_norm": 94.32278442382812, "learning_rate": 7e-07, "logits/chosen": -0.482421875, "logits/rejected": -0.55078125, "logps/chosen": -312.0, "logps/rejected": -424.0, "loss": 0.4558, "loss/chosen-sft": 1.359375, "loss/dpo": 0.478515625, "rewards/accuracies": 0.75, "rewards/chosen": -0.828125, "rewards/margins": 1.109375, "rewards/rejected": -1.9296875, "step": 226 }, { "epoch": 1.0, "step": 226, "total_flos": 0.0, "train_loss": 0.5733826223727876, "train_runtime": 2164.3688, "train_samples_per_second": 26.647, "train_steps_per_second": 0.104 } ], "logging_steps": 1, "max_steps": 226, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }