{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988221436984688, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020939667582777124, "grad_norm": 3.542720317840576, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -0.4924379587173462, "logits/rejected": -0.5068457126617432, "logps/chosen": -312.7522277832031, "logps/ref_chosen": -312.7147216796875, "logps/ref_rejected": -253.63491821289062, "logps/rejected": -253.51414489746094, "loss": 0.6933, "margin_dpo/margin_mean": -0.15826064348220825, "margin_dpo/margin_std": 0.4408723711967468, "step": 1 }, { "epoch": 0.010469833791388562, "grad_norm": 3.7683515548706055, "learning_rate": 5.208333333333333e-08, "logits/chosen": -0.5594870448112488, "logits/rejected": -0.561593234539032, "logps/chosen": -302.042236328125, "logps/ref_chosen": -302.113525390625, "logps/ref_rejected": -265.921630859375, "logps/rejected": -265.81756591796875, "loss": 0.6936, "margin_dpo/margin_mean": -0.032741278409957886, "margin_dpo/margin_std": 0.47709810733795166, "step": 5 }, { "epoch": 0.020939667582777124, "grad_norm": 3.2978525161743164, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -0.6263613700866699, "logits/rejected": -0.6366501450538635, "logps/chosen": -289.3572998046875, "logps/ref_chosen": -289.3254699707031, "logps/ref_rejected": -265.15252685546875, "logps/rejected": -265.20623779296875, "loss": 0.693, "margin_dpo/margin_mean": 0.021922003477811813, "margin_dpo/margin_std": 0.3989306092262268, "step": 10 }, { "epoch": 0.031409501374165684, "grad_norm": 3.729114532470703, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -0.5729600191116333, "logits/rejected": -0.6039099097251892, "logps/chosen": -259.98077392578125, "logps/ref_chosen": -259.9504699707031, "logps/ref_rejected": -240.0460205078125, "logps/rejected": -240.0342254638672, "loss": 0.6932, "margin_dpo/margin_mean": -0.042098261415958405, "margin_dpo/margin_std": 0.3472225069999695, "step": 15 }, { "epoch": 0.04187933516555425, "grad_norm": 3.6892592906951904, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.5994583368301392, "logits/rejected": -0.6448055505752563, "logps/chosen": -313.4446105957031, "logps/ref_chosen": -313.56878662109375, "logps/ref_rejected": -242.9326629638672, "logps/rejected": -242.9879608154297, "loss": 0.6929, "margin_dpo/margin_mean": 0.17950855195522308, "margin_dpo/margin_std": 0.39689332246780396, "step": 20 }, { "epoch": 0.05234916895694281, "grad_norm": 3.113755941390991, "learning_rate": 2.604166666666667e-07, "logits/chosen": -0.5944957733154297, "logits/rejected": -0.6160570383071899, "logps/chosen": -268.8236999511719, "logps/ref_chosen": -269.0141296386719, "logps/ref_rejected": -249.57373046875, "logps/rejected": -249.5041046142578, "loss": 0.6925, "margin_dpo/margin_mean": 0.12079717963933945, "margin_dpo/margin_std": 0.354322612285614, "step": 25 }, { "epoch": 0.06281900274833137, "grad_norm": 3.619852304458618, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -0.5969554781913757, "logits/rejected": -0.6033689975738525, "logps/chosen": -248.0936279296875, "logps/ref_chosen": -248.4279327392578, "logps/ref_rejected": -223.625, "logps/rejected": -223.48489379882812, "loss": 0.6922, "margin_dpo/margin_mean": 0.19413962960243225, "margin_dpo/margin_std": 0.42593711614608765, "step": 30 }, { "epoch": 0.07328883653971993, "grad_norm": 3.3220112323760986, "learning_rate": 3.645833333333333e-07, "logits/chosen": -0.5999919772148132, "logits/rejected": -0.621133029460907, "logps/chosen": -292.1496887207031, "logps/ref_chosen": -292.8326721191406, "logps/ref_rejected": -266.7723388671875, "logps/rejected": -266.6192932128906, "loss": 0.6911, "margin_dpo/margin_mean": 0.5299659371376038, "margin_dpo/margin_std": 0.7171144485473633, "step": 35 }, { "epoch": 0.0837586703311085, "grad_norm": 3.3946120738983154, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.588017463684082, "logits/rejected": -0.6025812029838562, "logps/chosen": -275.02734375, "logps/ref_chosen": -276.1731262207031, "logps/ref_rejected": -270.0716552734375, "logps/rejected": -269.6761474609375, "loss": 0.6897, "margin_dpo/margin_mean": 0.7502254247665405, "margin_dpo/margin_std": 0.9299477338790894, "step": 40 }, { "epoch": 0.09422850412249706, "grad_norm": 3.5746335983276367, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -0.5573989748954773, "logits/rejected": -0.6107737421989441, "logps/chosen": -272.93560791015625, "logps/ref_chosen": -274.55963134765625, "logps/ref_rejected": -255.4543914794922, "logps/rejected": -254.9309844970703, "loss": 0.6877, "margin_dpo/margin_mean": 1.1005710363388062, "margin_dpo/margin_std": 1.3177019357681274, "step": 45 }, { "epoch": 0.10469833791388562, "grad_norm": 3.53144907951355, "learning_rate": 4.999731868769026e-07, "logits/chosen": -0.5910452604293823, "logits/rejected": -0.6201281547546387, "logps/chosen": -251.3848876953125, "logps/ref_chosen": -253.75625610351562, "logps/ref_rejected": -281.420654296875, "logps/rejected": -280.2291564941406, "loss": 0.6869, "margin_dpo/margin_mean": 1.1799527406692505, "margin_dpo/margin_std": 1.7736713886260986, "step": 50 }, { "epoch": 0.11516817170527417, "grad_norm": 2.9852986335754395, "learning_rate": 4.996716052911017e-07, "logits/chosen": -0.6343711018562317, "logits/rejected": -0.6835486888885498, "logps/chosen": -296.74749755859375, "logps/ref_chosen": -299.85479736328125, "logps/ref_rejected": -257.32342529296875, "logps/rejected": -256.5378112792969, "loss": 0.6831, "margin_dpo/margin_mean": 2.3216612339019775, "margin_dpo/margin_std": 2.748133420944214, "step": 55 }, { "epoch": 0.12563800549666274, "grad_norm": 3.400481939315796, "learning_rate": 4.990353313429303e-07, "logits/chosen": -0.6267572641372681, "logits/rejected": -0.6378194093704224, "logps/chosen": -272.1562805175781, "logps/ref_chosen": -274.9129638671875, "logps/ref_rejected": -237.11660766601562, "logps/rejected": -236.6967010498047, "loss": 0.6802, "margin_dpo/margin_mean": 2.336799144744873, "margin_dpo/margin_std": 3.691256284713745, "step": 60 }, { "epoch": 0.1361078392880513, "grad_norm": 3.4638543128967285, "learning_rate": 4.980652179769217e-07, "logits/chosen": -0.6442904472351074, "logits/rejected": -0.676816463470459, "logps/chosen": -309.783935546875, "logps/ref_chosen": -313.6683044433594, "logps/ref_rejected": -299.89239501953125, "logps/rejected": -299.6600341796875, "loss": 0.6775, "margin_dpo/margin_mean": 3.6519775390625, "margin_dpo/margin_std": 4.225631237030029, "step": 65 }, { "epoch": 0.14657767307943986, "grad_norm": 3.5650267601013184, "learning_rate": 4.967625656594781e-07, "logits/chosen": -0.705890417098999, "logits/rejected": -0.7243223190307617, "logps/chosen": -277.22039794921875, "logps/ref_chosen": -280.58587646484375, "logps/ref_rejected": -250.37631225585938, "logps/rejected": -251.58987426757812, "loss": 0.6721, "margin_dpo/margin_mean": 4.578995704650879, "margin_dpo/margin_std": 5.895940780639648, "step": 70 }, { "epoch": 0.1570475068708284, "grad_norm": 3.530616044998169, "learning_rate": 4.951291206355559e-07, "logits/chosen": -0.6583966612815857, "logits/rejected": -0.6398724317550659, "logps/chosen": -304.5086364746094, "logps/ref_chosen": -307.75555419921875, "logps/ref_rejected": -285.0672302246094, "logps/rejected": -286.8878173828125, "loss": 0.6705, "margin_dpo/margin_mean": 5.0675201416015625, "margin_dpo/margin_std": 5.583609580993652, "step": 75 }, { "epoch": 0.167517340662217, "grad_norm": 3.5915329456329346, "learning_rate": 4.93167072587771e-07, "logits/chosen": -0.6411532759666443, "logits/rejected": -0.6395535469055176, "logps/chosen": -238.50308227539062, "logps/ref_chosen": -240.4182891845703, "logps/ref_rejected": -242.6591033935547, "logps/rejected": -246.6568145751953, "loss": 0.6613, "margin_dpo/margin_mean": 5.9129133224487305, "margin_dpo/margin_std": 6.689321994781494, "step": 80 }, { "epoch": 0.17798717445360554, "grad_norm": 3.667856454849243, "learning_rate": 4.908790517010636e-07, "logits/chosen": -0.7155531644821167, "logits/rejected": -0.7152854800224304, "logps/chosen": -284.5023193359375, "logps/ref_chosen": -284.64874267578125, "logps/ref_rejected": -262.38665771484375, "logps/rejected": -268.3260803222656, "loss": 0.6634, "margin_dpo/margin_mean": 6.085813999176025, "margin_dpo/margin_std": 8.766670227050781, "step": 85 }, { "epoch": 0.18845700824499412, "grad_norm": 3.6875975131988525, "learning_rate": 4.882681251368548e-07, "logits/chosen": -0.7377266883850098, "logits/rejected": -0.7434552311897278, "logps/chosen": -292.61492919921875, "logps/ref_chosen": -291.6158447265625, "logps/ref_rejected": -234.08114624023438, "logps/rejected": -244.12353515625, "loss": 0.6552, "margin_dpo/margin_mean": 9.043306350708008, "margin_dpo/margin_std": 10.449368476867676, "step": 90 }, { "epoch": 0.19892684203638267, "grad_norm": 3.9781625270843506, "learning_rate": 4.853377929214243e-07, "logits/chosen": -0.6916964650154114, "logits/rejected": -0.6866524815559387, "logps/chosen": -264.53924560546875, "logps/ref_chosen": -260.4999084472656, "logps/ref_rejected": -263.0595703125, "logps/rejected": -276.92669677734375, "loss": 0.6545, "margin_dpo/margin_mean": 9.827803611755371, "margin_dpo/margin_std": 11.283980369567871, "step": 95 }, { "epoch": 0.20939667582777124, "grad_norm": 4.604659080505371, "learning_rate": 4.820919832540181e-07, "logits/chosen": -0.7635897397994995, "logits/rejected": -0.7725192904472351, "logps/chosen": -295.8533630371094, "logps/ref_chosen": -291.0201110839844, "logps/ref_rejected": -261.11016845703125, "logps/rejected": -277.1815490722656, "loss": 0.6459, "margin_dpo/margin_mean": 11.238081932067871, "margin_dpo/margin_std": 11.718504905700684, "step": 100 }, { "epoch": 0.2198665096191598, "grad_norm": 4.310611724853516, "learning_rate": 4.785350472409791e-07, "logits/chosen": -0.7497888803482056, "logits/rejected": -0.7694140672683716, "logps/chosen": -306.2271423339844, "logps/ref_chosen": -295.6263732910156, "logps/ref_rejected": -272.5986022949219, "logps/rejected": -296.2708740234375, "loss": 0.6352, "margin_dpo/margin_mean": 13.071493148803711, "margin_dpo/margin_std": 14.635180473327637, "step": 105 }, { "epoch": 0.23033634341054834, "grad_norm": 4.162095546722412, "learning_rate": 4.7467175306295647e-07, "logits/chosen": -0.7169866561889648, "logits/rejected": -0.729865312576294, "logps/chosen": -297.9765319824219, "logps/ref_chosen": -286.58831787109375, "logps/ref_rejected": -240.3873291015625, "logps/rejected": -266.1390380859375, "loss": 0.621, "margin_dpo/margin_mean": 14.363420486450195, "margin_dpo/margin_std": 18.693180084228516, "step": 110 }, { "epoch": 0.24080617720193692, "grad_norm": 5.201338291168213, "learning_rate": 4.70507279583015e-07, "logits/chosen": -0.8167451620101929, "logits/rejected": -0.7936628460884094, "logps/chosen": -306.8990478515625, "logps/ref_chosen": -292.059326171875, "logps/ref_rejected": -272.7640075683594, "logps/rejected": -306.86248779296875, "loss": 0.6289, "margin_dpo/margin_mean": 19.258747100830078, "margin_dpo/margin_std": 19.82230567932129, "step": 115 }, { "epoch": 0.25127601099332547, "grad_norm": 5.098325729370117, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -0.8128940463066101, "logits/rejected": -0.8098734021186829, "logps/chosen": -303.0677795410156, "logps/ref_chosen": -275.99810791015625, "logps/ref_rejected": -260.53021240234375, "logps/rejected": -307.44207763671875, "loss": 0.6248, "margin_dpo/margin_mean": 19.842172622680664, "margin_dpo/margin_std": 22.20087242126465, "step": 120 }, { "epoch": 0.261745844784714, "grad_norm": 6.16968297958374, "learning_rate": 4.612975213859487e-07, "logits/chosen": -0.8130077123641968, "logits/rejected": -0.7779678106307983, "logps/chosen": -292.06878662109375, "logps/ref_chosen": -264.0341796875, "logps/ref_rejected": -267.7076721191406, "logps/rejected": -317.40997314453125, "loss": 0.6225, "margin_dpo/margin_mean": 21.667631149291992, "margin_dpo/margin_std": 26.292369842529297, "step": 125 }, { "epoch": 0.2722156785761026, "grad_norm": 7.582566738128662, "learning_rate": 4.5626458262912735e-07, "logits/chosen": -0.8276444673538208, "logits/rejected": -0.8278166055679321, "logps/chosen": -331.5013732910156, "logps/ref_chosen": -311.31207275390625, "logps/ref_rejected": -292.29461669921875, "logps/rejected": -346.29656982421875, "loss": 0.5979, "margin_dpo/margin_mean": 33.81269836425781, "margin_dpo/margin_std": 27.7309513092041, "step": 130 }, { "epoch": 0.2826855123674912, "grad_norm": 8.18915843963623, "learning_rate": 4.5095513994085974e-07, "logits/chosen": -0.8242789506912231, "logits/rejected": -0.8151847124099731, "logps/chosen": -298.26580810546875, "logps/ref_chosen": -272.9286193847656, "logps/ref_rejected": -282.5295104980469, "logps/rejected": -338.06854248046875, "loss": 0.6061, "margin_dpo/margin_mean": 30.201824188232422, "margin_dpo/margin_std": 30.407756805419922, "step": 135 }, { "epoch": 0.2931553461588797, "grad_norm": 6.744533061981201, "learning_rate": 4.453763107901675e-07, "logits/chosen": -0.8922699689865112, "logits/rejected": -0.8769844174385071, "logps/chosen": -340.090576171875, "logps/ref_chosen": -306.9497985839844, "logps/ref_rejected": -266.4873352050781, "logps/rejected": -330.78497314453125, "loss": 0.6056, "margin_dpo/margin_mean": 31.1568546295166, "margin_dpo/margin_std": 35.5713996887207, "step": 140 }, { "epoch": 0.3036251799502683, "grad_norm": 7.777499198913574, "learning_rate": 4.395355737667985e-07, "logits/chosen": -0.8868590593338013, "logits/rejected": -0.8802726864814758, "logps/chosen": -321.41265869140625, "logps/ref_chosen": -292.8253479003906, "logps/ref_rejected": -224.4563751220703, "logps/rejected": -293.2380065917969, "loss": 0.5814, "margin_dpo/margin_mean": 40.19435501098633, "margin_dpo/margin_std": 34.86083984375, "step": 145 }, { "epoch": 0.3140950137416568, "grad_norm": 8.168221473693848, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -0.9321554899215698, "logits/rejected": -0.894702136516571, "logps/chosen": -332.61553955078125, "logps/ref_chosen": -280.79345703125, "logps/ref_rejected": -287.8544921875, "logps/rejected": -368.8316955566406, "loss": 0.5853, "margin_dpo/margin_mean": 29.1551456451416, "margin_dpo/margin_std": 36.90970230102539, "step": 150 }, { "epoch": 0.32456484753304543, "grad_norm": 9.480332374572754, "learning_rate": 4.271000354423425e-07, "logits/chosen": -0.8461268544197083, "logits/rejected": -0.8421329259872437, "logps/chosen": -336.14678955078125, "logps/ref_chosen": -285.0645446777344, "logps/ref_rejected": -272.604736328125, "logps/rejected": -374.5640869140625, "loss": 0.55, "margin_dpo/margin_mean": 50.87714767456055, "margin_dpo/margin_std": 34.851402282714844, "step": 155 }, { "epoch": 0.335034681324434, "grad_norm": 11.324240684509277, "learning_rate": 4.2052190435769554e-07, "logits/chosen": -0.8743706941604614, "logits/rejected": -0.8669834136962891, "logps/chosen": -324.98382568359375, "logps/ref_chosen": -266.2510681152344, "logps/ref_rejected": -258.3799743652344, "logps/rejected": -363.7251892089844, "loss": 0.5611, "margin_dpo/margin_mean": 46.61243438720703, "margin_dpo/margin_std": 40.98955535888672, "step": 160 }, { "epoch": 0.34550451511582253, "grad_norm": 12.692601203918457, "learning_rate": 4.137151834863213e-07, "logits/chosen": -0.9032427668571472, "logits/rejected": -0.869240403175354, "logps/chosen": -349.10186767578125, "logps/ref_chosen": -284.22137451171875, "logps/ref_rejected": -255.17459106445312, "logps/rejected": -358.26324462890625, "loss": 0.5621, "margin_dpo/margin_mean": 38.20818328857422, "margin_dpo/margin_std": 44.2523193359375, "step": 165 }, { "epoch": 0.3559743489072111, "grad_norm": 8.949703216552734, "learning_rate": 4.0668899744407567e-07, "logits/chosen": -0.867663562297821, "logits/rejected": -0.8652926683425903, "logps/chosen": -369.1268005371094, "logps/ref_chosen": -312.779541015625, "logps/ref_rejected": -238.9472198486328, "logps/rejected": -345.5440673828125, "loss": 0.5535, "margin_dpo/margin_mean": 50.249610900878906, "margin_dpo/margin_std": 41.97388458251953, "step": 170 }, { "epoch": 0.3664441826985997, "grad_norm": 10.735611915588379, "learning_rate": 3.994527650465352e-07, "logits/chosen": -0.8405522108078003, "logits/rejected": -0.8641465902328491, "logps/chosen": -366.65057373046875, "logps/ref_chosen": -308.1912841796875, "logps/ref_rejected": -281.0702819824219, "logps/rejected": -388.7403869628906, "loss": 0.5533, "margin_dpo/margin_mean": 49.21086120605469, "margin_dpo/margin_std": 45.18487548828125, "step": 175 }, { "epoch": 0.37691401648998824, "grad_norm": 13.484320640563965, "learning_rate": 3.920161866827889e-07, "logits/chosen": -0.8439409136772156, "logits/rejected": -0.8432701826095581, "logps/chosen": -324.17413330078125, "logps/ref_chosen": -259.9729919433594, "logps/ref_rejected": -253.807373046875, "logps/rejected": -359.5162353515625, "loss": 0.5797, "margin_dpo/margin_mean": 41.50773620605469, "margin_dpo/margin_std": 44.9793815612793, "step": 180 }, { "epoch": 0.3873838502813768, "grad_norm": 12.232016563415527, "learning_rate": 3.8438923131177237e-07, "logits/chosen": -0.8532809019088745, "logits/rejected": -0.8654898405075073, "logps/chosen": -333.65447998046875, "logps/ref_chosen": -278.8919982910156, "logps/ref_rejected": -252.01766967773438, "logps/rejected": -363.85467529296875, "loss": 0.5344, "margin_dpo/margin_mean": 57.074462890625, "margin_dpo/margin_std": 50.60524368286133, "step": 185 }, { "epoch": 0.39785368407276533, "grad_norm": 18.704683303833008, "learning_rate": 3.765821230985757e-07, "logits/chosen": -0.8458822965621948, "logits/rejected": -0.8640161752700806, "logps/chosen": -330.4486999511719, "logps/ref_chosen": -263.2441101074219, "logps/ref_rejected": -241.41879272460938, "logps/rejected": -352.07611083984375, "loss": 0.5522, "margin_dpo/margin_mean": 43.45270538330078, "margin_dpo/margin_std": 40.3618278503418, "step": 190 }, { "epoch": 0.4083235178641539, "grad_norm": 14.519493103027344, "learning_rate": 3.6860532770864005e-07, "logits/chosen": -0.8930074572563171, "logits/rejected": -0.891897976398468, "logps/chosen": -377.4366760253906, "logps/ref_chosen": -298.1391906738281, "logps/ref_rejected": -270.0352478027344, "logps/rejected": -404.7273864746094, "loss": 0.5724, "margin_dpo/margin_mean": 55.394676208496094, "margin_dpo/margin_std": 43.578697204589844, "step": 195 }, { "epoch": 0.4187933516555425, "grad_norm": 18.55974578857422, "learning_rate": 3.604695382782159e-07, "logits/chosen": -0.831684947013855, "logits/rejected": -0.8021112680435181, "logps/chosen": -409.08917236328125, "logps/ref_chosen": -312.67950439453125, "logps/ref_rejected": -279.4552917480469, "logps/rejected": -422.96563720703125, "loss": 0.5528, "margin_dpo/margin_mean": 47.1006965637207, "margin_dpo/margin_std": 50.622745513916016, "step": 200 }, { "epoch": 0.42926318544693104, "grad_norm": 12.525370597839355, "learning_rate": 3.5218566107988867e-07, "logits/chosen": -0.921481728553772, "logits/rejected": -0.9008976817131042, "logps/chosen": -393.524658203125, "logps/ref_chosen": -295.2485046386719, "logps/ref_rejected": -263.5693054199219, "logps/rejected": -407.8573303222656, "loss": 0.5635, "margin_dpo/margin_mean": 46.01184844970703, "margin_dpo/margin_std": 50.522342681884766, "step": 205 }, { "epoch": 0.4397330192383196, "grad_norm": 11.883400917053223, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -0.7784077525138855, "logits/rejected": -0.7815487384796143, "logps/chosen": -330.11505126953125, "logps/ref_chosen": -245.77099609375, "logps/ref_rejected": -249.95419311523438, "logps/rejected": -377.6236267089844, "loss": 0.5666, "margin_dpo/margin_mean": 43.325355529785156, "margin_dpo/margin_std": 49.20180130004883, "step": 210 }, { "epoch": 0.45020285302970814, "grad_norm": 10.824688911437988, "learning_rate": 3.3521824616429284e-07, "logits/chosen": -0.7949780225753784, "logits/rejected": -0.8367546796798706, "logps/chosen": -349.0238952636719, "logps/ref_chosen": -273.9432373046875, "logps/ref_rejected": -260.73638916015625, "logps/rejected": -378.1654357910156, "loss": 0.56, "margin_dpo/margin_mean": 42.34842300415039, "margin_dpo/margin_std": 39.25645446777344, "step": 215 }, { "epoch": 0.4606726868210967, "grad_norm": 12.07426929473877, "learning_rate": 3.265574537815398e-07, "logits/chosen": -0.8215816617012024, "logits/rejected": -0.8321579098701477, "logps/chosen": -378.67718505859375, "logps/ref_chosen": -304.5909423828125, "logps/ref_rejected": -261.46807861328125, "logps/rejected": -380.6227111816406, "loss": 0.5366, "margin_dpo/margin_mean": 45.068355560302734, "margin_dpo/margin_std": 44.65137481689453, "step": 220 }, { "epoch": 0.4711425206124853, "grad_norm": 10.96617603302002, "learning_rate": 3.1779403380910425e-07, "logits/chosen": -0.7884172201156616, "logits/rejected": -0.7639855146408081, "logps/chosen": -346.61126708984375, "logps/ref_chosen": -256.55291748046875, "logps/ref_rejected": -261.79241943359375, "logps/rejected": -406.0009765625, "loss": 0.5519, "margin_dpo/margin_mean": 54.15024948120117, "margin_dpo/margin_std": 53.082069396972656, "step": 225 }, { "epoch": 0.48161235440387384, "grad_norm": 11.045511245727539, "learning_rate": 3.0893973387735683e-07, "logits/chosen": -0.854308009147644, "logits/rejected": -0.8677088618278503, "logps/chosen": -354.51214599609375, "logps/ref_chosen": -261.0024719238281, "logps/ref_rejected": -252.2753448486328, "logps/rejected": -392.02099609375, "loss": 0.5385, "margin_dpo/margin_mean": 46.23601531982422, "margin_dpo/margin_std": 50.89037322998047, "step": 230 }, { "epoch": 0.4920821881952624, "grad_norm": 14.624432563781738, "learning_rate": 3.000064234440111e-07, "logits/chosen": -0.8091039657592773, "logits/rejected": -0.8040207028388977, "logps/chosen": -350.145263671875, "logps/ref_chosen": -256.8251037597656, "logps/ref_rejected": -260.3736572265625, "logps/rejected": -397.73980712890625, "loss": 0.5502, "margin_dpo/margin_mean": 44.04594421386719, "margin_dpo/margin_std": 49.13228225708008, "step": 235 }, { "epoch": 0.5025520219866509, "grad_norm": 13.993529319763184, "learning_rate": 2.910060778827554e-07, "logits/chosen": -0.8354262113571167, "logits/rejected": -0.8467346429824829, "logps/chosen": -347.28582763671875, "logps/ref_chosen": -263.64447021484375, "logps/ref_rejected": -257.86846923828125, "logps/rejected": -396.7109680175781, "loss": 0.5459, "margin_dpo/margin_mean": 55.20109939575195, "margin_dpo/margin_std": 42.566429138183594, "step": 240 }, { "epoch": 0.5130218557780395, "grad_norm": 9.094046592712402, "learning_rate": 2.8195076242990116e-07, "logits/chosen": -0.8484266400337219, "logits/rejected": -0.840695858001709, "logps/chosen": -327.71484375, "logps/ref_chosen": -260.4804992675781, "logps/ref_rejected": -230.9554901123047, "logps/rejected": -351.3434143066406, "loss": 0.5374, "margin_dpo/margin_mean": 53.15354537963867, "margin_dpo/margin_std": 47.3699836730957, "step": 245 }, { "epoch": 0.523491689569428, "grad_norm": 10.525559425354004, "learning_rate": 2.7285261601056697e-07, "logits/chosen": -0.8070570230484009, "logits/rejected": -0.8098254203796387, "logps/chosen": -361.7781677246094, "logps/ref_chosen": -281.3375549316406, "logps/ref_rejected": -240.5897216796875, "logps/rejected": -378.281005859375, "loss": 0.5642, "margin_dpo/margin_mean": 57.25066375732422, "margin_dpo/margin_std": 47.42897033691406, "step": 250 }, { "epoch": 0.5339615233608166, "grad_norm": 14.809152603149414, "learning_rate": 2.6372383496608186e-07, "logits/chosen": -0.8704633712768555, "logits/rejected": -0.8387042880058289, "logps/chosen": -396.8660583496094, "logps/ref_chosen": -295.2730407714844, "logps/ref_rejected": -249.26278686523438, "logps/rejected": -399.889892578125, "loss": 0.5405, "margin_dpo/margin_mean": 49.034156799316406, "margin_dpo/margin_std": 48.20949935913086, "step": 255 }, { "epoch": 0.5444313571522053, "grad_norm": 13.24657154083252, "learning_rate": 2.5457665670441937e-07, "logits/chosen": -0.8177655935287476, "logits/rejected": -0.8094264268875122, "logps/chosen": -382.61090087890625, "logps/ref_chosen": -281.1112976074219, "logps/ref_rejected": -260.35113525390625, "logps/rejected": -419.8111267089844, "loss": 0.5637, "margin_dpo/margin_mean": 57.96039581298828, "margin_dpo/margin_std": 62.89788818359375, "step": 260 }, { "epoch": 0.5549011909435938, "grad_norm": 11.23167896270752, "learning_rate": 2.454233432955807e-07, "logits/chosen": -0.7776967287063599, "logits/rejected": -0.7686339616775513, "logps/chosen": -399.6818542480469, "logps/ref_chosen": -286.01800537109375, "logps/ref_rejected": -270.8456726074219, "logps/rejected": -439.279541015625, "loss": 0.5325, "margin_dpo/margin_mean": 54.76995086669922, "margin_dpo/margin_std": 50.13930892944336, "step": 265 }, { "epoch": 0.5653710247349824, "grad_norm": 11.047820091247559, "learning_rate": 2.3627616503391812e-07, "logits/chosen": -0.8523350954055786, "logits/rejected": -0.8563663363456726, "logps/chosen": -398.56494140625, "logps/ref_chosen": -290.1902770996094, "logps/ref_rejected": -290.7628479003906, "logps/rejected": -454.853515625, "loss": 0.5385, "margin_dpo/margin_mean": 55.71601486206055, "margin_dpo/margin_std": 55.37096405029297, "step": 270 }, { "epoch": 0.5758408585263709, "grad_norm": 9.9891357421875, "learning_rate": 2.2714738398943308e-07, "logits/chosen": -0.8351672887802124, "logits/rejected": -0.821618914604187, "logps/chosen": -354.33209228515625, "logps/ref_chosen": -266.181396484375, "logps/ref_rejected": -245.8917236328125, "logps/rejected": -396.9844970703125, "loss": 0.5225, "margin_dpo/margin_mean": 62.94205856323242, "margin_dpo/margin_std": 52.933265686035156, "step": 275 }, { "epoch": 0.5863106923177595, "grad_norm": 9.91457462310791, "learning_rate": 2.1804923757009882e-07, "logits/chosen": -0.8287727236747742, "logits/rejected": -0.8253592252731323, "logps/chosen": -378.96929931640625, "logps/ref_chosen": -285.9189453125, "logps/ref_rejected": -267.557861328125, "logps/rejected": -428.0223693847656, "loss": 0.5547, "margin_dpo/margin_mean": 67.41411590576172, "margin_dpo/margin_std": 56.702415466308594, "step": 280 }, { "epoch": 0.596780526109148, "grad_norm": 14.671507835388184, "learning_rate": 2.089939221172446e-07, "logits/chosen": -0.8091673851013184, "logits/rejected": -0.8017024993896484, "logps/chosen": -361.6111755371094, "logps/ref_chosen": -259.3731384277344, "logps/ref_rejected": -236.7803497314453, "logps/rejected": -396.0070495605469, "loss": 0.5542, "margin_dpo/margin_mean": 56.988670349121094, "margin_dpo/margin_std": 49.157142639160156, "step": 285 }, { "epoch": 0.6072503599005366, "grad_norm": 13.170257568359375, "learning_rate": 1.9999357655598891e-07, "logits/chosen": -0.7988609075546265, "logits/rejected": -0.775769829750061, "logps/chosen": -422.39862060546875, "logps/ref_chosen": -313.570556640625, "logps/ref_rejected": -293.6082458496094, "logps/rejected": -446.83148193359375, "loss": 0.5645, "margin_dpo/margin_mean": 44.39514923095703, "margin_dpo/margin_std": 50.81785583496094, "step": 290 }, { "epoch": 0.6177201936919251, "grad_norm": 8.276803016662598, "learning_rate": 1.9106026612264315e-07, "logits/chosen": -0.8381333351135254, "logits/rejected": -0.8288668394088745, "logps/chosen": -342.7650146484375, "logps/ref_chosen": -245.61093139648438, "logps/ref_rejected": -240.9221649169922, "logps/rejected": -382.5660400390625, "loss": 0.5387, "margin_dpo/margin_mean": 44.489784240722656, "margin_dpo/margin_std": 44.16176223754883, "step": 295 }, { "epoch": 0.6281900274833137, "grad_norm": 12.60323429107666, "learning_rate": 1.8220596619089573e-07, "logits/chosen": -0.7733660340309143, "logits/rejected": -0.7441512942314148, "logps/chosen": -367.76531982421875, "logps/ref_chosen": -273.33172607421875, "logps/ref_rejected": -262.4527893066406, "logps/rejected": -422.65814208984375, "loss": 0.5471, "margin_dpo/margin_mean": 65.77180480957031, "margin_dpo/margin_std": 47.35002899169922, "step": 300 }, { "epoch": 0.6386598612747023, "grad_norm": 11.17004108428955, "learning_rate": 1.7344254621846017e-07, "logits/chosen": -0.8063335418701172, "logits/rejected": -0.811654269695282, "logps/chosen": -379.2445373535156, "logps/ref_chosen": -288.01531982421875, "logps/ref_rejected": -268.05657958984375, "logps/rejected": -412.24334716796875, "loss": 0.5245, "margin_dpo/margin_mean": 52.95763397216797, "margin_dpo/margin_std": 44.546958923339844, "step": 305 }, { "epoch": 0.6491296950660909, "grad_norm": 9.911209106445312, "learning_rate": 1.647817538357072e-07, "logits/chosen": -0.7909546494483948, "logits/rejected": -0.7886890769004822, "logps/chosen": -389.10174560546875, "logps/ref_chosen": -278.2751770019531, "logps/ref_rejected": -260.12744140625, "logps/rejected": -433.76397705078125, "loss": 0.5273, "margin_dpo/margin_mean": 62.8099250793457, "margin_dpo/margin_std": 55.66056442260742, "step": 310 }, { "epoch": 0.6595995288574794, "grad_norm": 12.315938949584961, "learning_rate": 1.562351990976095e-07, "logits/chosen": -0.864617645740509, "logits/rejected": -0.8374130129814148, "logps/chosen": -419.22015380859375, "logps/ref_chosen": -308.3185119628906, "logps/ref_rejected": -250.92843627929688, "logps/rejected": -425.54095458984375, "loss": 0.5357, "margin_dpo/margin_mean": 63.710853576660156, "margin_dpo/margin_std": 49.506195068359375, "step": 315 }, { "epoch": 0.670069362648868, "grad_norm": 11.207268714904785, "learning_rate": 1.478143389201113e-07, "logits/chosen": -0.825822651386261, "logits/rejected": -0.8099110722541809, "logps/chosen": -380.16094970703125, "logps/ref_chosen": -271.6287841796875, "logps/ref_rejected": -266.8870544433594, "logps/rejected": -443.1537170410156, "loss": 0.5061, "margin_dpo/margin_mean": 67.73442840576172, "margin_dpo/margin_std": 49.473670959472656, "step": 320 }, { "epoch": 0.6805391964402565, "grad_norm": 13.092878341674805, "learning_rate": 1.3953046172178413e-07, "logits/chosen": -0.8543654680252075, "logits/rejected": -0.8229929208755493, "logps/chosen": -408.0074768066406, "logps/ref_chosen": -284.26458740234375, "logps/ref_rejected": -267.7981262207031, "logps/rejected": -459.90106201171875, "loss": 0.5501, "margin_dpo/margin_mean": 68.36006164550781, "margin_dpo/margin_std": 58.94519805908203, "step": 325 }, { "epoch": 0.6910090302316451, "grad_norm": 12.90672779083252, "learning_rate": 1.3139467229135998e-07, "logits/chosen": -0.883324146270752, "logits/rejected": -0.874428391456604, "logps/chosen": -394.7810363769531, "logps/ref_chosen": -279.2672424316406, "logps/ref_rejected": -273.91064453125, "logps/rejected": -461.871826171875, "loss": 0.5236, "margin_dpo/margin_mean": 72.44737243652344, "margin_dpo/margin_std": 56.92244338989258, "step": 330 }, { "epoch": 0.7014788640230336, "grad_norm": 14.50145435333252, "learning_rate": 1.2341787690142435e-07, "logits/chosen": -0.8287975192070007, "logits/rejected": -0.8247097730636597, "logps/chosen": -371.62786865234375, "logps/ref_chosen": -246.4220428466797, "logps/ref_rejected": -217.51162719726562, "logps/rejected": -401.34808349609375, "loss": 0.5428, "margin_dpo/margin_mean": 58.630615234375, "margin_dpo/margin_std": 60.42728805541992, "step": 335 }, { "epoch": 0.7119486978144222, "grad_norm": 13.394824981689453, "learning_rate": 1.1561076868822755e-07, "logits/chosen": -0.8153432607650757, "logits/rejected": -0.8339048624038696, "logps/chosen": -391.11431884765625, "logps/ref_chosen": -269.29949951171875, "logps/ref_rejected": -244.972412109375, "logps/rejected": -434.92144775390625, "loss": 0.5269, "margin_dpo/margin_mean": 68.1341323852539, "margin_dpo/margin_std": 57.2426643371582, "step": 340 }, { "epoch": 0.7224185316058107, "grad_norm": 11.93813419342041, "learning_rate": 1.0798381331721107e-07, "logits/chosen": -0.8510368466377258, "logits/rejected": -0.852213978767395, "logps/chosen": -431.39306640625, "logps/ref_chosen": -309.17657470703125, "logps/ref_rejected": -289.8589172363281, "logps/rejected": -470.42413330078125, "loss": 0.5512, "margin_dpo/margin_mean": 58.3487434387207, "margin_dpo/margin_std": 57.327728271484375, "step": 345 }, { "epoch": 0.7328883653971994, "grad_norm": 56.23103332519531, "learning_rate": 1.0054723495346482e-07, "logits/chosen": -0.8716479539871216, "logits/rejected": -0.8568038940429688, "logps/chosen": -430.564208984375, "logps/ref_chosen": -316.6105651855469, "logps/ref_rejected": -259.6813049316406, "logps/rejected": -427.782958984375, "loss": 0.5617, "margin_dpo/margin_mean": 54.14800262451172, "margin_dpo/margin_std": 49.129615783691406, "step": 350 }, { "epoch": 0.7433581991885879, "grad_norm": 11.990477561950684, "learning_rate": 9.331100255592436e-08, "logits/chosen": -0.8248258829116821, "logits/rejected": -0.8299292325973511, "logps/chosen": -360.440673828125, "logps/ref_chosen": -249.68185424804688, "logps/ref_rejected": -245.9772491455078, "logps/rejected": -408.6664733886719, "loss": 0.565, "margin_dpo/margin_mean": 51.93042755126953, "margin_dpo/margin_std": 57.09288787841797, "step": 355 }, { "epoch": 0.7538280329799765, "grad_norm": 12.199322700500488, "learning_rate": 8.628481651367875e-08, "logits/chosen": -0.8103006482124329, "logits/rejected": -0.8348591923713684, "logps/chosen": -363.73284912109375, "logps/ref_chosen": -257.22601318359375, "logps/ref_rejected": -240.4705047607422, "logps/rejected": -405.26239013671875, "loss": 0.529, "margin_dpo/margin_mean": 58.2850456237793, "margin_dpo/margin_std": 65.24702453613281, "step": 360 }, { "epoch": 0.764297866771365, "grad_norm": 13.863068580627441, "learning_rate": 7.947809564230445e-08, "logits/chosen": -0.83502596616745, "logits/rejected": -0.8059133291244507, "logps/chosen": -408.8477783203125, "logps/ref_chosen": -298.82977294921875, "logps/ref_rejected": -259.7737121582031, "logps/rejected": -437.01763916015625, "loss": 0.5297, "margin_dpo/margin_mean": 67.22590637207031, "margin_dpo/margin_std": 57.601585388183594, "step": 365 }, { "epoch": 0.7747677005627536, "grad_norm": 10.148484230041504, "learning_rate": 7.289996455765748e-08, "logits/chosen": -0.7842052578926086, "logits/rejected": -0.792621910572052, "logps/chosen": -401.79364013671875, "logps/ref_chosen": -297.49273681640625, "logps/ref_rejected": -261.4602966308594, "logps/rejected": -434.1449279785156, "loss": 0.5012, "margin_dpo/margin_mean": 68.3836898803711, "margin_dpo/margin_std": 56.6983528137207, "step": 370 }, { "epoch": 0.7852375343541421, "grad_norm": 11.445208549499512, "learning_rate": 6.655924144404906e-08, "logits/chosen": -0.8276312947273254, "logits/rejected": -0.8161738514900208, "logps/chosen": -405.738525390625, "logps/ref_chosen": -290.16522216796875, "logps/ref_rejected": -258.9540100097656, "logps/rejected": -437.8350524902344, "loss": 0.5289, "margin_dpo/margin_mean": 63.30780029296875, "margin_dpo/margin_std": 49.408447265625, "step": 375 }, { "epoch": 0.7957073681455307, "grad_norm": 12.313652038574219, "learning_rate": 6.046442623320145e-08, "logits/chosen": -0.8401039242744446, "logits/rejected": -0.8634229898452759, "logps/chosen": -389.7837829589844, "logps/ref_chosen": -278.06829833984375, "logps/ref_rejected": -261.755126953125, "logps/rejected": -429.049560546875, "loss": 0.5545, "margin_dpo/margin_mean": 55.578941345214844, "margin_dpo/margin_std": 55.153419494628906, "step": 380 }, { "epoch": 0.8061772019369192, "grad_norm": 11.419657707214355, "learning_rate": 5.4623689209832484e-08, "logits/chosen": -0.8053508996963501, "logits/rejected": -0.8094272613525391, "logps/chosen": -380.64813232421875, "logps/ref_chosen": -263.7462463378906, "logps/ref_rejected": -263.7913818359375, "logps/rejected": -443.0389099121094, "loss": 0.5151, "margin_dpo/margin_mean": 62.34564208984375, "margin_dpo/margin_std": 50.74353790283203, "step": 385 }, { "epoch": 0.8166470357283078, "grad_norm": 10.845842361450195, "learning_rate": 4.904486005914027e-08, "logits/chosen": -0.8586405515670776, "logits/rejected": -0.8259384036064148, "logps/chosen": -434.5126953125, "logps/ref_chosen": -305.0606384277344, "logps/ref_rejected": -263.2323913574219, "logps/rejected": -450.8243713378906, "loss": 0.53, "margin_dpo/margin_mean": 58.139869689941406, "margin_dpo/margin_std": 56.146942138671875, "step": 390 }, { "epoch": 0.8271168695196964, "grad_norm": 10.537102699279785, "learning_rate": 4.373541737087263e-08, "logits/chosen": -0.7995084524154663, "logits/rejected": -0.8020361065864563, "logps/chosen": -413.2315979003906, "logps/ref_chosen": -294.02728271484375, "logps/ref_rejected": -241.11166381835938, "logps/rejected": -425.66162109375, "loss": 0.5219, "margin_dpo/margin_mean": 65.34563446044922, "margin_dpo/margin_std": 56.103919982910156, "step": 395 }, { "epoch": 0.837586703311085, "grad_norm": 13.609760284423828, "learning_rate": 3.8702478614051345e-08, "logits/chosen": -0.8078573346138, "logits/rejected": -0.7934938669204712, "logps/chosen": -418.42034912109375, "logps/ref_chosen": -293.3166809082031, "logps/ref_rejected": -242.2424774169922, "logps/rejected": -425.05157470703125, "loss": 0.5595, "margin_dpo/margin_mean": 57.70537185668945, "margin_dpo/margin_std": 54.573448181152344, "step": 400 }, { "epoch": 0.8480565371024735, "grad_norm": 9.293983459472656, "learning_rate": 3.3952790595787986e-08, "logits/chosen": -0.8078519105911255, "logits/rejected": -0.8162975311279297, "logps/chosen": -414.7613220214844, "logps/ref_chosen": -297.1978454589844, "logps/ref_rejected": -268.5884094238281, "logps/rejected": -451.60321044921875, "loss": 0.5035, "margin_dpo/margin_mean": 65.4513168334961, "margin_dpo/margin_std": 53.9837532043457, "step": 405 }, { "epoch": 0.8585263708938621, "grad_norm": 11.017020225524902, "learning_rate": 2.9492720416985e-08, "logits/chosen": -0.8266521692276001, "logits/rejected": -0.8138245344161987, "logps/chosen": -438.3353576660156, "logps/ref_chosen": -311.4091796875, "logps/ref_rejected": -278.2716369628906, "logps/rejected": -462.28106689453125, "loss": 0.5374, "margin_dpo/margin_mean": 57.08324432373047, "margin_dpo/margin_std": 61.2265625, "step": 410 }, { "epoch": 0.8689962046852506, "grad_norm": 10.866921424865723, "learning_rate": 2.5328246937043525e-08, "logits/chosen": -0.8431590795516968, "logits/rejected": -0.8233949542045593, "logps/chosen": -425.8267517089844, "logps/ref_chosen": -301.7953186035156, "logps/ref_rejected": -277.1317443847656, "logps/rejected": -458.07257080078125, "loss": 0.5276, "margin_dpo/margin_mean": 56.90935516357422, "margin_dpo/margin_std": 58.62964630126953, "step": 415 }, { "epoch": 0.8794660384766392, "grad_norm": 16.984128952026367, "learning_rate": 2.1464952759020856e-08, "logits/chosen": -0.8165545463562012, "logits/rejected": -0.7965975999832153, "logps/chosen": -406.0804138183594, "logps/ref_chosen": -279.28424072265625, "logps/ref_rejected": -266.0011291503906, "logps/rejected": -459.39251708984375, "loss": 0.5221, "margin_dpo/margin_mean": 66.59518432617188, "margin_dpo/margin_std": 60.97441864013672, "step": 420 }, { "epoch": 0.8899358722680277, "grad_norm": 11.01903247833252, "learning_rate": 1.7908016745981856e-08, "logits/chosen": -0.8314346075057983, "logits/rejected": -0.8301789164543152, "logps/chosen": -402.17266845703125, "logps/ref_chosen": -295.50152587890625, "logps/ref_rejected": -257.7261047363281, "logps/rejected": -436.78839111328125, "loss": 0.5112, "margin_dpo/margin_mean": 72.39109802246094, "margin_dpo/margin_std": 60.561256408691406, "step": 425 }, { "epoch": 0.9004057060594163, "grad_norm": 15.038141250610352, "learning_rate": 1.4662207078575684e-08, "logits/chosen": -0.8031011819839478, "logits/rejected": -0.7918771505355835, "logps/chosen": -412.593994140625, "logps/ref_chosen": -277.38116455078125, "logps/ref_rejected": -251.1290740966797, "logps/rejected": -447.1891174316406, "loss": 0.5134, "margin_dpo/margin_mean": 60.8471794128418, "margin_dpo/margin_std": 55.47169876098633, "step": 430 }, { "epoch": 0.9108755398508048, "grad_norm": 14.224749565124512, "learning_rate": 1.1731874863145142e-08, "logits/chosen": -0.7793712615966797, "logits/rejected": -0.777908205986023, "logps/chosen": -412.5726013183594, "logps/ref_chosen": -287.42352294921875, "logps/ref_rejected": -290.1793212890625, "logps/rejected": -468.8260803222656, "loss": 0.5352, "margin_dpo/margin_mean": 53.49767303466797, "margin_dpo/margin_std": 55.022552490234375, "step": 435 }, { "epoch": 0.9213453736421934, "grad_norm": 11.504213333129883, "learning_rate": 9.12094829893642e-09, "logits/chosen": -0.8351284861564636, "logits/rejected": -0.8232718706130981, "logps/chosen": -422.15142822265625, "logps/ref_chosen": -290.55657958984375, "logps/ref_rejected": -243.29867553710938, "logps/rejected": -418.43597412109375, "loss": 0.5483, "margin_dpo/margin_mean": 43.54241943359375, "margin_dpo/margin_std": 59.4898681640625, "step": 440 }, { "epoch": 0.931815207433582, "grad_norm": 12.176145553588867, "learning_rate": 6.832927412229017e-09, "logits/chosen": -0.8022583723068237, "logits/rejected": -0.8025018572807312, "logps/chosen": -425.2530822753906, "logps/ref_chosen": -299.78863525390625, "logps/ref_rejected": -251.2100067138672, "logps/rejected": -447.94464111328125, "loss": 0.5312, "margin_dpo/margin_mean": 71.27015686035156, "margin_dpo/margin_std": 56.103118896484375, "step": 445 }, { "epoch": 0.9422850412249706, "grad_norm": 10.374246597290039, "learning_rate": 4.8708793644441086e-09, "logits/chosen": -0.8492597341537476, "logits/rejected": -0.8464733362197876, "logps/chosen": -441.6004333496094, "logps/ref_chosen": -336.39410400390625, "logps/ref_rejected": -256.09442138671875, "logps/rejected": -437.0380859375, "loss": 0.5053, "margin_dpo/margin_mean": 75.73728942871094, "margin_dpo/margin_std": 44.308250427246094, "step": 450 }, { "epoch": 0.9527548750163591, "grad_norm": 11.073409080505371, "learning_rate": 3.2374343405217884e-09, "logits/chosen": -0.8142544031143188, "logits/rejected": -0.7899129986763, "logps/chosen": -417.60809326171875, "logps/ref_chosen": -295.5001525878906, "logps/ref_rejected": -287.2071533203125, "logps/rejected": -468.614013671875, "loss": 0.5278, "margin_dpo/margin_mean": 59.29888153076172, "margin_dpo/margin_std": 59.937782287597656, "step": 455 }, { "epoch": 0.9632247088077477, "grad_norm": 12.617728233337402, "learning_rate": 1.9347820230782295e-09, "logits/chosen": -0.7838319540023804, "logits/rejected": -0.8114255666732788, "logps/chosen": -450.2615661621094, "logps/ref_chosen": -332.0260009765625, "logps/ref_rejected": -292.3576354980469, "logps/rejected": -484.081787109375, "loss": 0.5396, "margin_dpo/margin_mean": 73.48858642578125, "margin_dpo/margin_std": 53.811485290527344, "step": 460 }, { "epoch": 0.9736945425991362, "grad_norm": 11.1157808303833, "learning_rate": 9.64668657069706e-10, "logits/chosen": -0.8052261471748352, "logits/rejected": -0.7894802689552307, "logps/chosen": -413.00213623046875, "logps/ref_chosen": -294.51641845703125, "logps/ref_rejected": -261.11236572265625, "logps/rejected": -445.7872009277344, "loss": 0.5364, "margin_dpo/margin_mean": 66.1891098022461, "margin_dpo/margin_std": 57.48866653442383, "step": 465 }, { "epoch": 0.9841643763905248, "grad_norm": 12.074286460876465, "learning_rate": 3.2839470889836627e-10, "logits/chosen": -0.7617523074150085, "logits/rejected": -0.7400659322738647, "logps/chosen": -377.22015380859375, "logps/ref_chosen": -265.89263916015625, "logps/ref_rejected": -232.9622802734375, "logps/rejected": -415.216796875, "loss": 0.5139, "margin_dpo/margin_mean": 70.92707061767578, "margin_dpo/margin_std": 55.2708625793457, "step": 470 }, { "epoch": 0.9946342101819133, "grad_norm": 11.217382431030273, "learning_rate": 2.6813123097352287e-11, "logits/chosen": -0.8241547346115112, "logits/rejected": -0.8438981771469116, "logps/chosen": -395.30914306640625, "logps/ref_chosen": -282.80517578125, "logps/ref_rejected": -237.91372680664062, "logps/rejected": -402.4132080078125, "loss": 0.5177, "margin_dpo/margin_mean": 51.9954948425293, "margin_dpo/margin_std": 48.095726013183594, "step": 475 }, { "epoch": 0.9988221436984688, "step": 477, "total_flos": 0.0, "train_loss": 0.5765140090348586, "train_runtime": 5442.9429, "train_samples_per_second": 11.232, "train_steps_per_second": 0.088 } ], "logging_steps": 5, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }