{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030303030303030303, "grad_norm": 10.455310821533203, "learning_rate": 0.0, "logits/chosen": -0.818070113658905, "logits/rejected": -0.7612971663475037, "logps/chosen": -27.54741859436035, "logps/ref_chosen": -27.53912353515625, "logps/ref_rejected": -62.889225006103516, "logps/rejected": -62.880741119384766, "loss": 0.6926, "margin_dpo/margin_mean": -0.01677680015563965, "margin_dpo/margin_std": 0.1853054314851761, "step": 1 }, { "epoch": 0.015151515151515152, "grad_norm": 11.397998809814453, "learning_rate": 6.060606060606061e-08, "logits/chosen": -0.8404617309570312, "logits/rejected": -0.8060516119003296, "logps/chosen": -51.65924072265625, "logps/ref_chosen": -51.643856048583984, "logps/ref_rejected": -84.63095092773438, "logps/rejected": -84.6202392578125, "loss": 0.6933, "margin_dpo/margin_mean": -0.0260981023311615, "margin_dpo/margin_std": 0.3153693377971649, "step": 5 }, { "epoch": 0.030303030303030304, "grad_norm": 11.12632942199707, "learning_rate": 1.3636363636363635e-07, "logits/chosen": -0.7908369302749634, "logits/rejected": -0.7584771513938904, "logps/chosen": -64.20430755615234, "logps/ref_chosen": -64.17414855957031, "logps/ref_rejected": -96.51995849609375, "logps/rejected": -96.55589294433594, "loss": 0.6929, "margin_dpo/margin_mean": 0.0057894946075975895, "margin_dpo/margin_std": 0.33652475476264954, "step": 10 }, { "epoch": 0.045454545454545456, "grad_norm": 12.030816078186035, "learning_rate": 2.121212121212121e-07, "logits/chosen": -0.8053056001663208, "logits/rejected": -0.8063974380493164, "logps/chosen": -77.95388793945312, "logps/ref_chosen": -77.93045806884766, "logps/ref_rejected": -75.88431549072266, "logps/rejected": -75.89156341552734, "loss": 0.6927, "margin_dpo/margin_mean": -0.016180897131562233, "margin_dpo/margin_std": 0.3311070501804352, "step": 15 }, { "epoch": 0.06060606060606061, "grad_norm": 12.039678573608398, "learning_rate": 2.878787878787879e-07, "logits/chosen": -0.7935067415237427, "logits/rejected": -0.7536638975143433, "logps/chosen": -55.504188537597656, "logps/ref_chosen": -55.51140213012695, "logps/ref_rejected": -86.6218490600586, "logps/rejected": -86.65962982177734, "loss": 0.6927, "margin_dpo/margin_mean": 0.0450122132897377, "margin_dpo/margin_std": 0.37105274200439453, "step": 20 }, { "epoch": 0.07575757575757576, "grad_norm": 10.380696296691895, "learning_rate": 3.636363636363636e-07, "logits/chosen": -0.7800458669662476, "logits/rejected": -0.7748220562934875, "logps/chosen": -65.15885162353516, "logps/ref_chosen": -65.15419006347656, "logps/ref_rejected": -70.9836196899414, "logps/rejected": -71.05149841308594, "loss": 0.6929, "margin_dpo/margin_mean": 0.06321928650140762, "margin_dpo/margin_std": 0.355155885219574, "step": 25 }, { "epoch": 0.09090909090909091, "grad_norm": 10.88476276397705, "learning_rate": 4.3939393939393937e-07, "logits/chosen": -0.8358621597290039, "logits/rejected": -0.8101686239242554, "logps/chosen": -54.09563064575195, "logps/ref_chosen": -54.000160217285156, "logps/ref_rejected": -86.43263244628906, "logps/rejected": -86.5849609375, "loss": 0.6906, "margin_dpo/margin_mean": 0.05685856193304062, "margin_dpo/margin_std": 0.3642476797103882, "step": 30 }, { "epoch": 0.10606060606060606, "grad_norm": 12.026762962341309, "learning_rate": 4.999860140229787e-07, "logits/chosen": -0.811154842376709, "logits/rejected": -0.7937377691268921, "logps/chosen": -67.01231384277344, "logps/ref_chosen": -66.8745346069336, "logps/ref_rejected": -86.6573257446289, "logps/rejected": -86.97063446044922, "loss": 0.6891, "margin_dpo/margin_mean": 0.1755320429801941, "margin_dpo/margin_std": 0.46879833936691284, "step": 35 }, { "epoch": 0.12121212121212122, "grad_norm": 11.267840385437012, "learning_rate": 4.994966691179711e-07, "logits/chosen": -0.7241272926330566, "logits/rejected": -0.6869423985481262, "logps/chosen": -51.837364196777344, "logps/ref_chosen": -51.43064498901367, "logps/ref_rejected": -75.73628234863281, "logps/rejected": -76.29964447021484, "loss": 0.6848, "margin_dpo/margin_mean": 0.15664692223072052, "margin_dpo/margin_std": 0.6119893193244934, "step": 40 }, { "epoch": 0.13636363636363635, "grad_norm": 11.79084587097168, "learning_rate": 4.983095894354857e-07, "logits/chosen": -0.7654654383659363, "logits/rejected": -0.7399241328239441, "logps/chosen": -59.4940299987793, "logps/ref_chosen": -58.967918395996094, "logps/ref_rejected": -74.13176727294922, "logps/rejected": -75.02941131591797, "loss": 0.6777, "margin_dpo/margin_mean": 0.37154078483581543, "margin_dpo/margin_std": 0.763075590133667, "step": 45 }, { "epoch": 0.15151515151515152, "grad_norm": 12.672266006469727, "learning_rate": 4.964280947263676e-07, "logits/chosen": -0.7275325059890747, "logits/rejected": -0.6958032250404358, "logps/chosen": -56.945068359375, "logps/ref_chosen": -55.99009323120117, "logps/ref_rejected": -74.68233489990234, "logps/rejected": -75.86155700683594, "loss": 0.6755, "margin_dpo/margin_mean": 0.22425690293312073, "margin_dpo/margin_std": 1.2586849927902222, "step": 50 }, { "epoch": 0.16666666666666666, "grad_norm": 11.780351638793945, "learning_rate": 4.938574467213517e-07, "logits/chosen": -0.7339123487472534, "logits/rejected": -0.7103201150894165, "logps/chosen": -61.5482177734375, "logps/ref_chosen": -60.068870544433594, "logps/ref_rejected": -77.12890625, "logps/rejected": -79.0832748413086, "loss": 0.6714, "margin_dpo/margin_mean": 0.4750184416770935, "margin_dpo/margin_std": 1.5396963357925415, "step": 55 }, { "epoch": 0.18181818181818182, "grad_norm": 11.140870094299316, "learning_rate": 4.906048344162676e-07, "logits/chosen": -0.678428053855896, "logits/rejected": -0.6509960889816284, "logps/chosen": -60.9329719543457, "logps/ref_chosen": -58.871849060058594, "logps/ref_rejected": -76.81136322021484, "logps/rejected": -79.64076232910156, "loss": 0.6634, "margin_dpo/margin_mean": 0.7682675123214722, "margin_dpo/margin_std": 1.9303239583969116, "step": 60 }, { "epoch": 0.19696969696969696, "grad_norm": 11.366332054138184, "learning_rate": 4.866793539675126e-07, "logits/chosen": -0.6925519704818726, "logits/rejected": -0.6610804796218872, "logps/chosen": -69.35958099365234, "logps/ref_chosen": -66.47074890136719, "logps/ref_rejected": -100.35836029052734, "logps/rejected": -104.43794250488281, "loss": 0.6579, "margin_dpo/margin_mean": 1.1907539367675781, "margin_dpo/margin_std": 2.986706495285034, "step": 65 }, { "epoch": 0.21212121212121213, "grad_norm": 12.58990478515625, "learning_rate": 4.820919832540181e-07, "logits/chosen": -0.6219511032104492, "logits/rejected": -0.6189069747924805, "logps/chosen": -67.1957778930664, "logps/ref_chosen": -64.2503662109375, "logps/ref_rejected": -66.74681091308594, "logps/rejected": -70.51075744628906, "loss": 0.6519, "margin_dpo/margin_mean": 0.8185291290283203, "margin_dpo/margin_std": 2.976707935333252, "step": 70 }, { "epoch": 0.22727272727272727, "grad_norm": 11.002663612365723, "learning_rate": 4.768555511768486e-07, "logits/chosen": -0.5906602740287781, "logits/rejected": -0.5815819501876831, "logps/chosen": -71.80250549316406, "logps/ref_chosen": -68.28721618652344, "logps/ref_rejected": -76.16336822509766, "logps/rejected": -80.22598266601562, "loss": 0.6617, "margin_dpo/margin_mean": 0.5473247170448303, "margin_dpo/margin_std": 3.507791519165039, "step": 75 }, { "epoch": 0.24242424242424243, "grad_norm": 9.479778289794922, "learning_rate": 4.7098470178228755e-07, "logits/chosen": -0.6349095106124878, "logits/rejected": -0.6179987788200378, "logps/chosen": -57.898193359375, "logps/ref_chosen": -54.811798095703125, "logps/ref_rejected": -77.2701187133789, "logps/rejected": -81.84941101074219, "loss": 0.6448, "margin_dpo/margin_mean": 1.4929004907608032, "margin_dpo/margin_std": 3.287881851196289, "step": 80 }, { "epoch": 0.25757575757575757, "grad_norm": 10.064814567565918, "learning_rate": 4.6449585330874425e-07, "logits/chosen": -0.5931236147880554, "logits/rejected": -0.5673755407333374, "logps/chosen": -66.52117919921875, "logps/ref_chosen": -62.9375, "logps/ref_rejected": -89.00093078613281, "logps/rejected": -94.03156280517578, "loss": 0.6411, "margin_dpo/margin_mean": 1.4469609260559082, "margin_dpo/margin_std": 3.1353728771209717, "step": 85 }, { "epoch": 0.2727272727272727, "grad_norm": 10.42741584777832, "learning_rate": 4.5740715227200897e-07, "logits/chosen": -0.6528624296188354, "logits/rejected": -0.6274086833000183, "logps/chosen": -66.20284271240234, "logps/ref_chosen": -62.151451110839844, "logps/ref_rejected": -83.65849304199219, "logps/rejected": -89.31423950195312, "loss": 0.6262, "margin_dpo/margin_mean": 1.6043474674224854, "margin_dpo/margin_std": 3.8411917686462402, "step": 90 }, { "epoch": 0.2878787878787879, "grad_norm": 10.800503730773926, "learning_rate": 4.4973842271726024e-07, "logits/chosen": -0.5788562893867493, "logits/rejected": -0.5660556554794312, "logps/chosen": -67.69863891601562, "logps/ref_chosen": -63.18915939331055, "logps/ref_rejected": -77.06649017333984, "logps/rejected": -83.23294067382812, "loss": 0.6272, "margin_dpo/margin_mean": 1.6569665670394897, "margin_dpo/margin_std": 4.609116554260254, "step": 95 }, { "epoch": 0.30303030303030304, "grad_norm": 10.378731727600098, "learning_rate": 4.415111107797445e-07, "logits/chosen": -0.5960966348648071, "logits/rejected": -0.5538562536239624, "logps/chosen": -59.95014572143555, "logps/ref_chosen": -55.48549270629883, "logps/ref_rejected": -85.08012390136719, "logps/rejected": -92.14093017578125, "loss": 0.6266, "margin_dpo/margin_mean": 2.5961520671844482, "margin_dpo/margin_std": 4.217093467712402, "step": 100 }, { "epoch": 0.30303030303030304, "eval_logits/chosen": -0.5741320848464966, "eval_logits/rejected": -0.5576887130737305, "eval_logps/chosen": -75.61560821533203, "eval_logps/ref_chosen": -71.49089813232422, "eval_logps/ref_rejected": -76.31332397460938, "eval_logps/rejected": -82.72161865234375, "eval_loss": 0.6173638105392456, "eval_margin_dpo/margin_mean": 2.28357195854187, "eval_margin_dpo/margin_std": 3.9973862171173096, "eval_runtime": 18.8686, "eval_samples_per_second": 122.055, "eval_steps_per_second": 0.954, "step": 100 }, { "epoch": 0.3181818181818182, "grad_norm": 12.402639389038086, "learning_rate": 4.327482247091679e-07, "logits/chosen": -0.5790421366691589, "logits/rejected": -0.5531052350997925, "logps/chosen": -76.99128723144531, "logps/ref_chosen": -71.54103088378906, "logps/ref_rejected": -98.70140075683594, "logps/rejected": -106.15584564208984, "loss": 0.6195, "margin_dpo/margin_mean": 2.0041980743408203, "margin_dpo/margin_std": 4.225128173828125, "step": 105 }, { "epoch": 0.3333333333333333, "grad_norm": 9.073996543884277, "learning_rate": 4.234742705255272e-07, "logits/chosen": -0.49020037055015564, "logits/rejected": -0.48362722992897034, "logps/chosen": -71.53330993652344, "logps/ref_chosen": -66.31354522705078, "logps/ref_rejected": -76.78019714355469, "logps/rejected": -83.70118713378906, "loss": 0.6149, "margin_dpo/margin_mean": 1.7012172937393188, "margin_dpo/margin_std": 4.63106632232666, "step": 110 }, { "epoch": 0.3484848484848485, "grad_norm": 10.4576997756958, "learning_rate": 4.137151834863213e-07, "logits/chosen": -0.5765933394432068, "logits/rejected": -0.5322223901748657, "logps/chosen": -62.665382385253906, "logps/ref_chosen": -58.31931686401367, "logps/ref_rejected": -88.27889251708984, "logps/rejected": -95.86396789550781, "loss": 0.6004, "margin_dpo/margin_mean": 3.2390189170837402, "margin_dpo/margin_std": 4.050782203674316, "step": 115 }, { "epoch": 0.36363636363636365, "grad_norm": 12.087044715881348, "learning_rate": 4.0349825555680045e-07, "logits/chosen": -0.6157968640327454, "logits/rejected": -0.5801655650138855, "logps/chosen": -66.97267150878906, "logps/ref_chosen": -61.62066650390625, "logps/ref_rejected": -103.57926177978516, "logps/rejected": -112.13105773925781, "loss": 0.6074, "margin_dpo/margin_mean": 3.1997852325439453, "margin_dpo/margin_std": 5.21464729309082, "step": 120 }, { "epoch": 0.3787878787878788, "grad_norm": 11.476883888244629, "learning_rate": 3.9285205908608934e-07, "logits/chosen": -0.5993348360061646, "logits/rejected": -0.5867364406585693, "logps/chosen": -84.22923278808594, "logps/ref_chosen": -77.95762634277344, "logps/ref_rejected": -80.53031158447266, "logps/rejected": -88.4426040649414, "loss": 0.614, "margin_dpo/margin_mean": 1.6406761407852173, "margin_dpo/margin_std": 5.179450511932373, "step": 125 }, { "epoch": 0.3939393939393939, "grad_norm": 12.546419143676758, "learning_rate": 3.818063669026256e-07, "logits/chosen": -0.5839983224868774, "logits/rejected": -0.5685960054397583, "logps/chosen": -75.35858154296875, "logps/ref_chosen": -69.84893798828125, "logps/ref_rejected": -97.6857681274414, "logps/rejected": -106.6558837890625, "loss": 0.5884, "margin_dpo/margin_mean": 3.460472583770752, "margin_dpo/margin_std": 6.851003170013428, "step": 130 }, { "epoch": 0.4090909090909091, "grad_norm": 10.323763847351074, "learning_rate": 3.7039206905237656e-07, "logits/chosen": -0.5967100858688354, "logits/rejected": -0.6035032272338867, "logps/chosen": -76.12150573730469, "logps/ref_chosen": -69.49943542480469, "logps/ref_rejected": -76.46887969970703, "logps/rejected": -84.82896423339844, "loss": 0.5886, "margin_dpo/margin_mean": 1.7380040884017944, "margin_dpo/margin_std": 5.351980686187744, "step": 135 }, { "epoch": 0.42424242424242425, "grad_norm": 9.629522323608398, "learning_rate": 3.586410864126781e-07, "logits/chosen": -0.5848367214202881, "logits/rejected": -0.573132336139679, "logps/chosen": -63.21686553955078, "logps/ref_chosen": -58.184852600097656, "logps/ref_rejected": -72.27442169189453, "logps/rejected": -80.48677062988281, "loss": 0.5704, "margin_dpo/margin_mean": 3.1803410053253174, "margin_dpo/margin_std": 5.574404239654541, "step": 140 }, { "epoch": 0.4393939393939394, "grad_norm": 11.897682189941406, "learning_rate": 3.465862814232821e-07, "logits/chosen": -0.5436482429504395, "logits/rejected": -0.527529776096344, "logps/chosen": -73.48857116699219, "logps/ref_chosen": -67.29014587402344, "logps/ref_rejected": -78.61517333984375, "logps/rejected": -88.46278381347656, "loss": 0.5554, "margin_dpo/margin_mean": 3.6491763591766357, "margin_dpo/margin_std": 5.883833885192871, "step": 145 }, { "epoch": 0.45454545454545453, "grad_norm": 11.066961288452148, "learning_rate": 3.3426136618426043e-07, "logits/chosen": -0.5548180341720581, "logits/rejected": -0.5312086343765259, "logps/chosen": -60.678245544433594, "logps/ref_chosen": -53.7413330078125, "logps/ref_rejected": -80.63525390625, "logps/rejected": -91.57915496826172, "loss": 0.5445, "margin_dpo/margin_mean": 4.006979465484619, "margin_dpo/margin_std": 5.5384626388549805, "step": 150 }, { "epoch": 0.4696969696969697, "grad_norm": 11.354157447814941, "learning_rate": 3.2170080817777257e-07, "logits/chosen": -0.5146440863609314, "logits/rejected": -0.5049440264701843, "logps/chosen": -64.72186279296875, "logps/ref_chosen": -57.31132125854492, "logps/ref_rejected": -74.34989929199219, "logps/rejected": -85.43902587890625, "loss": 0.5766, "margin_dpo/margin_mean": 3.6785824298858643, "margin_dpo/margin_std": 7.704632759094238, "step": 155 }, { "epoch": 0.48484848484848486, "grad_norm": 11.149504661560059, "learning_rate": 3.0893973387735683e-07, "logits/chosen": -0.5834041237831116, "logits/rejected": -0.5603164434432983, "logps/chosen": -66.89668273925781, "logps/ref_chosen": -59.539772033691406, "logps/ref_rejected": -84.16561126708984, "logps/rejected": -96.21601867675781, "loss": 0.5611, "margin_dpo/margin_mean": 4.693480014801025, "margin_dpo/margin_std": 6.964644432067871, "step": 160 }, { "epoch": 0.5, "grad_norm": 13.738677978515625, "learning_rate": 2.9601383051430505e-07, "logits/chosen": -0.5313447117805481, "logits/rejected": -0.5091090798377991, "logps/chosen": -74.807861328125, "logps/ref_chosen": -66.78636169433594, "logps/ref_rejected": -88.8131103515625, "logps/rejected": -101.53221130371094, "loss": 0.5602, "margin_dpo/margin_mean": 4.697592735290527, "margin_dpo/margin_std": 7.459498405456543, "step": 165 }, { "epoch": 0.5151515151515151, "grad_norm": 13.155235290527344, "learning_rate": 2.8295924627584004e-07, "logits/chosen": -0.5154544115066528, "logits/rejected": -0.47907596826553345, "logps/chosen": -55.303504943847656, "logps/ref_chosen": -47.866973876953125, "logps/ref_rejected": -84.14051818847656, "logps/rejected": -98.28789520263672, "loss": 0.5537, "margin_dpo/margin_mean": 6.710854530334473, "margin_dpo/margin_std": 8.341263771057129, "step": 170 }, { "epoch": 0.5303030303030303, "grad_norm": 14.890878677368164, "learning_rate": 2.698124892141971e-07, "logits/chosen": -0.5128508806228638, "logits/rejected": -0.4919998049736023, "logps/chosen": -65.23526763916016, "logps/ref_chosen": -57.79303741455078, "logps/ref_rejected": -76.8666000366211, "logps/rejected": -91.17439270019531, "loss": 0.5327, "margin_dpo/margin_mean": 6.865555763244629, "margin_dpo/margin_std": 8.957682609558105, "step": 175 }, { "epoch": 0.5454545454545454, "grad_norm": 12.345190048217773, "learning_rate": 2.5661032514931834e-07, "logits/chosen": -0.5460310578346252, "logits/rejected": -0.5277290344238281, "logps/chosen": -61.90520095825195, "logps/ref_chosen": -53.86296844482422, "logps/ref_rejected": -76.9208755493164, "logps/rejected": -90.58650207519531, "loss": 0.5397, "margin_dpo/margin_mean": 5.623406887054443, "margin_dpo/margin_std": 8.307819366455078, "step": 180 }, { "epoch": 0.5606060606060606, "grad_norm": 18.609071731567383, "learning_rate": 2.4338967485068164e-07, "logits/chosen": -0.4956757426261902, "logits/rejected": -0.47750720381736755, "logps/chosen": -69.359130859375, "logps/ref_chosen": -60.57938766479492, "logps/ref_rejected": -72.99809265136719, "logps/rejected": -86.45264434814453, "loss": 0.5407, "margin_dpo/margin_mean": 4.674814701080322, "margin_dpo/margin_std": 7.796820163726807, "step": 185 }, { "epoch": 0.5757575757575758, "grad_norm": 15.594287872314453, "learning_rate": 2.3018751078580283e-07, "logits/chosen": -0.5231366157531738, "logits/rejected": -0.5017072558403015, "logps/chosen": -63.6590461730957, "logps/ref_chosen": -55.309478759765625, "logps/ref_rejected": -75.77075958251953, "logps/rejected": -89.84127807617188, "loss": 0.5477, "margin_dpo/margin_mean": 5.720963954925537, "margin_dpo/margin_std": 10.631233215332031, "step": 190 }, { "epoch": 0.5909090909090909, "grad_norm": 13.909214973449707, "learning_rate": 2.170407537241599e-07, "logits/chosen": -0.5053573846817017, "logits/rejected": -0.48142895102500916, "logps/chosen": -76.45471954345703, "logps/ref_chosen": -67.39129638671875, "logps/ref_rejected": -94.1995620727539, "logps/rejected": -109.12031555175781, "loss": 0.5555, "margin_dpo/margin_mean": 5.857341289520264, "margin_dpo/margin_std": 9.257515907287598, "step": 195 }, { "epoch": 0.6060606060606061, "grad_norm": 14.265554428100586, "learning_rate": 2.0398616948569493e-07, "logits/chosen": -0.5393396019935608, "logits/rejected": -0.5077868700027466, "logps/chosen": -75.58625793457031, "logps/ref_chosen": -65.90815734863281, "logps/ref_rejected": -98.7196273803711, "logps/rejected": -113.99732971191406, "loss": 0.5253, "margin_dpo/margin_mean": 5.5995988845825195, "margin_dpo/margin_std": 10.336074829101562, "step": 200 }, { "epoch": 0.6060606060606061, "eval_logits/chosen": -0.5199635624885559, "eval_logits/rejected": -0.5067822933197021, "eval_logps/chosen": -79.58210754394531, "eval_logps/ref_chosen": -71.49089813232422, "eval_logps/ref_rejected": -76.31332397460938, "eval_logps/rejected": -90.86639404296875, "eval_loss": 0.543655276298523, "eval_margin_dpo/margin_mean": 6.4618449211120605, "eval_margin_dpo/margin_std": 9.544526100158691, "eval_runtime": 18.8081, "eval_samples_per_second": 122.447, "eval_steps_per_second": 0.957, "step": 200 }, { "epoch": 0.6212121212121212, "grad_norm": 11.659725189208984, "learning_rate": 1.9106026612264315e-07, "logits/chosen": -0.5398346185684204, "logits/rejected": -0.5087303519248962, "logps/chosen": -59.74982833862305, "logps/ref_chosen": -52.514007568359375, "logps/ref_rejected": -94.02557373046875, "logps/rejected": -109.4577865600586, "loss": 0.508, "margin_dpo/margin_mean": 8.196396827697754, "margin_dpo/margin_std": 10.316641807556152, "step": 205 }, { "epoch": 0.6363636363636364, "grad_norm": 29.11798667907715, "learning_rate": 1.782991918222275e-07, "logits/chosen": -0.47662702202796936, "logits/rejected": -0.46838369965553284, "logps/chosen": -66.78819274902344, "logps/ref_chosen": -57.89775466918945, "logps/ref_rejected": -62.08463668823242, "logps/rejected": -77.85931396484375, "loss": 0.5482, "margin_dpo/margin_mean": 6.8842339515686035, "margin_dpo/margin_std": 11.393902778625488, "step": 210 }, { "epoch": 0.6515151515151515, "grad_norm": 23.676776885986328, "learning_rate": 1.6573863381573954e-07, "logits/chosen": -0.4756692945957184, "logits/rejected": -0.4733617305755615, "logps/chosen": -71.32975006103516, "logps/ref_chosen": -63.36411666870117, "logps/ref_rejected": -70.50566101074219, "logps/rejected": -84.5431137084961, "loss": 0.5442, "margin_dpo/margin_mean": 6.07181453704834, "margin_dpo/margin_std": 9.235767364501953, "step": 215 }, { "epoch": 0.6666666666666666, "grad_norm": 26.59471321105957, "learning_rate": 1.534137185767178e-07, "logits/chosen": -0.5520139932632446, "logits/rejected": -0.5306358933448792, "logps/chosen": -63.29638671875, "logps/ref_chosen": -54.3653564453125, "logps/ref_rejected": -80.68601989746094, "logps/rejected": -97.40142822265625, "loss": 0.529, "margin_dpo/margin_mean": 7.784371852874756, "margin_dpo/margin_std": 11.405842781066895, "step": 220 }, { "epoch": 0.6818181818181818, "grad_norm": 17.50434684753418, "learning_rate": 1.4135891358732205e-07, "logits/chosen": -0.5091781616210938, "logits/rejected": -0.4780656397342682, "logps/chosen": -74.7088851928711, "logps/ref_chosen": -65.24610137939453, "logps/ref_rejected": -85.6495590209961, "logps/rejected": -103.7113265991211, "loss": 0.5273, "margin_dpo/margin_mean": 8.598976135253906, "margin_dpo/margin_std": 11.525456428527832, "step": 225 }, { "epoch": 0.696969696969697, "grad_norm": 21.340883255004883, "learning_rate": 1.2960793094762345e-07, "logits/chosen": -0.4688114523887634, "logits/rejected": -0.46031489968299866, "logps/chosen": -79.30754089355469, "logps/ref_chosen": -69.5623550415039, "logps/ref_rejected": -86.65391540527344, "logps/rejected": -102.97904968261719, "loss": 0.5118, "margin_dpo/margin_mean": 6.579934597015381, "margin_dpo/margin_std": 10.335288047790527, "step": 230 }, { "epoch": 0.7121212121212122, "grad_norm": 20.29132652282715, "learning_rate": 1.1819363309737438e-07, "logits/chosen": -0.4904417097568512, "logits/rejected": -0.4770389199256897, "logps/chosen": -72.47919464111328, "logps/ref_chosen": -62.41870880126953, "logps/ref_rejected": -80.84742736816406, "logps/rejected": -97.89503479003906, "loss": 0.5133, "margin_dpo/margin_mean": 6.987112998962402, "margin_dpo/margin_std": 9.303082466125488, "step": 235 }, { "epoch": 0.7272727272727273, "grad_norm": 11.328718185424805, "learning_rate": 1.0714794091391072e-07, "logits/chosen": -0.5141887068748474, "logits/rejected": -0.4992826581001282, "logps/chosen": -68.79585266113281, "logps/ref_chosen": -60.14348602294922, "logps/ref_rejected": -84.51826477050781, "logps/rejected": -101.74858856201172, "loss": 0.5432, "margin_dpo/margin_mean": 8.577953338623047, "margin_dpo/margin_std": 10.39548397064209, "step": 240 }, { "epoch": 0.7424242424242424, "grad_norm": 21.313125610351562, "learning_rate": 9.650174444319956e-08, "logits/chosen": -0.5187879800796509, "logits/rejected": -0.5011430382728577, "logps/chosen": -68.9282455444336, "logps/ref_chosen": -59.89912033081055, "logps/ref_rejected": -76.29353332519531, "logps/rejected": -93.21476745605469, "loss": 0.549, "margin_dpo/margin_mean": 7.892104148864746, "margin_dpo/margin_std": 10.297919273376465, "step": 245 }, { "epoch": 0.7575757575757576, "grad_norm": 18.405746459960938, "learning_rate": 8.628481651367875e-08, "logits/chosen": -0.5289962887763977, "logits/rejected": -0.5101832151412964, "logps/chosen": -71.01588439941406, "logps/ref_chosen": -61.324790954589844, "logps/ref_rejected": -95.19871520996094, "logps/rejected": -110.73634338378906, "loss": 0.5381, "margin_dpo/margin_mean": 5.8465423583984375, "margin_dpo/margin_std": 11.49156379699707, "step": 250 }, { "epoch": 0.7727272727272727, "grad_norm": 29.608196258544922, "learning_rate": 7.652572947447272e-08, "logits/chosen": -0.5170688033103943, "logits/rejected": -0.5108999013900757, "logps/chosen": -82.85248565673828, "logps/ref_chosen": -73.00435638427734, "logps/ref_rejected": -89.8001937866211, "logps/rejected": -106.5128402709961, "loss": 0.5272, "margin_dpo/margin_mean": 6.864515781402588, "margin_dpo/margin_std": 10.157739639282227, "step": 255 }, { "epoch": 0.7878787878787878, "grad_norm": 35.19934844970703, "learning_rate": 6.725177529083209e-08, "logits/chosen": -0.5281625390052795, "logits/rejected": -0.5114730596542358, "logps/chosen": -65.01654815673828, "logps/ref_chosen": -54.35801315307617, "logps/ref_rejected": -78.89704895019531, "logps/rejected": -97.48576354980469, "loss": 0.5345, "margin_dpo/margin_mean": 7.930176734924316, "margin_dpo/margin_std": 12.07260513305664, "step": 260 }, { "epoch": 0.803030303030303, "grad_norm": 15.536827087402344, "learning_rate": 5.848888922025552e-08, "logits/chosen": -0.47202104330062866, "logits/rejected": -0.4491683542728424, "logps/chosen": -75.3332748413086, "logps/ref_chosen": -64.1512451171875, "logps/ref_rejected": -88.43415069580078, "logps/rejected": -107.0230712890625, "loss": 0.5559, "margin_dpo/margin_mean": 7.406890869140625, "margin_dpo/margin_std": 11.541508674621582, "step": 265 }, { "epoch": 0.8181818181818182, "grad_norm": 14.287105560302734, "learning_rate": 5.026157728273966e-08, "logits/chosen": -0.5008893013000488, "logits/rejected": -0.4735264778137207, "logps/chosen": -62.34975051879883, "logps/ref_chosen": -51.93467330932617, "logps/ref_rejected": -83.3440170288086, "logps/rejected": -99.53559875488281, "loss": 0.5252, "margin_dpo/margin_mean": 5.776501655578613, "margin_dpo/margin_std": 10.03078556060791, "step": 270 }, { "epoch": 0.8333333333333334, "grad_norm": 13.779406547546387, "learning_rate": 4.259284772799099e-08, "logits/chosen": -0.509304404258728, "logits/rejected": -0.5035196542739868, "logps/chosen": -74.07002258300781, "logps/ref_chosen": -66.1004638671875, "logps/ref_rejected": -77.46138000488281, "logps/rejected": -94.65324401855469, "loss": 0.5202, "margin_dpo/margin_mean": 9.222299575805664, "margin_dpo/margin_std": 10.624560356140137, "step": 275 }, { "epoch": 0.8484848484848485, "grad_norm": 28.201580047607422, "learning_rate": 3.550414669125573e-08, "logits/chosen": -0.5307421088218689, "logits/rejected": -0.5124194622039795, "logps/chosen": -78.31131744384766, "logps/ref_chosen": -68.96475982666016, "logps/ref_rejected": -93.81538391113281, "logps/rejected": -110.4820327758789, "loss": 0.5355, "margin_dpo/margin_mean": 7.320086479187012, "margin_dpo/margin_std": 12.83232307434082, "step": 280 }, { "epoch": 0.8636363636363636, "grad_norm": 18.593904495239258, "learning_rate": 2.9015298217712453e-08, "logits/chosen": -0.4980226457118988, "logits/rejected": -0.46921929717063904, "logps/chosen": -72.2420425415039, "logps/ref_chosen": -61.95045852661133, "logps/ref_rejected": -91.99930572509766, "logps/rejected": -110.4931640625, "loss": 0.5048, "margin_dpo/margin_mean": 8.202288627624512, "margin_dpo/margin_std": 12.118570327758789, "step": 285 }, { "epoch": 0.8787878787878788, "grad_norm": 19.532819747924805, "learning_rate": 2.3144448823151392e-08, "logits/chosen": -0.48515787720680237, "logits/rejected": -0.46074217557907104, "logps/chosen": -64.38178253173828, "logps/ref_chosen": -54.1287727355957, "logps/ref_rejected": -77.50074005126953, "logps/rejected": -94.30645751953125, "loss": 0.5432, "margin_dpo/margin_mean": 6.552700996398926, "margin_dpo/margin_std": 11.339497566223145, "step": 290 }, { "epoch": 0.8939393939393939, "grad_norm": 14.434176445007324, "learning_rate": 1.7908016745981856e-08, "logits/chosen": -0.4828720986843109, "logits/rejected": -0.48095735907554626, "logps/chosen": -71.822509765625, "logps/ref_chosen": -61.227928161621094, "logps/ref_rejected": -70.93891143798828, "logps/rejected": -88.13584899902344, "loss": 0.5307, "margin_dpo/margin_mean": 6.602363586425781, "margin_dpo/margin_std": 10.929509162902832, "step": 295 }, { "epoch": 0.9090909090909091, "grad_norm": 11.023996353149414, "learning_rate": 1.3320646032487393e-08, "logits/chosen": -0.5068015456199646, "logits/rejected": -0.4941573143005371, "logps/chosen": -68.61476135253906, "logps/ref_chosen": -59.28802490234375, "logps/ref_rejected": -82.7754898071289, "logps/rejected": -100.3427505493164, "loss": 0.5534, "margin_dpo/margin_mean": 8.240517616271973, "margin_dpo/margin_std": 10.162951469421387, "step": 300 }, { "epoch": 0.9090909090909091, "eval_logits/chosen": -0.49858054518699646, "eval_logits/rejected": -0.48604482412338257, "eval_logps/chosen": -80.9963607788086, "eval_logps/ref_chosen": -71.49089813232422, "eval_logps/ref_rejected": -76.31332397460938, "eval_logps/rejected": -92.93927001953125, "eval_loss": 0.5387622714042664, "eval_margin_dpo/margin_mean": 7.120471000671387, "eval_margin_dpo/margin_std": 10.49869155883789, "eval_runtime": 18.8008, "eval_samples_per_second": 122.495, "eval_steps_per_second": 0.957, "step": 300 }, { "epoch": 0.9242424242424242, "grad_norm": 28.285140991210938, "learning_rate": 9.395165583732379e-09, "logits/chosen": -0.48444804549217224, "logits/rejected": -0.4512646794319153, "logps/chosen": -63.23552322387695, "logps/ref_chosen": -54.85032272338867, "logps/ref_rejected": -96.26322174072266, "logps/rejected": -114.82981872558594, "loss": 0.5254, "margin_dpo/margin_mean": 10.181402206420898, "margin_dpo/margin_std": 10.521098136901855, "step": 305 }, { "epoch": 0.9393939393939394, "grad_norm": 17.56390953063965, "learning_rate": 6.142553278648238e-09, "logits/chosen": -0.495095819234848, "logits/rejected": -0.47865208983421326, "logps/chosen": -76.20247650146484, "logps/ref_chosen": -65.8403091430664, "logps/ref_rejected": -88.9677963256836, "logps/rejected": -106.7435073852539, "loss": 0.5117, "margin_dpo/margin_mean": 7.413548946380615, "margin_dpo/margin_std": 10.833813667297363, "step": 310 }, { "epoch": 0.9545454545454546, "grad_norm": 11.377077102661133, "learning_rate": 3.5719052736323806e-09, "logits/chosen": -0.49148210883140564, "logits/rejected": -0.4869101941585541, "logps/chosen": -82.30244445800781, "logps/ref_chosen": -72.73238372802734, "logps/ref_rejected": -74.21096801757812, "logps/rejected": -89.88545989990234, "loss": 0.508, "margin_dpo/margin_mean": 6.104436874389648, "margin_dpo/margin_std": 9.512574195861816, "step": 315 }, { "epoch": 0.9696969696969697, "grad_norm": 13.178277969360352, "learning_rate": 1.690410564514244e-09, "logits/chosen": -0.49254482984542847, "logits/rejected": -0.45911550521850586, "logps/chosen": -76.04261779785156, "logps/ref_chosen": -65.25657653808594, "logps/ref_rejected": -91.9552993774414, "logps/rejected": -111.62044525146484, "loss": 0.529, "margin_dpo/margin_mean": 8.879097938537598, "margin_dpo/margin_std": 10.679101943969727, "step": 320 }, { "epoch": 0.9848484848484849, "grad_norm": 14.677971839904785, "learning_rate": 5.033308820289184e-10, "logits/chosen": -0.502629280090332, "logits/rejected": -0.4776650071144104, "logps/chosen": -61.78889846801758, "logps/ref_chosen": -53.00225067138672, "logps/ref_rejected": -69.4771957397461, "logps/rejected": -87.58296966552734, "loss": 0.5264, "margin_dpo/margin_mean": 9.319120407104492, "margin_dpo/margin_std": 10.821681022644043, "step": 325 }, { "epoch": 1.0, "grad_norm": 16.924516677856445, "learning_rate": 1.3985977021235829e-11, "logits/chosen": -0.5281952023506165, "logits/rejected": -0.5035934448242188, "logps/chosen": -59.8553352355957, "logps/ref_chosen": -51.018646240234375, "logps/ref_rejected": -74.90043640136719, "logps/rejected": -92.38591003417969, "loss": 0.5287, "margin_dpo/margin_mean": 8.648794174194336, "margin_dpo/margin_std": 10.91873550415039, "step": 330 }, { "epoch": 1.0, "step": 330, "total_flos": 0.0, "train_loss": 0.5836806095007694, "train_runtime": 1387.0612, "train_samples_per_second": 30.522, "train_steps_per_second": 0.238 } ], "logging_steps": 5, "max_steps": 330, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }