Files
llama-3-8b-base-margin-dpo-…/trainer_state.json

10055 lines
356 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999244142101285,
"eval_steps": 100,
"global_step": 661,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015117157974300832,
"grad_norm": 28.220060348510742,
"learning_rate": 0.0,
"logits/chosen": 0.13337239623069763,
"logits/rejected": 0.12492948770523071,
"logps/chosen": -64.5841293334961,
"logps/ref_chosen": -64.61280822753906,
"logps/ref_rejected": -64.17195129394531,
"logps/rejected": -64.14192199707031,
"loss": 1.3866,
"margin_dpo/margin_mean": -0.0013527870178222656,
"margin_dpo/margin_std": 0.2561596930027008,
"step": 1
},
{
"epoch": 0.0030234315948601664,
"grad_norm": 27.82727813720703,
"learning_rate": 7.462686567164179e-09,
"logits/chosen": 0.09414851665496826,
"logits/rejected": 0.07363267242908478,
"logps/chosen": -56.101890563964844,
"logps/ref_chosen": -56.0989990234375,
"logps/ref_rejected": -66.59971618652344,
"logps/rejected": -66.64006042480469,
"loss": 1.3828,
"margin_dpo/margin_mean": 0.03744968771934509,
"margin_dpo/margin_std": 0.27811938524246216,
"step": 2
},
{
"epoch": 0.0045351473922902496,
"grad_norm": 31.255678176879883,
"learning_rate": 1.4925373134328357e-08,
"logits/chosen": 0.09937217831611633,
"logits/rejected": 0.061470769345760345,
"logps/chosen": -65.42631530761719,
"logps/ref_chosen": -65.45726013183594,
"logps/ref_rejected": -90.82853698730469,
"logps/rejected": -90.77711486816406,
"loss": 1.3886,
"margin_dpo/margin_mean": -0.02046513557434082,
"margin_dpo/margin_std": 0.29388636350631714,
"step": 3
},
{
"epoch": 0.006046863189720333,
"grad_norm": 34.38330841064453,
"learning_rate": 2.2388059701492534e-08,
"logits/chosen": 0.11287139356136322,
"logits/rejected": 0.09666792303323746,
"logps/chosen": -76.84639739990234,
"logps/ref_chosen": -76.86018371582031,
"logps/ref_rejected": -79.91523742675781,
"logps/rejected": -79.90673828125,
"loss": 1.386,
"margin_dpo/margin_mean": 0.005287140607833862,
"margin_dpo/margin_std": 0.30723485350608826,
"step": 4
},
{
"epoch": 0.007558578987150416,
"grad_norm": 29.698986053466797,
"learning_rate": 2.9850746268656714e-08,
"logits/chosen": 0.08247077465057373,
"logits/rejected": 0.04365617036819458,
"logps/chosen": -62.99720764160156,
"logps/ref_chosen": -62.97134017944336,
"logps/ref_rejected": -79.91920471191406,
"logps/rejected": -79.87831115722656,
"loss": 1.3933,
"margin_dpo/margin_mean": -0.06676921248435974,
"margin_dpo/margin_std": 0.3261260688304901,
"step": 5
},
{
"epoch": 0.009070294784580499,
"grad_norm": 29.868539810180664,
"learning_rate": 3.731343283582089e-08,
"logits/chosen": 0.14518634974956512,
"logits/rejected": 0.10563785582780838,
"logps/chosen": -51.37147521972656,
"logps/ref_chosen": -51.30736541748047,
"logps/ref_rejected": -82.77239227294922,
"logps/rejected": -82.7464828491211,
"loss": 1.3956,
"margin_dpo/margin_mean": -0.09001976251602173,
"margin_dpo/margin_std": 0.3415699601173401,
"step": 6
},
{
"epoch": 0.010582010582010581,
"grad_norm": 27.209218978881836,
"learning_rate": 4.477611940298507e-08,
"logits/chosen": 0.015805965289473534,
"logits/rejected": -0.027883023023605347,
"logps/chosen": -51.44762420654297,
"logps/ref_chosen": -51.45941162109375,
"logps/ref_rejected": -66.3828125,
"logps/rejected": -66.3854751586914,
"loss": 1.385,
"margin_dpo/margin_mean": 0.014449506998062134,
"margin_dpo/margin_std": 0.22094310820102692,
"step": 7
},
{
"epoch": 0.012093726379440665,
"grad_norm": 28.625896453857422,
"learning_rate": 5.223880597014925e-08,
"logits/chosen": 0.05074763670563698,
"logits/rejected": 0.029051048681139946,
"logps/chosen": -62.23204803466797,
"logps/ref_chosen": -62.19754409790039,
"logps/ref_rejected": -74.66180419921875,
"logps/rejected": -74.68563842773438,
"loss": 1.3877,
"margin_dpo/margin_mean": -0.010671883821487427,
"margin_dpo/margin_std": 0.3669050931930542,
"step": 8
},
{
"epoch": 0.013605442176870748,
"grad_norm": 31.59478759765625,
"learning_rate": 5.970149253731343e-08,
"logits/chosen": 0.15643024444580078,
"logits/rejected": 0.09815741330385208,
"logps/chosen": -55.671485900878906,
"logps/ref_chosen": -55.629722595214844,
"logps/ref_rejected": -86.21221923828125,
"logps/rejected": -86.2544937133789,
"loss": 1.3865,
"margin_dpo/margin_mean": 0.0005104541778564453,
"margin_dpo/margin_std": 0.29080653190612793,
"step": 9
},
{
"epoch": 0.015117157974300832,
"grad_norm": 29.54948616027832,
"learning_rate": 6.71641791044776e-08,
"logits/chosen": 0.15007850527763367,
"logits/rejected": 0.1181110367178917,
"logps/chosen": -62.665287017822266,
"logps/ref_chosen": -62.69060134887695,
"logps/ref_rejected": -90.61012268066406,
"logps/rejected": -90.62495422363281,
"loss": 1.3826,
"margin_dpo/margin_mean": 0.040148526430130005,
"margin_dpo/margin_std": 0.3635545074939728,
"step": 10
},
{
"epoch": 0.016628873771730914,
"grad_norm": 29.36393928527832,
"learning_rate": 7.462686567164178e-08,
"logits/chosen": 0.10430046170949936,
"logits/rejected": 0.09751109778881073,
"logps/chosen": -65.7414321899414,
"logps/ref_chosen": -65.76712036132812,
"logps/ref_rejected": -72.4764633178711,
"logps/rejected": -72.47653198242188,
"loss": 1.384,
"margin_dpo/margin_mean": 0.02575582265853882,
"margin_dpo/margin_std": 0.2989434003829956,
"step": 11
},
{
"epoch": 0.018140589569160998,
"grad_norm": 28.3092098236084,
"learning_rate": 8.208955223880596e-08,
"logits/chosen": 0.026789026334881783,
"logits/rejected": 0.010548613965511322,
"logps/chosen": -60.722389221191406,
"logps/ref_chosen": -60.704891204833984,
"logps/ref_rejected": -69.41564178466797,
"logps/rejected": -69.4146957397461,
"loss": 1.3883,
"margin_dpo/margin_mean": -0.018443971872329712,
"margin_dpo/margin_std": 0.2729582190513611,
"step": 12
},
{
"epoch": 0.019652305366591082,
"grad_norm": 29.16897201538086,
"learning_rate": 8.955223880597014e-08,
"logits/chosen": 0.11780130863189697,
"logits/rejected": 0.054500848054885864,
"logps/chosen": -49.9007568359375,
"logps/ref_chosen": -49.90925216674805,
"logps/ref_rejected": -92.378173828125,
"logps/rejected": -92.35137939453125,
"loss": 1.3884,
"margin_dpo/margin_mean": -0.0182991623878479,
"margin_dpo/margin_std": 0.28390124440193176,
"step": 13
},
{
"epoch": 0.021164021164021163,
"grad_norm": 29.623899459838867,
"learning_rate": 9.701492537313432e-08,
"logits/chosen": 0.08536244928836823,
"logits/rejected": 0.06769540160894394,
"logps/chosen": -60.600833892822266,
"logps/ref_chosen": -60.61879348754883,
"logps/ref_rejected": -71.79306030273438,
"logps/rejected": -71.79493713378906,
"loss": 1.3846,
"margin_dpo/margin_mean": 0.019834458827972412,
"margin_dpo/margin_std": 0.32009801268577576,
"step": 14
},
{
"epoch": 0.022675736961451247,
"grad_norm": 33.00798797607422,
"learning_rate": 1.044776119402985e-07,
"logits/chosen": 0.0717407837510109,
"logits/rejected": 0.02849598601460457,
"logps/chosen": -63.45430374145508,
"logps/ref_chosen": -63.46953582763672,
"logps/ref_rejected": -88.88951110839844,
"logps/rejected": -88.90917205810547,
"loss": 1.383,
"margin_dpo/margin_mean": 0.03488925099372864,
"margin_dpo/margin_std": 0.29492419958114624,
"step": 15
},
{
"epoch": 0.02418745275888133,
"grad_norm": 26.881074905395508,
"learning_rate": 1.1194029850746268e-07,
"logits/chosen": 0.10169962048530579,
"logits/rejected": 0.06554323434829712,
"logps/chosen": -46.536376953125,
"logps/ref_chosen": -46.53229904174805,
"logps/ref_rejected": -74.27534484863281,
"logps/rejected": -74.26747131347656,
"loss": 1.3877,
"margin_dpo/margin_mean": -0.011950835585594177,
"margin_dpo/margin_std": 0.2769685983657837,
"step": 16
},
{
"epoch": 0.025699168556311415,
"grad_norm": 33.024173736572266,
"learning_rate": 1.1940298507462686e-07,
"logits/chosen": 0.06722284853458405,
"logits/rejected": 0.04799235984683037,
"logps/chosen": -64.07486724853516,
"logps/ref_chosen": -64.07783508300781,
"logps/ref_rejected": -86.40876770019531,
"logps/rejected": -86.42149353027344,
"loss": 1.3851,
"margin_dpo/margin_mean": 0.015699952840805054,
"margin_dpo/margin_std": 0.3443329334259033,
"step": 17
},
{
"epoch": 0.027210884353741496,
"grad_norm": 27.680784225463867,
"learning_rate": 1.2686567164179106e-07,
"logits/chosen": 0.09066110849380493,
"logits/rejected": 0.04530249536037445,
"logps/chosen": -44.830833435058594,
"logps/ref_chosen": -44.87433624267578,
"logps/ref_rejected": -70.9760513305664,
"logps/rejected": -70.99610900878906,
"loss": 1.3802,
"margin_dpo/margin_mean": 0.06356379389762878,
"margin_dpo/margin_std": 0.2836337685585022,
"step": 18
},
{
"epoch": 0.02872260015117158,
"grad_norm": 30.68499183654785,
"learning_rate": 1.343283582089552e-07,
"logits/chosen": 0.0559040792286396,
"logits/rejected": 0.04259665310382843,
"logps/chosen": -68.11859130859375,
"logps/ref_chosen": -68.1598129272461,
"logps/ref_rejected": -81.17138671875,
"logps/rejected": -81.19941711425781,
"loss": 1.3796,
"margin_dpo/margin_mean": 0.06925900280475616,
"margin_dpo/margin_std": 0.27486366033554077,
"step": 19
},
{
"epoch": 0.030234315948601664,
"grad_norm": 29.224321365356445,
"learning_rate": 1.4179104477611938e-07,
"logits/chosen": 0.13296857476234436,
"logits/rejected": 0.11000999808311462,
"logps/chosen": -53.684165954589844,
"logps/ref_chosen": -53.678558349609375,
"logps/ref_rejected": -74.16911315917969,
"logps/rejected": -74.16539764404297,
"loss": 1.3874,
"margin_dpo/margin_mean": -0.009327858686447144,
"margin_dpo/margin_std": 0.28060251474380493,
"step": 20
},
{
"epoch": 0.031746031746031744,
"grad_norm": 29.44382667541504,
"learning_rate": 1.4925373134328355e-07,
"logits/chosen": 0.09574359655380249,
"logits/rejected": 0.07080109417438507,
"logps/chosen": -64.70553588867188,
"logps/ref_chosen": -64.70155334472656,
"logps/ref_rejected": -81.02095031738281,
"logps/rejected": -81.00267028808594,
"loss": 1.3887,
"margin_dpo/margin_mean": -0.022250384092330933,
"margin_dpo/margin_std": 0.29801231622695923,
"step": 21
},
{
"epoch": 0.03325774754346183,
"grad_norm": 29.0031795501709,
"learning_rate": 1.5671641791044775e-07,
"logits/chosen": 0.010743018239736557,
"logits/rejected": -0.010531796142458916,
"logps/chosen": -58.05990219116211,
"logps/ref_chosen": -58.03599548339844,
"logps/ref_rejected": -80.72721862792969,
"logps/rejected": -80.74945831298828,
"loss": 1.3867,
"margin_dpo/margin_mean": -0.0016689598560333252,
"margin_dpo/margin_std": 0.27896028757095337,
"step": 22
},
{
"epoch": 0.03476946334089191,
"grad_norm": 32.909122467041016,
"learning_rate": 1.6417910447761193e-07,
"logits/chosen": 0.1421518325805664,
"logits/rejected": 0.11664065718650818,
"logps/chosen": -66.34078216552734,
"logps/ref_chosen": -66.35609436035156,
"logps/ref_rejected": -93.02769470214844,
"logps/rejected": -93.0291976928711,
"loss": 1.3849,
"margin_dpo/margin_mean": 0.01681581139564514,
"margin_dpo/margin_std": 0.3285256624221802,
"step": 23
},
{
"epoch": 0.036281179138321996,
"grad_norm": 26.222774505615234,
"learning_rate": 1.716417910447761e-07,
"logits/chosen": 0.13952991366386414,
"logits/rejected": 0.10637363791465759,
"logps/chosen": -54.47034454345703,
"logps/ref_chosen": -54.461238861083984,
"logps/ref_rejected": -68.33817291259766,
"logps/rejected": -68.36705780029297,
"loss": 1.3845,
"margin_dpo/margin_mean": 0.019780874252319336,
"margin_dpo/margin_std": 0.25748512148857117,
"step": 24
},
{
"epoch": 0.03779289493575208,
"grad_norm": 29.700237274169922,
"learning_rate": 1.7910447761194027e-07,
"logits/chosen": 0.10331503301858902,
"logits/rejected": 0.05197351798415184,
"logps/chosen": -60.05360794067383,
"logps/ref_chosen": -60.00420379638672,
"logps/ref_rejected": -90.47376251220703,
"logps/rejected": -90.52792358398438,
"loss": 1.3861,
"margin_dpo/margin_mean": 0.0047473907470703125,
"margin_dpo/margin_std": 0.3563765287399292,
"step": 25
},
{
"epoch": 0.039304610733182165,
"grad_norm": 29.708221435546875,
"learning_rate": 1.8656716417910447e-07,
"logits/chosen": 0.12899595499038696,
"logits/rejected": 0.10993089526891708,
"logps/chosen": -56.8546028137207,
"logps/ref_chosen": -56.81915283203125,
"logps/ref_rejected": -77.84333038330078,
"logps/rejected": -77.90251159667969,
"loss": 1.3842,
"margin_dpo/margin_mean": 0.02373906970024109,
"margin_dpo/margin_std": 0.30973872542381287,
"step": 26
},
{
"epoch": 0.04081632653061224,
"grad_norm": 29.129623413085938,
"learning_rate": 1.9402985074626865e-07,
"logits/chosen": 0.10666880756616592,
"logits/rejected": 0.08166207373142242,
"logps/chosen": -62.90531921386719,
"logps/ref_chosen": -62.87702178955078,
"logps/ref_rejected": -71.34437561035156,
"logps/rejected": -71.36808776855469,
"loss": 1.387,
"margin_dpo/margin_mean": -0.004584580659866333,
"margin_dpo/margin_std": 0.285133421421051,
"step": 27
},
{
"epoch": 0.042328042328042326,
"grad_norm": 27.702333450317383,
"learning_rate": 2.0149253731343282e-07,
"logits/chosen": 0.07180536538362503,
"logits/rejected": 0.06273311376571655,
"logps/chosen": -59.86304473876953,
"logps/ref_chosen": -59.833377838134766,
"logps/ref_rejected": -70.39804077148438,
"logps/rejected": -70.39251708984375,
"loss": 1.39,
"margin_dpo/margin_mean": -0.03518790006637573,
"margin_dpo/margin_std": 0.2781359553337097,
"step": 28
},
{
"epoch": 0.04383975812547241,
"grad_norm": 32.63071823120117,
"learning_rate": 2.08955223880597e-07,
"logits/chosen": 0.12691108882427216,
"logits/rejected": 0.10950787365436554,
"logps/chosen": -74.12796783447266,
"logps/ref_chosen": -74.12020111083984,
"logps/ref_rejected": -83.33098602294922,
"logps/rejected": -83.3443603515625,
"loss": 1.3859,
"margin_dpo/margin_mean": 0.005606889724731445,
"margin_dpo/margin_std": 0.2921278774738312,
"step": 29
},
{
"epoch": 0.045351473922902494,
"grad_norm": 30.414403915405273,
"learning_rate": 2.1641791044776117e-07,
"logits/chosen": 0.1324155628681183,
"logits/rejected": 0.0772828459739685,
"logps/chosen": -50.75823211669922,
"logps/ref_chosen": -50.75128936767578,
"logps/ref_rejected": -89.29063415527344,
"logps/rejected": -89.32701873779297,
"loss": 1.3836,
"margin_dpo/margin_mean": 0.02944222092628479,
"margin_dpo/margin_std": 0.33608317375183105,
"step": 30
},
{
"epoch": 0.04686318972033258,
"grad_norm": 34.32074737548828,
"learning_rate": 2.2388059701492537e-07,
"logits/chosen": 0.11036145687103271,
"logits/rejected": 0.06397197395563126,
"logps/chosen": -65.36607360839844,
"logps/ref_chosen": -65.33675384521484,
"logps/ref_rejected": -100.76666259765625,
"logps/rejected": -100.819091796875,
"loss": 1.3842,
"margin_dpo/margin_mean": 0.023108333349227905,
"margin_dpo/margin_std": 0.31038618087768555,
"step": 31
},
{
"epoch": 0.04837490551776266,
"grad_norm": 30.099220275878906,
"learning_rate": 2.3134328358208954e-07,
"logits/chosen": 0.08162057399749756,
"logits/rejected": 0.07382632791996002,
"logps/chosen": -67.17050170898438,
"logps/ref_chosen": -67.18333435058594,
"logps/ref_rejected": -82.80763244628906,
"logps/rejected": -82.83109283447266,
"loss": 1.3829,
"margin_dpo/margin_mean": 0.036289215087890625,
"margin_dpo/margin_std": 0.3070800304412842,
"step": 32
},
{
"epoch": 0.049886621315192746,
"grad_norm": 31.10877227783203,
"learning_rate": 2.388059701492537e-07,
"logits/chosen": 0.043758779764175415,
"logits/rejected": 0.01711263135075569,
"logps/chosen": -64.09259033203125,
"logps/ref_chosen": -64.03947448730469,
"logps/ref_rejected": -75.68357849121094,
"logps/rejected": -75.74140167236328,
"loss": 1.3861,
"margin_dpo/margin_mean": 0.004706323146820068,
"margin_dpo/margin_std": 0.3460730314254761,
"step": 33
},
{
"epoch": 0.05139833711262283,
"grad_norm": 28.492658615112305,
"learning_rate": 2.4626865671641786e-07,
"logits/chosen": 0.09808081388473511,
"logits/rejected": 0.06829625368118286,
"logps/chosen": -53.72270202636719,
"logps/ref_chosen": -53.66429901123047,
"logps/ref_rejected": -65.77989196777344,
"logps/rejected": -65.87895202636719,
"loss": 1.3825,
"margin_dpo/margin_mean": 0.040650635957717896,
"margin_dpo/margin_std": 0.2930639982223511,
"step": 34
},
{
"epoch": 0.05291005291005291,
"grad_norm": 27.739458084106445,
"learning_rate": 2.537313432835821e-07,
"logits/chosen": 0.04755338653922081,
"logits/rejected": 0.02539633959531784,
"logps/chosen": -61.09136962890625,
"logps/ref_chosen": -61.01686096191406,
"logps/ref_rejected": -72.78598022460938,
"logps/rejected": -72.928955078125,
"loss": 1.3797,
"margin_dpo/margin_mean": 0.06845930218696594,
"margin_dpo/margin_std": 0.3211126923561096,
"step": 35
},
{
"epoch": 0.05442176870748299,
"grad_norm": 28.59171485900879,
"learning_rate": 2.611940298507462e-07,
"logits/chosen": 0.10078567266464233,
"logits/rejected": 0.04806087166070938,
"logps/chosen": -50.616050720214844,
"logps/ref_chosen": -50.53736114501953,
"logps/ref_rejected": -78.11678314208984,
"logps/rejected": -78.25202941894531,
"loss": 1.381,
"margin_dpo/margin_mean": 0.05655008554458618,
"margin_dpo/margin_std": 0.3539975881576538,
"step": 36
},
{
"epoch": 0.055933484504913075,
"grad_norm": 37.603458404541016,
"learning_rate": 2.686567164179104e-07,
"logits/chosen": 0.09838317334651947,
"logits/rejected": 0.019203372299671173,
"logps/chosen": -59.59420394897461,
"logps/ref_chosen": -59.55394744873047,
"logps/ref_rejected": -108.27703094482422,
"logps/rejected": -108.44245910644531,
"loss": 1.3742,
"margin_dpo/margin_mean": 0.12516844272613525,
"margin_dpo/margin_std": 0.38829922676086426,
"step": 37
},
{
"epoch": 0.05744520030234316,
"grad_norm": 29.565021514892578,
"learning_rate": 2.761194029850746e-07,
"logits/chosen": 0.06575263291597366,
"logits/rejected": 0.05181782692670822,
"logps/chosen": -65.838134765625,
"logps/ref_chosen": -65.7883529663086,
"logps/ref_rejected": -76.1619873046875,
"logps/rejected": -76.24898529052734,
"loss": 1.383,
"margin_dpo/margin_mean": 0.037221550941467285,
"margin_dpo/margin_std": 0.39967113733291626,
"step": 38
},
{
"epoch": 0.05895691609977324,
"grad_norm": 29.25869369506836,
"learning_rate": 2.8358208955223876e-07,
"logits/chosen": 0.13992644846439362,
"logits/rejected": 0.1137220561504364,
"logps/chosen": -57.26211166381836,
"logps/ref_chosen": -57.17680358886719,
"logps/ref_rejected": -79.486328125,
"logps/rejected": -79.61747741699219,
"loss": 1.3822,
"margin_dpo/margin_mean": 0.04584622383117676,
"margin_dpo/margin_std": 0.43921124935150146,
"step": 39
},
{
"epoch": 0.06046863189720333,
"grad_norm": 31.404504776000977,
"learning_rate": 2.9104477611940296e-07,
"logits/chosen": 0.12540677189826965,
"logits/rejected": 0.07498523592948914,
"logps/chosen": -61.43571472167969,
"logps/ref_chosen": -61.33416748046875,
"logps/ref_rejected": -79.10697174072266,
"logps/rejected": -79.22048950195312,
"loss": 1.3853,
"margin_dpo/margin_mean": 0.0119723379611969,
"margin_dpo/margin_std": 0.2859070897102356,
"step": 40
},
{
"epoch": 0.06198034769463341,
"grad_norm": 30.36787986755371,
"learning_rate": 2.985074626865671e-07,
"logits/chosen": 0.02323339134454727,
"logits/rejected": 0.00391228124499321,
"logps/chosen": -67.64244842529297,
"logps/ref_chosen": -67.54672241210938,
"logps/ref_rejected": -83.87788391113281,
"logps/rejected": -84.09368133544922,
"loss": 1.3747,
"margin_dpo/margin_mean": 0.12007108330726624,
"margin_dpo/margin_std": 0.36786937713623047,
"step": 41
},
{
"epoch": 0.06349206349206349,
"grad_norm": 29.299169540405273,
"learning_rate": 3.059701492537313e-07,
"logits/chosen": 0.058651238679885864,
"logits/rejected": 0.03681695833802223,
"logps/chosen": -61.39659881591797,
"logps/ref_chosen": -61.26485824584961,
"logps/ref_rejected": -76.3629150390625,
"logps/rejected": -76.49739074707031,
"loss": 1.3865,
"margin_dpo/margin_mean": 0.00273972749710083,
"margin_dpo/margin_std": 0.4193479120731354,
"step": 42
},
{
"epoch": 0.06500377928949358,
"grad_norm": 34.54471206665039,
"learning_rate": 3.134328358208955e-07,
"logits/chosen": 0.08210780471563339,
"logits/rejected": 0.07124543190002441,
"logps/chosen": -71.9305419921875,
"logps/ref_chosen": -71.80902862548828,
"logps/ref_rejected": -81.12464141845703,
"logps/rejected": -81.28547668457031,
"loss": 1.3827,
"margin_dpo/margin_mean": 0.0393202006816864,
"margin_dpo/margin_std": 0.37330394983291626,
"step": 43
},
{
"epoch": 0.06651549508692366,
"grad_norm": 32.71466064453125,
"learning_rate": 3.2089552238805965e-07,
"logits/chosen": 0.022556209936738014,
"logits/rejected": -0.006862609181553125,
"logps/chosen": -66.73717498779297,
"logps/ref_chosen": -66.55043029785156,
"logps/ref_rejected": -85.06198120117188,
"logps/rejected": -85.24290466308594,
"loss": 1.3874,
"margin_dpo/margin_mean": -0.005816161632537842,
"margin_dpo/margin_std": 0.4439757466316223,
"step": 44
},
{
"epoch": 0.06802721088435375,
"grad_norm": 31.814245223999023,
"learning_rate": 3.2835820895522385e-07,
"logits/chosen": 0.10233305394649506,
"logits/rejected": 0.05074525997042656,
"logps/chosen": -62.383018493652344,
"logps/ref_chosen": -62.243858337402344,
"logps/ref_rejected": -92.96665954589844,
"logps/rejected": -93.20918273925781,
"loss": 1.3763,
"margin_dpo/margin_mean": 0.10335150361061096,
"margin_dpo/margin_std": 0.3743368983268738,
"step": 45
},
{
"epoch": 0.06953892668178382,
"grad_norm": 31.20781707763672,
"learning_rate": 3.3582089552238805e-07,
"logits/chosen": 0.13038724660873413,
"logits/rejected": 0.08414691686630249,
"logps/chosen": -61.61685562133789,
"logps/ref_chosen": -61.498905181884766,
"logps/ref_rejected": -78.91172790527344,
"logps/rejected": -79.18026733398438,
"loss": 1.3718,
"margin_dpo/margin_mean": 0.1505853533744812,
"margin_dpo/margin_std": 0.43144309520721436,
"step": 46
},
{
"epoch": 0.0710506424792139,
"grad_norm": 28.37842559814453,
"learning_rate": 3.432835820895522e-07,
"logits/chosen": 0.030327381566166878,
"logits/rejected": -0.012373650446534157,
"logps/chosen": -51.69384002685547,
"logps/ref_chosen": -51.578346252441406,
"logps/ref_rejected": -68.2215576171875,
"logps/rejected": -68.53022003173828,
"loss": 1.3674,
"margin_dpo/margin_mean": 0.19316792488098145,
"margin_dpo/margin_std": 0.33303409814834595,
"step": 47
},
{
"epoch": 0.07256235827664399,
"grad_norm": 26.839597702026367,
"learning_rate": 3.507462686567164e-07,
"logits/chosen": 0.15095138549804688,
"logits/rejected": 0.12109607458114624,
"logps/chosen": -52.03797912597656,
"logps/ref_chosen": -51.79365158081055,
"logps/ref_rejected": -64.22504425048828,
"logps/rejected": -64.51231384277344,
"loss": 1.3824,
"margin_dpo/margin_mean": 0.042948633432388306,
"margin_dpo/margin_std": 0.3989154100418091,
"step": 48
},
{
"epoch": 0.07407407407407407,
"grad_norm": 27.287622451782227,
"learning_rate": 3.5820895522388055e-07,
"logits/chosen": 0.001452181488275528,
"logits/rejected": -0.01914474181830883,
"logps/chosen": -58.36638641357422,
"logps/ref_chosen": -58.13460159301758,
"logps/ref_rejected": -64.63206481933594,
"logps/rejected": -64.96180725097656,
"loss": 1.3769,
"margin_dpo/margin_mean": 0.09795981645584106,
"margin_dpo/margin_std": 0.40706026554107666,
"step": 49
},
{
"epoch": 0.07558578987150416,
"grad_norm": 28.038366317749023,
"learning_rate": 3.6567164179104475e-07,
"logits/chosen": 0.10713882744312286,
"logits/rejected": 0.07751593738794327,
"logps/chosen": -53.1766357421875,
"logps/ref_chosen": -52.85643768310547,
"logps/ref_rejected": -72.17460632324219,
"logps/rejected": -72.57747650146484,
"loss": 1.3785,
"margin_dpo/margin_mean": 0.0826747715473175,
"margin_dpo/margin_std": 0.44336575269699097,
"step": 50
},
{
"epoch": 0.07709750566893424,
"grad_norm": 31.035123825073242,
"learning_rate": 3.7313432835820895e-07,
"logits/chosen": 0.1037454828619957,
"logits/rejected": 0.07543984055519104,
"logps/chosen": -63.92970657348633,
"logps/ref_chosen": -63.65644073486328,
"logps/ref_rejected": -86.1323013305664,
"logps/rejected": -86.58238220214844,
"loss": 1.3694,
"margin_dpo/margin_mean": 0.17682453989982605,
"margin_dpo/margin_std": 0.5223456621170044,
"step": 51
},
{
"epoch": 0.07860922146636433,
"grad_norm": 32.71433639526367,
"learning_rate": 3.805970149253731e-07,
"logits/chosen": 0.10190063714981079,
"logits/rejected": 0.0511334203183651,
"logps/chosen": -68.15455627441406,
"logps/ref_chosen": -67.8402099609375,
"logps/ref_rejected": -96.97091674804688,
"logps/rejected": -97.4761962890625,
"loss": 1.3681,
"margin_dpo/margin_mean": 0.19093959033489227,
"margin_dpo/margin_std": 0.560549795627594,
"step": 52
},
{
"epoch": 0.0801209372637944,
"grad_norm": 27.500490188598633,
"learning_rate": 3.880597014925373e-07,
"logits/chosen": 0.08714406192302704,
"logits/rejected": 0.07636132836341858,
"logps/chosen": -57.211326599121094,
"logps/ref_chosen": -56.87813949584961,
"logps/ref_rejected": -60.75569152832031,
"logps/rejected": -61.279624938964844,
"loss": 1.3678,
"margin_dpo/margin_mean": 0.1907462179660797,
"margin_dpo/margin_std": 0.4284651279449463,
"step": 53
},
{
"epoch": 0.08163265306122448,
"grad_norm": 26.583499908447266,
"learning_rate": 3.9552238805970144e-07,
"logits/chosen": 0.07808351516723633,
"logits/rejected": 0.06246686726808548,
"logps/chosen": -47.68182373046875,
"logps/ref_chosen": -47.26692199707031,
"logps/ref_rejected": -62.19426727294922,
"logps/rejected": -62.76362228393555,
"loss": 1.3716,
"margin_dpo/margin_mean": 0.15445497632026672,
"margin_dpo/margin_std": 0.5264220237731934,
"step": 54
},
{
"epoch": 0.08314436885865457,
"grad_norm": 31.259164810180664,
"learning_rate": 4.0298507462686564e-07,
"logits/chosen": 0.04221351444721222,
"logits/rejected": -0.034855540841817856,
"logps/chosen": -50.701995849609375,
"logps/ref_chosen": -50.32619094848633,
"logps/ref_rejected": -92.44389343261719,
"logps/rejected": -93.12765502929688,
"loss": 1.357,
"margin_dpo/margin_mean": 0.30795878171920776,
"margin_dpo/margin_std": 0.7030289173126221,
"step": 55
},
{
"epoch": 0.08465608465608465,
"grad_norm": 27.52784538269043,
"learning_rate": 4.1044776119402984e-07,
"logits/chosen": 0.11271204054355621,
"logits/rejected": 0.09056483209133148,
"logps/chosen": -57.138084411621094,
"logps/ref_chosen": -56.766971588134766,
"logps/ref_rejected": -66.30503845214844,
"logps/rejected": -66.86466979980469,
"loss": 1.3686,
"margin_dpo/margin_mean": 0.18851301074028015,
"margin_dpo/margin_std": 0.6414389610290527,
"step": 56
},
{
"epoch": 0.08616780045351474,
"grad_norm": 30.54404067993164,
"learning_rate": 4.17910447761194e-07,
"logits/chosen": 0.11222894489765167,
"logits/rejected": 0.047266341745853424,
"logps/chosen": -58.253807067871094,
"logps/ref_chosen": -57.76774597167969,
"logps/ref_rejected": -82.75698852539062,
"logps/rejected": -83.54388427734375,
"loss": 1.3576,
"margin_dpo/margin_mean": 0.30083510279655457,
"margin_dpo/margin_std": 0.6732007265090942,
"step": 57
},
{
"epoch": 0.08767951625094482,
"grad_norm": 30.254480361938477,
"learning_rate": 4.253731343283582e-07,
"logits/chosen": 0.05982009693980217,
"logits/rejected": 0.04435000568628311,
"logps/chosen": -73.35845947265625,
"logps/ref_chosen": -72.76408386230469,
"logps/ref_rejected": -84.49275207519531,
"logps/rejected": -85.32954406738281,
"loss": 1.3651,
"margin_dpo/margin_mean": 0.24241399765014648,
"margin_dpo/margin_std": 1.0065019130706787,
"step": 58
},
{
"epoch": 0.08919123204837491,
"grad_norm": 26.581438064575195,
"learning_rate": 4.3283582089552234e-07,
"logits/chosen": 0.1372973918914795,
"logits/rejected": 0.0706198588013649,
"logps/chosen": -50.35957336425781,
"logps/ref_chosen": -49.82077407836914,
"logps/ref_rejected": -77.14368438720703,
"logps/rejected": -77.98661041259766,
"loss": 1.3579,
"margin_dpo/margin_mean": 0.30412447452545166,
"margin_dpo/margin_std": 0.7922423481941223,
"step": 59
},
{
"epoch": 0.09070294784580499,
"grad_norm": 29.44552993774414,
"learning_rate": 4.4029850746268654e-07,
"logits/chosen": 0.12459641695022583,
"logits/rejected": 0.12324239313602448,
"logps/chosen": -63.80072784423828,
"logps/ref_chosen": -63.22477340698242,
"logps/ref_rejected": -61.360477447509766,
"logps/rejected": -62.06523132324219,
"loss": 1.3751,
"margin_dpo/margin_mean": 0.1287935972213745,
"margin_dpo/margin_std": 0.8118811845779419,
"step": 60
},
{
"epoch": 0.09221466364323508,
"grad_norm": 28.075557708740234,
"learning_rate": 4.4776119402985074e-07,
"logits/chosen": 0.1363564431667328,
"logits/rejected": 0.10384637117385864,
"logps/chosen": -49.75391387939453,
"logps/ref_chosen": -49.01679992675781,
"logps/ref_rejected": -74.90817260742188,
"logps/rejected": -75.72248840332031,
"loss": 1.3813,
"margin_dpo/margin_mean": 0.0772022008895874,
"margin_dpo/margin_std": 1.0377774238586426,
"step": 61
},
{
"epoch": 0.09372637944066516,
"grad_norm": 28.631183624267578,
"learning_rate": 4.552238805970149e-07,
"logits/chosen": 0.11217498779296875,
"logits/rejected": 0.07311881333589554,
"logps/chosen": -63.50098419189453,
"logps/ref_chosen": -62.751869201660156,
"logps/ref_rejected": -78.93360900878906,
"logps/rejected": -79.89679718017578,
"loss": 1.3671,
"margin_dpo/margin_mean": 0.2140759527683258,
"margin_dpo/margin_std": 0.912464439868927,
"step": 62
},
{
"epoch": 0.09523809523809523,
"grad_norm": 31.452425003051758,
"learning_rate": 4.626865671641791e-07,
"logits/chosen": 0.1870669424533844,
"logits/rejected": 0.1623249351978302,
"logps/chosen": -61.0701904296875,
"logps/ref_chosen": -60.51525115966797,
"logps/ref_rejected": -85.11021423339844,
"logps/rejected": -86.22477722167969,
"loss": 1.3328,
"margin_dpo/margin_mean": 0.5596264600753784,
"margin_dpo/margin_std": 0.8306376934051514,
"step": 63
},
{
"epoch": 0.09674981103552532,
"grad_norm": 26.20331573486328,
"learning_rate": 4.701492537313433e-07,
"logits/chosen": 0.0834483653306961,
"logits/rejected": 0.05883026495575905,
"logps/chosen": -52.03789520263672,
"logps/ref_chosen": -51.20684814453125,
"logps/ref_rejected": -66.93082427978516,
"logps/rejected": -67.8565673828125,
"loss": 1.3788,
"margin_dpo/margin_mean": 0.09469178318977356,
"margin_dpo/margin_std": 0.8809771537780762,
"step": 64
},
{
"epoch": 0.0982615268329554,
"grad_norm": 30.83704376220703,
"learning_rate": 4.776119402985074e-07,
"logits/chosen": 0.1760270595550537,
"logits/rejected": 0.14696374535560608,
"logps/chosen": -68.1497802734375,
"logps/ref_chosen": -67.2886962890625,
"logps/ref_rejected": -74.44281005859375,
"logps/rejected": -75.79031372070312,
"loss": 1.3414,
"margin_dpo/margin_mean": 0.48643139004707336,
"margin_dpo/margin_std": 1.1251728534698486,
"step": 65
},
{
"epoch": 0.09977324263038549,
"grad_norm": 29.31989860534668,
"learning_rate": 4.850746268656717e-07,
"logits/chosen": 0.09745411574840546,
"logits/rejected": 0.07364879548549652,
"logps/chosen": -71.69542694091797,
"logps/ref_chosen": -70.743408203125,
"logps/ref_rejected": -77.26499938964844,
"logps/rejected": -78.53517150878906,
"loss": 1.3582,
"margin_dpo/margin_mean": 0.31815215945243835,
"margin_dpo/margin_std": 1.1648637056350708,
"step": 66
},
{
"epoch": 0.10128495842781557,
"grad_norm": 29.129894256591797,
"learning_rate": 4.925373134328357e-07,
"logits/chosen": 0.09226509928703308,
"logits/rejected": 0.0366465225815773,
"logps/chosen": -61.3585319519043,
"logps/ref_chosen": -60.60260009765625,
"logps/ref_rejected": -75.22235870361328,
"logps/rejected": -76.31483459472656,
"loss": 1.3551,
"margin_dpo/margin_mean": 0.33653974533081055,
"margin_dpo/margin_std": 0.923335611820221,
"step": 67
},
{
"epoch": 0.10279667422524566,
"grad_norm": 31.375276565551758,
"learning_rate": 5e-07,
"logits/chosen": 0.060302793979644775,
"logits/rejected": 0.030451811850070953,
"logps/chosen": -78.77995300292969,
"logps/ref_chosen": -77.52836608886719,
"logps/ref_rejected": -93.17778015136719,
"logps/rejected": -94.77074432373047,
"loss": 1.3568,
"margin_dpo/margin_mean": 0.3413764536380768,
"margin_dpo/margin_std": 1.2794163227081299,
"step": 68
},
{
"epoch": 0.10430839002267574,
"grad_norm": 31.02152442932129,
"learning_rate": 4.999965034812934e-07,
"logits/chosen": 0.09882189333438873,
"logits/rejected": 0.05591355264186859,
"logps/chosen": -67.13074493408203,
"logps/ref_chosen": -65.94305419921875,
"logps/ref_rejected": -89.7735595703125,
"logps/rejected": -91.55255126953125,
"loss": 1.3319,
"margin_dpo/margin_mean": 0.5912973880767822,
"margin_dpo/margin_std": 1.2093827724456787,
"step": 69
},
{
"epoch": 0.10582010582010581,
"grad_norm": 28.711166381835938,
"learning_rate": 4.999860140229787e-07,
"logits/chosen": 0.13724008202552795,
"logits/rejected": 0.11432601511478424,
"logps/chosen": -63.15776824951172,
"logps/ref_chosen": -61.957908630371094,
"logps/ref_rejected": -75.80946350097656,
"logps/rejected": -77.35932159423828,
"loss": 1.3551,
"margin_dpo/margin_mean": 0.35000741481781006,
"margin_dpo/margin_std": 1.1883422136306763,
"step": 70
},
{
"epoch": 0.1073318216175359,
"grad_norm": 28.578266143798828,
"learning_rate": 4.999685319184688e-07,
"logits/chosen": 0.07535459101200104,
"logits/rejected": 0.060557231307029724,
"logps/chosen": -64.78665161132812,
"logps/ref_chosen": -63.34757995605469,
"logps/ref_rejected": -67.49658203125,
"logps/rejected": -69.17135620117188,
"loss": 1.3678,
"margin_dpo/margin_mean": 0.23569674789905548,
"margin_dpo/margin_std": 1.394590139389038,
"step": 71
},
{
"epoch": 0.10884353741496598,
"grad_norm": 30.338516235351562,
"learning_rate": 4.999440576567755e-07,
"logits/chosen": 0.13761760294437408,
"logits/rejected": 0.07318543642759323,
"logps/chosen": -57.080726623535156,
"logps/ref_chosen": -55.85929870605469,
"logps/ref_rejected": -68.45423889160156,
"logps/rejected": -70.37165069580078,
"loss": 1.3222,
"margin_dpo/margin_mean": 0.6959859132766724,
"margin_dpo/margin_std": 1.2814478874206543,
"step": 72
},
{
"epoch": 0.11035525321239607,
"grad_norm": 31.91270637512207,
"learning_rate": 4.999125919224965e-07,
"logits/chosen": 0.09682485461235046,
"logits/rejected": 0.0830550342798233,
"logps/chosen": -70.92161560058594,
"logps/ref_chosen": -69.13880920410156,
"logps/ref_rejected": -79.04586791992188,
"logps/rejected": -81.02145385742188,
"loss": 1.3739,
"margin_dpo/margin_mean": 0.19277739524841309,
"margin_dpo/margin_std": 1.5637190341949463,
"step": 73
},
{
"epoch": 0.11186696900982615,
"grad_norm": 27.909252166748047,
"learning_rate": 4.998741355957963e-07,
"logits/chosen": 0.12167972326278687,
"logits/rejected": 0.07026355713605881,
"logps/chosen": -51.18006134033203,
"logps/ref_chosen": -49.923736572265625,
"logps/ref_rejected": -81.73213958740234,
"logps/rejected": -83.69065856933594,
"loss": 1.3217,
"margin_dpo/margin_mean": 0.7022018432617188,
"margin_dpo/margin_std": 1.275315523147583,
"step": 74
},
{
"epoch": 0.11337868480725624,
"grad_norm": 26.173181533813477,
"learning_rate": 4.998286897523808e-07,
"logits/chosen": 0.12206012010574341,
"logits/rejected": 0.08930613100528717,
"logps/chosen": -47.47442626953125,
"logps/ref_chosen": -46.06875228881836,
"logps/ref_rejected": -66.1181411743164,
"logps/rejected": -68.2774658203125,
"loss": 1.3168,
"margin_dpo/margin_mean": 0.7536484003067017,
"margin_dpo/margin_std": 1.3362785577774048,
"step": 75
},
{
"epoch": 0.11489040060468632,
"grad_norm": 29.238500595092773,
"learning_rate": 4.997762556634679e-07,
"logits/chosen": 0.11465884745121002,
"logits/rejected": 0.07164441794157028,
"logps/chosen": -55.64056396484375,
"logps/ref_chosen": -54.06275177001953,
"logps/ref_rejected": -74.87464141845703,
"logps/rejected": -76.78042602539062,
"loss": 1.3597,
"margin_dpo/margin_mean": 0.32796770334243774,
"margin_dpo/margin_std": 1.5447661876678467,
"step": 76
},
{
"epoch": 0.1164021164021164,
"grad_norm": 29.32517433166504,
"learning_rate": 4.99716834795752e-07,
"logits/chosen": 0.1788126826286316,
"logits/rejected": 0.13609115779399872,
"logps/chosen": -54.69091796875,
"logps/ref_chosen": -53.07609176635742,
"logps/ref_rejected": -74.45601654052734,
"logps/rejected": -76.81924438476562,
"loss": 1.3182,
"margin_dpo/margin_mean": 0.748406171798706,
"margin_dpo/margin_std": 1.4287664890289307,
"step": 77
},
{
"epoch": 0.11791383219954649,
"grad_norm": 29.321949005126953,
"learning_rate": 4.996504288113623e-07,
"logits/chosen": 0.10718972980976105,
"logits/rejected": 0.08687709271907806,
"logps/chosen": -69.55426025390625,
"logps/ref_chosen": -67.72541809082031,
"logps/ref_rejected": -79.03927612304688,
"logps/rejected": -81.33000183105469,
"loss": 1.3456,
"margin_dpo/margin_mean": 0.4618911147117615,
"margin_dpo/margin_std": 1.4008138179779053,
"step": 78
},
{
"epoch": 0.11942554799697656,
"grad_norm": 30.852628707885742,
"learning_rate": 4.995770395678171e-07,
"logits/chosen": 0.20388326048851013,
"logits/rejected": 0.1409316062927246,
"logps/chosen": -53.90932083129883,
"logps/ref_chosen": -52.16064453125,
"logps/ref_rejected": -83.31062316894531,
"logps/rejected": -86.15391540527344,
"loss": 1.29,
"margin_dpo/margin_mean": 1.0946189165115356,
"margin_dpo/margin_std": 2.0189208984375,
"step": 79
},
{
"epoch": 0.12093726379440665,
"grad_norm": 28.484214782714844,
"learning_rate": 4.994966691179711e-07,
"logits/chosen": 0.1793014109134674,
"logits/rejected": 0.11673000454902649,
"logps/chosen": -63.449790954589844,
"logps/ref_chosen": -61.410560607910156,
"logps/ref_rejected": -78.66004943847656,
"logps/rejected": -81.23995971679688,
"loss": 1.3423,
"margin_dpo/margin_mean": 0.5406800508499146,
"margin_dpo/margin_std": 1.8861579895019531,
"step": 80
},
{
"epoch": 0.12244897959183673,
"grad_norm": 29.091909408569336,
"learning_rate": 4.994093197099587e-07,
"logits/chosen": 0.1445290595293045,
"logits/rejected": 0.11002925038337708,
"logps/chosen": -65.95135498046875,
"logps/ref_chosen": -63.80437088012695,
"logps/ref_rejected": -79.34840393066406,
"logps/rejected": -82.34586334228516,
"loss": 1.3111,
"margin_dpo/margin_mean": 0.8504737615585327,
"margin_dpo/margin_std": 1.7258979082107544,
"step": 81
},
{
"epoch": 0.12396069538926682,
"grad_norm": 26.700366973876953,
"learning_rate": 4.993149937871306e-07,
"logits/chosen": 0.12470388412475586,
"logits/rejected": 0.0606868639588356,
"logps/chosen": -50.59052276611328,
"logps/ref_chosen": -48.817893981933594,
"logps/ref_rejected": -70.31497955322266,
"logps/rejected": -73.132080078125,
"loss": 1.2905,
"margin_dpo/margin_mean": 1.0444717407226562,
"margin_dpo/margin_std": 1.529382586479187,
"step": 82
},
{
"epoch": 0.1254724111866969,
"grad_norm": 29.110549926757812,
"learning_rate": 4.992136939879856e-07,
"logits/chosen": 0.2097686380147934,
"logits/rejected": 0.15886147320270538,
"logps/chosen": -59.19554138183594,
"logps/ref_chosen": -57.15077209472656,
"logps/ref_rejected": -75.1710205078125,
"logps/rejected": -78.39938354492188,
"loss": 1.2798,
"margin_dpo/margin_mean": 1.183598518371582,
"margin_dpo/margin_std": 1.8257801532745361,
"step": 83
},
{
"epoch": 0.12698412698412698,
"grad_norm": 30.618595123291016,
"learning_rate": 4.991054231460969e-07,
"logits/chosen": 0.17933443188667297,
"logits/rejected": 0.13655216991901398,
"logps/chosen": -67.2566146850586,
"logps/ref_chosen": -64.77730560302734,
"logps/ref_rejected": -84.71949768066406,
"logps/rejected": -88.12132263183594,
"loss": 1.3053,
"margin_dpo/margin_mean": 0.9225126504898071,
"margin_dpo/margin_std": 1.9059677124023438,
"step": 84
},
{
"epoch": 0.12849584278155707,
"grad_norm": 26.86264419555664,
"learning_rate": 4.989901842900325e-07,
"logits/chosen": 0.17478173971176147,
"logits/rejected": 0.13025707006454468,
"logps/chosen": -52.47950744628906,
"logps/ref_chosen": -50.25169372558594,
"logps/ref_rejected": -66.55438995361328,
"logps/rejected": -70.04747009277344,
"loss": 1.274,
"margin_dpo/margin_mean": 1.265273094177246,
"margin_dpo/margin_std": 1.9645485877990723,
"step": 85
},
{
"epoch": 0.13000755857898716,
"grad_norm": 27.148221969604492,
"learning_rate": 4.988679806432711e-07,
"logits/chosen": 0.20728754997253418,
"logits/rejected": 0.1880839616060257,
"logps/chosen": -63.547637939453125,
"logps/ref_chosen": -60.72917938232422,
"logps/ref_rejected": -72.30960845947266,
"logps/rejected": -76.11151123046875,
"loss": 1.3012,
"margin_dpo/margin_mean": 0.9834346771240234,
"margin_dpo/margin_std": 2.0893325805664062,
"step": 86
},
{
"epoch": 0.13151927437641722,
"grad_norm": 30.48325538635254,
"learning_rate": 4.987388156241114e-07,
"logits/chosen": 0.17650364339351654,
"logits/rejected": 0.11666995286941528,
"logps/chosen": -68.77723693847656,
"logps/ref_chosen": -65.75796508789062,
"logps/ref_rejected": -84.81159973144531,
"logps/rejected": -88.91807556152344,
"loss": 1.2962,
"margin_dpo/margin_mean": 1.0872149467468262,
"margin_dpo/margin_std": 2.498264789581299,
"step": 87
},
{
"epoch": 0.1330309901738473,
"grad_norm": 30.30896759033203,
"learning_rate": 4.986026928455767e-07,
"logits/chosen": 0.22927185893058777,
"logits/rejected": 0.20243728160858154,
"logps/chosen": -65.86044311523438,
"logps/ref_chosen": -62.82402801513672,
"logps/ref_rejected": -74.9607162475586,
"logps/rejected": -78.82502746582031,
"loss": 1.3276,
"margin_dpo/margin_mean": 0.8278965950012207,
"margin_dpo/margin_std": 2.913116455078125,
"step": 88
},
{
"epoch": 0.1345427059712774,
"grad_norm": 29.144811630249023,
"learning_rate": 4.984596161153135e-07,
"logits/chosen": 0.2610534727573395,
"logits/rejected": 0.17740146815776825,
"logps/chosen": -43.72743225097656,
"logps/ref_chosen": -41.191436767578125,
"logps/ref_rejected": -85.44769287109375,
"logps/rejected": -89.39289855957031,
"loss": 1.267,
"margin_dpo/margin_mean": 1.409203290939331,
"margin_dpo/margin_std": 2.509359359741211,
"step": 89
},
{
"epoch": 0.1360544217687075,
"grad_norm": 29.26353645324707,
"learning_rate": 4.983095894354857e-07,
"logits/chosen": 0.18076924979686737,
"logits/rejected": 0.12345144152641296,
"logps/chosen": -59.70683670043945,
"logps/ref_chosen": -56.58390808105469,
"logps/ref_rejected": -86.86978149414062,
"logps/rejected": -91.25166320800781,
"loss": 1.2798,
"margin_dpo/margin_mean": 1.2589483261108398,
"margin_dpo/margin_std": 2.442626476287842,
"step": 90
},
{
"epoch": 0.13756613756613756,
"grad_norm": 25.777061462402344,
"learning_rate": 4.98152617002662e-07,
"logits/chosen": 0.17539820075035095,
"logits/rejected": 0.13064050674438477,
"logps/chosen": -55.531593322753906,
"logps/ref_chosen": -52.38234329223633,
"logps/ref_rejected": -72.17642211914062,
"logps/rejected": -76.45957946777344,
"loss": 1.2953,
"margin_dpo/margin_mean": 1.1339049339294434,
"margin_dpo/margin_std": 2.7821226119995117,
"step": 91
},
{
"epoch": 0.13907785336356765,
"grad_norm": 27.002519607543945,
"learning_rate": 4.979887032076988e-07,
"logits/chosen": 0.20402228832244873,
"logits/rejected": 0.16468676924705505,
"logps/chosen": -56.24298095703125,
"logps/ref_chosen": -53.00870132446289,
"logps/ref_rejected": -79.77813720703125,
"logps/rejected": -84.57749938964844,
"loss": 1.2601,
"margin_dpo/margin_mean": 1.5650835037231445,
"margin_dpo/margin_std": 3.127380847930908,
"step": 92
},
{
"epoch": 0.14058956916099774,
"grad_norm": 24.539724349975586,
"learning_rate": 4.978178526356172e-07,
"logits/chosen": 0.18196739256381989,
"logits/rejected": 0.1539272964000702,
"logps/chosen": -48.484527587890625,
"logps/ref_chosen": -44.90705108642578,
"logps/ref_rejected": -58.7879524230957,
"logps/rejected": -63.36863708496094,
"loss": 1.3137,
"margin_dpo/margin_mean": 1.0032141208648682,
"margin_dpo/margin_std": 3.2164130210876465,
"step": 93
},
{
"epoch": 0.1421012849584278,
"grad_norm": 27.276403427124023,
"learning_rate": 4.976400700654751e-07,
"logits/chosen": 0.2772451341152191,
"logits/rejected": 0.23560284078121185,
"logps/chosen": -63.25147247314453,
"logps/ref_chosen": -59.93777084350586,
"logps/ref_rejected": -79.3138427734375,
"logps/rejected": -84.50923156738281,
"loss": 1.2464,
"margin_dpo/margin_mean": 1.8816919326782227,
"margin_dpo/margin_std": 4.022270202636719,
"step": 94
},
{
"epoch": 0.1436130007558579,
"grad_norm": 29.317035675048828,
"learning_rate": 4.974553604702332e-07,
"logits/chosen": 0.15013039112091064,
"logits/rejected": 0.0862259641289711,
"logps/chosen": -64.44532775878906,
"logps/ref_chosen": -60.168487548828125,
"logps/ref_rejected": -90.73665618896484,
"logps/rejected": -96.60997009277344,
"loss": 1.2614,
"margin_dpo/margin_mean": 1.5964728593826294,
"margin_dpo/margin_std": 3.3925909996032715,
"step": 95
},
{
"epoch": 0.14512471655328799,
"grad_norm": 27.277469635009766,
"learning_rate": 4.972637290166157e-07,
"logits/chosen": 0.20983225107192993,
"logits/rejected": 0.1660861372947693,
"logps/chosen": -64.80738830566406,
"logps/ref_chosen": -60.66877746582031,
"logps/ref_rejected": -88.30673217773438,
"logps/rejected": -94.34305572509766,
"loss": 1.256,
"margin_dpo/margin_mean": 1.8977141380310059,
"margin_dpo/margin_std": 4.532830238342285,
"step": 96
},
{
"epoch": 0.14663643235071808,
"grad_norm": 36.27639389038086,
"learning_rate": 4.970651810649666e-07,
"logits/chosen": 0.14166215062141418,
"logits/rejected": 0.09780453145503998,
"logps/chosen": -70.00448608398438,
"logps/ref_chosen": -65.04412841796875,
"logps/ref_rejected": -78.42092895507812,
"logps/rejected": -84.42875671386719,
"loss": 1.3444,
"margin_dpo/margin_mean": 1.0474696159362793,
"margin_dpo/margin_std": 4.991357803344727,
"step": 97
},
{
"epoch": 0.14814814814814814,
"grad_norm": 30.31495475769043,
"learning_rate": 4.968597221690985e-07,
"logits/chosen": 0.22561600804328918,
"logits/rejected": 0.1984190046787262,
"logps/chosen": -60.15273666381836,
"logps/ref_chosen": -55.503231048583984,
"logps/ref_rejected": -72.81553649902344,
"logps/rejected": -78.14527893066406,
"loss": 1.3466,
"margin_dpo/margin_mean": 0.680237352848053,
"margin_dpo/margin_std": 3.2830731868743896,
"step": 98
},
{
"epoch": 0.14965986394557823,
"grad_norm": 32.687068939208984,
"learning_rate": 4.966473580761389e-07,
"logits/chosen": 0.24951310455799103,
"logits/rejected": 0.20980459451675415,
"logps/chosen": -63.30138397216797,
"logps/ref_chosen": -58.57563781738281,
"logps/ref_rejected": -78.69361114501953,
"logps/rejected": -84.39778137207031,
"loss": 1.3425,
"margin_dpo/margin_mean": 0.9784270524978638,
"margin_dpo/margin_std": 4.579858779907227,
"step": 99
},
{
"epoch": 0.15117157974300832,
"grad_norm": 33.35084533691406,
"learning_rate": 4.964280947263676e-07,
"logits/chosen": 0.22355619072914124,
"logits/rejected": 0.21577070653438568,
"logps/chosen": -84.90021514892578,
"logps/ref_chosen": -79.58343505859375,
"logps/ref_rejected": -92.152587890625,
"logps/rejected": -98.91537475585938,
"loss": 1.3342,
"margin_dpo/margin_mean": 1.4460134506225586,
"margin_dpo/margin_std": 5.88123893737793,
"step": 100
},
{
"epoch": 0.15117157974300832,
"eval_logits/chosen": 0.2556447982788086,
"eval_logits/rejected": 0.21831558644771576,
"eval_logps/chosen": -79.70143127441406,
"eval_logps/ref_chosen": -74.85946655273438,
"eval_logps/ref_rejected": -79.54898834228516,
"eval_logps/rejected": -85.81148529052734,
"eval_loss": 0.6557220816612244,
"eval_margin_dpo/margin_mean": 1.4205234050750732,
"eval_margin_dpo/margin_std": 4.978596210479736,
"eval_runtime": 38.9596,
"eval_samples_per_second": 59.113,
"eval_steps_per_second": 1.848,
"step": 100
},
{
"epoch": 0.15268329554043839,
"grad_norm": 25.8350772857666,
"learning_rate": 4.96201938253052e-07,
"logits/chosen": 0.24167990684509277,
"logits/rejected": 0.20206353068351746,
"logps/chosen": -56.619720458984375,
"logps/ref_chosen": -52.332786560058594,
"logps/ref_rejected": -69.55589294433594,
"logps/rejected": -75.89532470703125,
"loss": 1.2349,
"margin_dpo/margin_mean": 2.0524988174438477,
"margin_dpo/margin_std": 4.153055667877197,
"step": 101
},
{
"epoch": 0.15419501133786848,
"grad_norm": 32.661865234375,
"learning_rate": 4.959688949822748e-07,
"logits/chosen": 0.16603882610797882,
"logits/rejected": 0.12703979015350342,
"logps/chosen": -70.07479858398438,
"logps/ref_chosen": -64.74348449707031,
"logps/ref_rejected": -69.06133270263672,
"logps/rejected": -75.20350646972656,
"loss": 1.3772,
"margin_dpo/margin_mean": 0.8108617067337036,
"margin_dpo/margin_std": 5.474085807800293,
"step": 102
},
{
"epoch": 0.15570672713529857,
"grad_norm": 29.438844680786133,
"learning_rate": 4.957289714327572e-07,
"logits/chosen": 0.2649979591369629,
"logits/rejected": 0.23107215762138367,
"logps/chosen": -68.64143371582031,
"logps/ref_chosen": -63.836647033691406,
"logps/ref_rejected": -79.3236312866211,
"logps/rejected": -85.67254638671875,
"loss": 1.2717,
"margin_dpo/margin_mean": 1.5441327095031738,
"margin_dpo/margin_std": 3.6750550270080566,
"step": 103
},
{
"epoch": 0.15721844293272866,
"grad_norm": 30.968130111694336,
"learning_rate": 4.954821743156767e-07,
"logits/chosen": 0.2689138352870941,
"logits/rejected": 0.18012598156929016,
"logps/chosen": -65.74430847167969,
"logps/ref_chosen": -60.99920654296875,
"logps/ref_rejected": -98.8464584350586,
"logps/rejected": -105.85238647460938,
"loss": 1.2669,
"margin_dpo/margin_mean": 2.2608296871185303,
"margin_dpo/margin_std": 6.357419013977051,
"step": 104
},
{
"epoch": 0.15873015873015872,
"grad_norm": 30.747453689575195,
"learning_rate": 4.952285105344791e-07,
"logits/chosen": 0.21757668256759644,
"logits/rejected": 0.16157402098178864,
"logps/chosen": -76.26542663574219,
"logps/ref_chosen": -70.95027160644531,
"logps/ref_rejected": -87.88340759277344,
"logps/rejected": -95.04217529296875,
"loss": 1.27,
"margin_dpo/margin_mean": 1.8436135053634644,
"margin_dpo/margin_std": 4.962711334228516,
"step": 105
},
{
"epoch": 0.1602418745275888,
"grad_norm": 30.977380752563477,
"learning_rate": 4.949679871846857e-07,
"logits/chosen": 0.26043418049812317,
"logits/rejected": 0.245887890458107,
"logps/chosen": -67.49542236328125,
"logps/ref_chosen": -62.45933151245117,
"logps/ref_rejected": -67.00595092773438,
"logps/rejected": -73.60855102539062,
"loss": 1.3035,
"margin_dpo/margin_mean": 1.566506266593933,
"margin_dpo/margin_std": 5.295645713806152,
"step": 106
},
{
"epoch": 0.1617535903250189,
"grad_norm": 42.034934997558594,
"learning_rate": 4.947006115536947e-07,
"logits/chosen": 0.198299378156662,
"logits/rejected": 0.1734483540058136,
"logps/chosen": -82.38461303710938,
"logps/ref_chosen": -75.83796691894531,
"logps/ref_rejected": -87.74038696289062,
"logps/rejected": -95.11250305175781,
"loss": 1.3998,
"margin_dpo/margin_mean": 0.8254714012145996,
"margin_dpo/margin_std": 6.266752243041992,
"step": 107
},
{
"epoch": 0.16326530612244897,
"grad_norm": 28.688661575317383,
"learning_rate": 4.944263911205772e-07,
"logits/chosen": 0.20003950595855713,
"logits/rejected": 0.16793784499168396,
"logps/chosen": -73.90744018554688,
"logps/ref_chosen": -68.39323425292969,
"logps/ref_rejected": -83.24267578125,
"logps/rejected": -90.59925842285156,
"loss": 1.2797,
"margin_dpo/margin_mean": 1.842378854751587,
"margin_dpo/margin_std": 5.342957496643066,
"step": 108
},
{
"epoch": 0.16477702191987906,
"grad_norm": 28.382535934448242,
"learning_rate": 4.941453335558681e-07,
"logits/chosen": 0.18867141008377075,
"logits/rejected": 0.13436651229858398,
"logps/chosen": -60.398643493652344,
"logps/ref_chosen": -55.52748107910156,
"logps/ref_rejected": -83.55218505859375,
"logps/rejected": -91.43389129638672,
"loss": 1.1746,
"margin_dpo/margin_mean": 3.0105397701263428,
"margin_dpo/margin_std": 5.49082088470459,
"step": 109
},
{
"epoch": 0.16628873771730915,
"grad_norm": 44.29096984863281,
"learning_rate": 4.938574467213517e-07,
"logits/chosen": 0.16931723058223724,
"logits/rejected": 0.17854322493076324,
"logps/chosen": -87.40373229980469,
"logps/ref_chosen": -81.15874481201172,
"logps/ref_rejected": -72.56021118164062,
"logps/rejected": -79.09624481201172,
"loss": 1.4275,
"margin_dpo/margin_mean": 0.2910418212413788,
"margin_dpo/margin_std": 5.465949058532715,
"step": 110
},
{
"epoch": 0.16780045351473924,
"grad_norm": 25.80181884765625,
"learning_rate": 4.935627386698418e-07,
"logits/chosen": 0.3088276982307434,
"logits/rejected": 0.2699512839317322,
"logps/chosen": -58.064239501953125,
"logps/ref_chosen": -52.358985900878906,
"logps/ref_rejected": -77.06150817871094,
"logps/rejected": -84.72740936279297,
"loss": 1.2645,
"margin_dpo/margin_mean": 1.9606516361236572,
"margin_dpo/margin_std": 5.206435203552246,
"step": 111
},
{
"epoch": 0.1693121693121693,
"grad_norm": 32.21335220336914,
"learning_rate": 4.932612176449559e-07,
"logits/chosen": 0.19986756145954132,
"logits/rejected": 0.13390694558620453,
"logps/chosen": -68.38388061523438,
"logps/ref_chosen": -63.02006912231445,
"logps/ref_rejected": -111.36941528320312,
"logps/rejected": -119.13533020019531,
"loss": 1.2146,
"margin_dpo/margin_mean": 2.4020986557006836,
"margin_dpo/margin_std": 4.748931884765625,
"step": 112
},
{
"epoch": 0.1708238851095994,
"grad_norm": 35.52518844604492,
"learning_rate": 4.929528920808854e-07,
"logits/chosen": 0.1938510537147522,
"logits/rejected": 0.15514397621154785,
"logps/chosen": -61.15318298339844,
"logps/ref_chosen": -55.80766296386719,
"logps/ref_rejected": -69.84014129638672,
"logps/rejected": -76.59230041503906,
"loss": 1.3038,
"margin_dpo/margin_mean": 1.406644344329834,
"margin_dpo/margin_std": 4.629127502441406,
"step": 113
},
{
"epoch": 0.17233560090702948,
"grad_norm": 29.548250198364258,
"learning_rate": 4.92637770602159e-07,
"logits/chosen": 0.23794038593769073,
"logits/rejected": 0.1779293566942215,
"logps/chosen": -71.40888977050781,
"logps/ref_chosen": -66.33277130126953,
"logps/ref_rejected": -71.61489868164062,
"logps/rejected": -79.04914855957031,
"loss": 1.2177,
"margin_dpo/margin_mean": 2.3581337928771973,
"margin_dpo/margin_std": 4.735161781311035,
"step": 114
},
{
"epoch": 0.17384731670445955,
"grad_norm": 28.59156608581543,
"learning_rate": 4.923158620234019e-07,
"logits/chosen": 0.24258871376514435,
"logits/rejected": 0.17910319566726685,
"logps/chosen": -61.24739074707031,
"logps/ref_chosen": -55.74903869628906,
"logps/ref_rejected": -79.59849548339844,
"logps/rejected": -86.9781494140625,
"loss": 1.2531,
"margin_dpo/margin_mean": 1.8812994956970215,
"margin_dpo/margin_std": 4.353668689727783,
"step": 115
},
{
"epoch": 0.17535903250188964,
"grad_norm": 25.978452682495117,
"learning_rate": 4.91987175349089e-07,
"logits/chosen": 0.2222248762845993,
"logits/rejected": 0.1548064798116684,
"logps/chosen": -54.570945739746094,
"logps/ref_chosen": -49.365169525146484,
"logps/ref_rejected": -72.84671020507812,
"logps/rejected": -80.85499572753906,
"loss": 1.1655,
"margin_dpo/margin_mean": 2.8025131225585938,
"margin_dpo/margin_std": 4.0062384605407715,
"step": 116
},
{
"epoch": 0.17687074829931973,
"grad_norm": 28.063459396362305,
"learning_rate": 4.916517197732933e-07,
"logits/chosen": 0.228042870759964,
"logits/rejected": 0.1916525959968567,
"logps/chosen": -62.87383270263672,
"logps/ref_chosen": -57.710899353027344,
"logps/ref_rejected": -69.77254486083984,
"logps/rejected": -76.8323974609375,
"loss": 1.2566,
"margin_dpo/margin_mean": 1.8969154357910156,
"margin_dpo/margin_std": 4.350034713745117,
"step": 117
},
{
"epoch": 0.17838246409674982,
"grad_norm": 27.701557159423828,
"learning_rate": 4.913095046794281e-07,
"logits/chosen": 0.29179418087005615,
"logits/rejected": 0.25057122111320496,
"logps/chosen": -57.449615478515625,
"logps/ref_chosen": -52.479896545410156,
"logps/ref_rejected": -81.35912322998047,
"logps/rejected": -88.98104858398438,
"loss": 1.1754,
"margin_dpo/margin_mean": 2.6522061824798584,
"margin_dpo/margin_std": 3.8634443283081055,
"step": 118
},
{
"epoch": 0.17989417989417988,
"grad_norm": 29.29024314880371,
"learning_rate": 4.909605396399855e-07,
"logits/chosen": 0.20849823951721191,
"logits/rejected": 0.16951939463615417,
"logps/chosen": -67.72459411621094,
"logps/ref_chosen": -61.35767364501953,
"logps/ref_rejected": -75.71510314941406,
"logps/rejected": -83.93721771240234,
"loss": 1.2795,
"margin_dpo/margin_mean": 1.855197787284851,
"margin_dpo/margin_std": 5.46286678314209,
"step": 119
},
{
"epoch": 0.18140589569160998,
"grad_norm": 28.82642364501953,
"learning_rate": 4.906048344162676e-07,
"logits/chosen": 0.20952869951725006,
"logits/rejected": 0.14921404421329498,
"logps/chosen": -65.2104263305664,
"logps/ref_chosen": -59.907569885253906,
"logps/ref_rejected": -79.6910629272461,
"logps/rejected": -87.75076293945312,
"loss": 1.1657,
"margin_dpo/margin_mean": 2.7568416595458984,
"margin_dpo/margin_std": 3.812615394592285,
"step": 120
},
{
"epoch": 0.18291761148904007,
"grad_norm": 26.929418563842773,
"learning_rate": 4.902423989581143e-07,
"logits/chosen": 0.34039121866226196,
"logits/rejected": 0.25128114223480225,
"logps/chosen": -61.47722244262695,
"logps/ref_chosen": -55.666046142578125,
"logps/ref_rejected": -101.56233978271484,
"logps/rejected": -110.37584686279297,
"loss": 1.1653,
"margin_dpo/margin_mean": 3.0023269653320312,
"margin_dpo/margin_std": 5.023721694946289,
"step": 121
},
{
"epoch": 0.18442932728647016,
"grad_norm": 30.553749084472656,
"learning_rate": 4.898732434036243e-07,
"logits/chosen": 0.21363535523414612,
"logits/rejected": 0.17875628173351288,
"logps/chosen": -69.63571166992188,
"logps/ref_chosen": -63.334373474121094,
"logps/ref_rejected": -73.67523193359375,
"logps/rejected": -82.67070007324219,
"loss": 1.2097,
"margin_dpo/margin_mean": 2.6941189765930176,
"margin_dpo/margin_std": 5.76659631729126,
"step": 122
},
{
"epoch": 0.18594104308390022,
"grad_norm": 27.00295066833496,
"learning_rate": 4.894973780788722e-07,
"logits/chosen": 0.2702626883983612,
"logits/rejected": 0.22773674130439758,
"logps/chosen": -62.96525955200195,
"logps/ref_chosen": -56.89874267578125,
"logps/ref_rejected": -78.97029113769531,
"logps/rejected": -87.6001205444336,
"loss": 1.1931,
"margin_dpo/margin_mean": 2.5633208751678467,
"margin_dpo/margin_std": 4.423187732696533,
"step": 123
},
{
"epoch": 0.1874527588813303,
"grad_norm": 27.339513778686523,
"learning_rate": 4.89114813497619e-07,
"logits/chosen": 0.25164029002189636,
"logits/rejected": 0.1909327208995819,
"logps/chosen": -63.49843215942383,
"logps/ref_chosen": -57.116085052490234,
"logps/ref_rejected": -87.93074035644531,
"logps/rejected": -97.9527587890625,
"loss": 1.1111,
"margin_dpo/margin_mean": 3.63966703414917,
"margin_dpo/margin_std": 4.890292167663574,
"step": 124
},
{
"epoch": 0.1889644746787604,
"grad_norm": 29.097652435302734,
"learning_rate": 4.887255603610184e-07,
"logits/chosen": 0.2958196997642517,
"logits/rejected": 0.23451215028762817,
"logps/chosen": -72.66911315917969,
"logps/ref_chosen": -65.7061767578125,
"logps/ref_rejected": -91.72711944580078,
"logps/rejected": -101.81961059570312,
"loss": 1.1585,
"margin_dpo/margin_mean": 3.129549980163574,
"margin_dpo/margin_std": 5.042257308959961,
"step": 125
},
{
"epoch": 0.19047619047619047,
"grad_norm": 28.814516067504883,
"learning_rate": 4.883296295573176e-07,
"logits/chosen": 0.12805432081222534,
"logits/rejected": 0.1223287507891655,
"logps/chosen": -74.755126953125,
"logps/ref_chosen": -68.17608642578125,
"logps/ref_rejected": -65.1175537109375,
"logps/rejected": -73.43314361572266,
"loss": 1.2741,
"margin_dpo/margin_mean": 1.7365505695343018,
"margin_dpo/margin_std": 4.742924690246582,
"step": 126
},
{
"epoch": 0.19198790627362056,
"grad_norm": 26.409942626953125,
"learning_rate": 4.87927032161552e-07,
"logits/chosen": 0.22645384073257446,
"logits/rejected": 0.19578759372234344,
"logps/chosen": -68.62709045410156,
"logps/ref_chosen": -61.88023376464844,
"logps/ref_rejected": -68.46012878417969,
"logps/rejected": -78.52706146240234,
"loss": 1.1273,
"margin_dpo/margin_mean": 3.3200788497924805,
"margin_dpo/margin_std": 4.245710372924805,
"step": 127
},
{
"epoch": 0.19349962207105065,
"grad_norm": 27.905487060546875,
"learning_rate": 4.875177794352363e-07,
"logits/chosen": 0.25532764196395874,
"logits/rejected": 0.19668430089950562,
"logps/chosen": -74.02288818359375,
"logps/ref_chosen": -66.708984375,
"logps/ref_rejected": -94.97969055175781,
"logps/rejected": -105.91804504394531,
"loss": 1.1543,
"margin_dpo/margin_mean": 3.6244349479675293,
"margin_dpo/margin_std": 6.532609939575195,
"step": 128
},
{
"epoch": 0.19501133786848074,
"grad_norm": 34.52588653564453,
"learning_rate": 4.871018828260491e-07,
"logits/chosen": 0.2155529260635376,
"logits/rejected": 0.2097143977880478,
"logps/chosen": -73.59596252441406,
"logps/ref_chosen": -65.33882904052734,
"logps/ref_rejected": -68.06109619140625,
"logps/rejected": -78.3628921508789,
"loss": 1.2761,
"margin_dpo/margin_mean": 2.044658660888672,
"margin_dpo/margin_std": 5.964291572570801,
"step": 129
},
{
"epoch": 0.1965230536659108,
"grad_norm": 30.163326263427734,
"learning_rate": 4.866793539675126e-07,
"logits/chosen": 0.17354336380958557,
"logits/rejected": 0.12331511080265045,
"logps/chosen": -66.47359466552734,
"logps/ref_chosen": -58.660743713378906,
"logps/ref_rejected": -79.24510192871094,
"logps/rejected": -90.50035095214844,
"loss": 1.1303,
"margin_dpo/margin_mean": 3.44240140914917,
"margin_dpo/margin_std": 4.8902740478515625,
"step": 130
},
{
"epoch": 0.1980347694633409,
"grad_norm": 25.82981300354004,
"learning_rate": 4.86250204678667e-07,
"logits/chosen": 0.20545755326747894,
"logits/rejected": 0.14028334617614746,
"logps/chosen": -59.99113464355469,
"logps/ref_chosen": -52.51454162597656,
"logps/ref_rejected": -85.18299865722656,
"logps/rejected": -97.01617431640625,
"loss": 1.0927,
"margin_dpo/margin_mean": 4.3565826416015625,
"margin_dpo/margin_std": 6.3896894454956055,
"step": 131
},
{
"epoch": 0.19954648526077098,
"grad_norm": 27.41193962097168,
"learning_rate": 4.858144469637408e-07,
"logits/chosen": 0.275441974401474,
"logits/rejected": 0.24331185221672058,
"logps/chosen": -73.9671859741211,
"logps/ref_chosen": -65.68513488769531,
"logps/ref_rejected": -69.54120635986328,
"logps/rejected": -81.4193344116211,
"loss": 1.1275,
"margin_dpo/margin_mean": 3.5960710048675537,
"margin_dpo/margin_std": 5.342451095581055,
"step": 132
},
{
"epoch": 0.20105820105820105,
"grad_norm": 31.791473388671875,
"learning_rate": 4.853720930118138e-07,
"logits/chosen": 0.219748854637146,
"logits/rejected": 0.20972420275211334,
"logps/chosen": -72.35598754882812,
"logps/ref_chosen": -63.598114013671875,
"logps/ref_rejected": -73.72798156738281,
"logps/rejected": -85.32035827636719,
"loss": 1.2049,
"margin_dpo/margin_mean": 2.83449649810791,
"margin_dpo/margin_std": 5.86693000793457,
"step": 133
},
{
"epoch": 0.20256991685563114,
"grad_norm": 25.357341766357422,
"learning_rate": 4.849231551964771e-07,
"logits/chosen": 0.31225156784057617,
"logits/rejected": 0.2546500563621521,
"logps/chosen": -61.913734436035156,
"logps/ref_chosen": -53.79457092285156,
"logps/ref_rejected": -74.16741943359375,
"logps/rejected": -87.17252349853516,
"loss": 1.0432,
"margin_dpo/margin_mean": 4.8859405517578125,
"margin_dpo/margin_std": 6.1819233894348145,
"step": 134
},
{
"epoch": 0.20408163265306123,
"grad_norm": 25.44196128845215,
"learning_rate": 4.844676460754862e-07,
"logits/chosen": 0.2653396427631378,
"logits/rejected": 0.23224963247776031,
"logps/chosen": -57.6514892578125,
"logps/ref_chosen": -49.441078186035156,
"logps/ref_rejected": -65.96878051757812,
"logps/rejected": -77.29350280761719,
"loss": 1.1794,
"margin_dpo/margin_mean": 3.1143112182617188,
"margin_dpo/margin_std": 5.866373062133789,
"step": 135
},
{
"epoch": 0.20559334845049132,
"grad_norm": 31.02347755432129,
"learning_rate": 4.840055783904106e-07,
"logits/chosen": 0.23869748413562775,
"logits/rejected": 0.16473013162612915,
"logps/chosen": -76.93392944335938,
"logps/ref_chosen": -66.75926208496094,
"logps/ref_rejected": -94.61787414550781,
"logps/rejected": -108.9141845703125,
"loss": 1.1644,
"margin_dpo/margin_mean": 4.121640205383301,
"margin_dpo/margin_std": 8.036227226257324,
"step": 136
},
{
"epoch": 0.20710506424792138,
"grad_norm": 26.123674392700195,
"learning_rate": 4.835369650662767e-07,
"logits/chosen": 0.26602545380592346,
"logits/rejected": 0.23649254441261292,
"logps/chosen": -66.29134368896484,
"logps/ref_chosen": -56.78379821777344,
"logps/ref_rejected": -69.89952087402344,
"logps/rejected": -83.55348205566406,
"loss": 1.1089,
"margin_dpo/margin_mean": 4.146416664123535,
"margin_dpo/margin_std": 6.458062171936035,
"step": 137
},
{
"epoch": 0.20861678004535147,
"grad_norm": 27.785795211791992,
"learning_rate": 4.830618192112065e-07,
"logits/chosen": 0.28734517097473145,
"logits/rejected": 0.2500568628311157,
"logps/chosen": -70.0023422241211,
"logps/ref_chosen": -58.766014099121094,
"logps/ref_rejected": -68.12371826171875,
"logps/rejected": -83.01116180419922,
"loss": 1.1753,
"margin_dpo/margin_mean": 3.6511096954345703,
"margin_dpo/margin_std": 7.198888778686523,
"step": 138
},
{
"epoch": 0.21012849584278157,
"grad_norm": 30.565093994140625,
"learning_rate": 4.825801541160509e-07,
"logits/chosen": 0.21461869776248932,
"logits/rejected": 0.18480184674263,
"logps/chosen": -83.13623046875,
"logps/ref_chosen": -71.2255859375,
"logps/ref_rejected": -82.1834716796875,
"logps/rejected": -98.353759765625,
"loss": 1.0982,
"margin_dpo/margin_mean": 4.259641170501709,
"margin_dpo/margin_std": 6.333117485046387,
"step": 139
},
{
"epoch": 0.21164021164021163,
"grad_norm": 31.754594802856445,
"learning_rate": 4.820919832540181e-07,
"logits/chosen": 0.24357524514198303,
"logits/rejected": 0.19461680948734283,
"logps/chosen": -74.13949584960938,
"logps/ref_chosen": -63.27766418457031,
"logps/ref_rejected": -83.30647277832031,
"logps/rejected": -100.13687133789062,
"loss": 1.0653,
"margin_dpo/margin_mean": 5.968560218811035,
"margin_dpo/margin_std": 9.302078247070312,
"step": 140
},
{
"epoch": 0.21315192743764172,
"grad_norm": 29.929162979125977,
"learning_rate": 4.815973202802966e-07,
"logits/chosen": 0.2960834503173828,
"logits/rejected": 0.24834388494491577,
"logps/chosen": -73.89019012451172,
"logps/ref_chosen": -61.76676940917969,
"logps/ref_rejected": -88.60601806640625,
"logps/rejected": -106.03081512451172,
"loss": 1.0978,
"margin_dpo/margin_mean": 5.30136775970459,
"margin_dpo/margin_std": 8.73034954071045,
"step": 141
},
{
"epoch": 0.2146636432350718,
"grad_norm": 27.8828067779541,
"learning_rate": 4.810961790316729e-07,
"logits/chosen": 0.2557069659233093,
"logits/rejected": 0.2308621108531952,
"logps/chosen": -77.07974243164062,
"logps/ref_chosen": -65.2747802734375,
"logps/ref_rejected": -81.1378173828125,
"logps/rejected": -97.32429504394531,
"loss": 1.101,
"margin_dpo/margin_mean": 4.381509304046631,
"margin_dpo/margin_std": 6.8174028396606445,
"step": 142
},
{
"epoch": 0.2161753590325019,
"grad_norm": 39.272682189941406,
"learning_rate": 4.805885735261454e-07,
"logits/chosen": 0.287389874458313,
"logits/rejected": 0.2696545124053955,
"logps/chosen": -75.27376556396484,
"logps/ref_chosen": -62.61782455444336,
"logps/ref_rejected": -70.39239501953125,
"logps/rejected": -86.78271484375,
"loss": 1.2447,
"margin_dpo/margin_mean": 3.7343828678131104,
"margin_dpo/margin_std": 9.431318283081055,
"step": 143
},
{
"epoch": 0.21768707482993196,
"grad_norm": 30.782737731933594,
"learning_rate": 4.800745179625307e-07,
"logits/chosen": 0.29217907786369324,
"logits/rejected": 0.25874489545822144,
"logps/chosen": -73.63032531738281,
"logps/ref_chosen": -60.80268859863281,
"logps/ref_rejected": -79.07284545898438,
"logps/rejected": -96.77928161621094,
"loss": 1.1405,
"margin_dpo/margin_mean": 4.878805637359619,
"margin_dpo/margin_std": 9.18574333190918,
"step": 144
},
{
"epoch": 0.21919879062736206,
"grad_norm": 34.85631561279297,
"learning_rate": 4.795540267200686e-07,
"logits/chosen": 0.23453988134860992,
"logits/rejected": 0.2504653036594391,
"logps/chosen": -86.54104614257812,
"logps/ref_chosen": -74.61146545410156,
"logps/ref_rejected": -83.24461364746094,
"logps/rejected": -100.37863159179688,
"loss": 1.1346,
"margin_dpo/margin_mean": 5.204441547393799,
"margin_dpo/margin_std": 9.736194610595703,
"step": 145
},
{
"epoch": 0.22071050642479215,
"grad_norm": 30.479814529418945,
"learning_rate": 4.790271143580173e-07,
"logits/chosen": 0.2428048700094223,
"logits/rejected": 0.22861063480377197,
"logps/chosen": -69.46223449707031,
"logps/ref_chosen": -57.84098434448242,
"logps/ref_rejected": -67.47422790527344,
"logps/rejected": -84.34223937988281,
"loss": 1.0977,
"margin_dpo/margin_mean": 5.246753215789795,
"margin_dpo/margin_std": 8.723588943481445,
"step": 146
},
{
"epoch": 0.2222222222222222,
"grad_norm": 36.99082565307617,
"learning_rate": 4.784937956152489e-07,
"logits/chosen": 0.2565242350101471,
"logits/rejected": 0.2118988335132599,
"logps/chosen": -80.17362976074219,
"logps/ref_chosen": -66.8134765625,
"logps/ref_rejected": -81.1796875,
"logps/rejected": -98.7314224243164,
"loss": 1.1908,
"margin_dpo/margin_mean": 4.191573619842529,
"margin_dpo/margin_std": 9.052894592285156,
"step": 147
},
{
"epoch": 0.2237339380196523,
"grad_norm": 25.193017959594727,
"learning_rate": 4.779540854098347e-07,
"logits/chosen": 0.38067975640296936,
"logits/rejected": 0.3060184121131897,
"logps/chosen": -60.72514724731445,
"logps/ref_chosen": -48.68775177001953,
"logps/ref_rejected": -67.50503540039062,
"logps/rejected": -85.04951477050781,
"loss": 1.0795,
"margin_dpo/margin_mean": 5.507093906402588,
"margin_dpo/margin_std": 8.676559448242188,
"step": 148
},
{
"epoch": 0.2252456538170824,
"grad_norm": 27.02715492248535,
"learning_rate": 4.774079988386296e-07,
"logits/chosen": 0.24912810325622559,
"logits/rejected": 0.20178565382957458,
"logps/chosen": -69.67440795898438,
"logps/ref_chosen": -55.14377975463867,
"logps/ref_rejected": -64.79888916015625,
"logps/rejected": -85.38681030273438,
"loss": 1.0067,
"margin_dpo/margin_mean": 6.057290077209473,
"margin_dpo/margin_std": 7.906113624572754,
"step": 149
},
{
"epoch": 0.22675736961451248,
"grad_norm": 27.364370346069336,
"learning_rate": 4.768555511768486e-07,
"logits/chosen": 0.25342485308647156,
"logits/rejected": 0.20886380970478058,
"logps/chosen": -79.80754089355469,
"logps/ref_chosen": -67.47074890136719,
"logps/ref_rejected": -89.21170043945312,
"logps/rejected": -109.59925842285156,
"loss": 0.928,
"margin_dpo/margin_mean": 8.050765037536621,
"margin_dpo/margin_std": 9.564001083374023,
"step": 150
},
{
"epoch": 0.22826908541194255,
"grad_norm": 24.586162567138672,
"learning_rate": 4.762967578776406e-07,
"logits/chosen": 0.27967917919158936,
"logits/rejected": 0.22348374128341675,
"logps/chosen": -62.881961822509766,
"logps/ref_chosen": -52.45954132080078,
"logps/ref_rejected": -79.06301879882812,
"logps/rejected": -98.59596252441406,
"loss": 0.8551,
"margin_dpo/margin_mean": 9.110525131225586,
"margin_dpo/margin_std": 9.856712341308594,
"step": 151
},
{
"epoch": 0.22978080120937264,
"grad_norm": 30.191181182861328,
"learning_rate": 4.757316345716553e-07,
"logits/chosen": 0.3457234501838684,
"logits/rejected": 0.29061341285705566,
"logps/chosen": -69.90003967285156,
"logps/ref_chosen": -56.5538330078125,
"logps/ref_rejected": -76.55074310302734,
"logps/rejected": -95.70962524414062,
"loss": 1.126,
"margin_dpo/margin_mean": 5.8126726150512695,
"margin_dpo/margin_std": 10.448650360107422,
"step": 152
},
{
"epoch": 0.23129251700680273,
"grad_norm": 27.93670654296875,
"learning_rate": 4.751601970666064e-07,
"logits/chosen": 0.23497727513313293,
"logits/rejected": 0.1989767700433731,
"logps/chosen": -80.46448516845703,
"logps/ref_chosen": -68.00689697265625,
"logps/ref_rejected": -74.83482360839844,
"logps/rejected": -94.14203643798828,
"loss": 0.9868,
"margin_dpo/margin_mean": 6.849617958068848,
"margin_dpo/margin_std": 8.963647842407227,
"step": 153
},
{
"epoch": 0.2328042328042328,
"grad_norm": 31.00896453857422,
"learning_rate": 4.745824613468292e-07,
"logits/chosen": 0.32073622941970825,
"logits/rejected": 0.31813472509384155,
"logps/chosen": -72.668701171875,
"logps/ref_chosen": -59.222537994384766,
"logps/ref_rejected": -64.19132232666016,
"logps/rejected": -82.69126892089844,
"loss": 1.1691,
"margin_dpo/margin_mean": 5.05378532409668,
"margin_dpo/margin_std": 10.114835739135742,
"step": 154
},
{
"epoch": 0.23431594860166288,
"grad_norm": 31.16225242614746,
"learning_rate": 4.7399844357283393e-07,
"logits/chosen": 0.31456199288368225,
"logits/rejected": 0.2942940294742584,
"logps/chosen": -82.74867248535156,
"logps/ref_chosen": -68.45469665527344,
"logps/ref_rejected": -77.91763305664062,
"logps/rejected": -99.23988342285156,
"loss": 1.1028,
"margin_dpo/margin_mean": 7.028270721435547,
"margin_dpo/margin_std": 12.166418075561523,
"step": 155
},
{
"epoch": 0.23582766439909297,
"grad_norm": 27.807287216186523,
"learning_rate": 4.7340816008085305e-07,
"logits/chosen": 0.26738643646240234,
"logits/rejected": 0.22227174043655396,
"logps/chosen": -81.36599731445312,
"logps/ref_chosen": -67.26959991455078,
"logps/ref_rejected": -86.95914459228516,
"logps/rejected": -109.32891845703125,
"loss": 0.9253,
"margin_dpo/margin_mean": 8.273383140563965,
"margin_dpo/margin_std": 10.093043327331543,
"step": 156
},
{
"epoch": 0.23733938019652306,
"grad_norm": 30.04600715637207,
"learning_rate": 4.728116273823847e-07,
"logits/chosen": 0.2983720600605011,
"logits/rejected": 0.27970391511917114,
"logps/chosen": -67.21517181396484,
"logps/ref_chosen": -54.77287292480469,
"logps/ref_rejected": -63.87866973876953,
"logps/rejected": -82.85496520996094,
"loss": 1.0662,
"margin_dpo/margin_mean": 6.533998012542725,
"margin_dpo/margin_std": 9.998207092285156,
"step": 157
},
{
"epoch": 0.23885109599395313,
"grad_norm": 30.961042404174805,
"learning_rate": 4.7220886216373085e-07,
"logits/chosen": 0.3131396174430847,
"logits/rejected": 0.274808406829834,
"logps/chosen": -78.74344635009766,
"logps/ref_chosen": -64.92271423339844,
"logps/ref_rejected": -82.23789978027344,
"logps/rejected": -103.00406646728516,
"loss": 1.059,
"margin_dpo/margin_mean": 6.945442199707031,
"margin_dpo/margin_std": 11.043777465820312,
"step": 158
},
{
"epoch": 0.24036281179138322,
"grad_norm": 32.95261001586914,
"learning_rate": 4.715998812855304e-07,
"logits/chosen": 0.3162548840045929,
"logits/rejected": 0.27656984329223633,
"logps/chosen": -70.92852783203125,
"logps/ref_chosen": -57.04698944091797,
"logps/ref_rejected": -73.32441711425781,
"logps/rejected": -95.10652160644531,
"loss": 1.0111,
"margin_dpo/margin_mean": 7.900569915771484,
"margin_dpo/margin_std": 11.508578300476074,
"step": 159
},
{
"epoch": 0.2418745275888133,
"grad_norm": 29.758676528930664,
"learning_rate": 4.7098470178228755e-07,
"logits/chosen": 0.19076451659202576,
"logits/rejected": 0.14449487626552582,
"logps/chosen": -65.33329772949219,
"logps/ref_chosen": -49.806915283203125,
"logps/ref_rejected": -68.3370132446289,
"logps/rejected": -90.4979248046875,
"loss": 1.1061,
"margin_dpo/margin_mean": 6.634532928466797,
"margin_dpo/margin_std": 11.820215225219727,
"step": 160
},
{
"epoch": 0.24338624338624337,
"grad_norm": 28.17973518371582,
"learning_rate": 4.703633408618955e-07,
"logits/chosen": 0.35479670763015747,
"logits/rejected": 0.3140922486782074,
"logps/chosen": -67.69532012939453,
"logps/ref_chosen": -52.50048828125,
"logps/ref_rejected": -66.04540252685547,
"logps/rejected": -88.5848617553711,
"loss": 1.0352,
"margin_dpo/margin_mean": 7.344627857208252,
"margin_dpo/margin_std": 11.086959838867188,
"step": 161
},
{
"epoch": 0.24489795918367346,
"grad_norm": 30.024721145629883,
"learning_rate": 4.697358159051549e-07,
"logits/chosen": 0.3540419638156891,
"logits/rejected": 0.3033027648925781,
"logps/chosen": -85.53033447265625,
"logps/ref_chosen": -69.46919250488281,
"logps/ref_rejected": -92.00952911376953,
"logps/rejected": -117.64090728759766,
"loss": 0.8992,
"margin_dpo/margin_mean": 9.570234298706055,
"margin_dpo/margin_std": 11.462608337402344,
"step": 162
},
{
"epoch": 0.24640967498110355,
"grad_norm": 27.456750869750977,
"learning_rate": 4.691021444652876e-07,
"logits/chosen": 0.2802005112171173,
"logits/rejected": 0.2320232391357422,
"logps/chosen": -64.46339416503906,
"logps/ref_chosen": -50.613834381103516,
"logps/ref_rejected": -74.62033081054688,
"logps/rejected": -97.98037719726562,
"loss": 0.9264,
"margin_dpo/margin_mean": 9.510485649108887,
"margin_dpo/margin_std": 11.316560745239258,
"step": 163
},
{
"epoch": 0.24792139077853365,
"grad_norm": 27.12653350830078,
"learning_rate": 4.6846234426744624e-07,
"logits/chosen": 0.27665191888809204,
"logits/rejected": 0.21187984943389893,
"logps/chosen": -69.832275390625,
"logps/ref_chosen": -54.848114013671875,
"logps/ref_rejected": -79.0630111694336,
"logps/rejected": -103.15208435058594,
"loss": 0.9709,
"margin_dpo/margin_mean": 9.104915618896484,
"margin_dpo/margin_std": 12.152335166931152,
"step": 164
},
{
"epoch": 0.2494331065759637,
"grad_norm": 29.926708221435547,
"learning_rate": 4.678164332082175e-07,
"logits/chosen": 0.3327620327472687,
"logits/rejected": 0.2769315242767334,
"logps/chosen": -67.74002838134766,
"logps/ref_chosen": -51.089210510253906,
"logps/ref_rejected": -71.23370361328125,
"logps/rejected": -96.0595703125,
"loss": 1.0398,
"margin_dpo/margin_mean": 8.175054550170898,
"margin_dpo/margin_std": 12.092546463012695,
"step": 165
},
{
"epoch": 0.2509448223733938,
"grad_norm": 33.68288803100586,
"learning_rate": 4.6716442935512214e-07,
"logits/chosen": 0.3111993670463562,
"logits/rejected": 0.216147780418396,
"logps/chosen": -79.06806182861328,
"logps/ref_chosen": -63.19081115722656,
"logps/ref_rejected": -93.8402099609375,
"logps/rejected": -116.00359344482422,
"loss": 1.0843,
"margin_dpo/margin_mean": 6.286128997802734,
"margin_dpo/margin_std": 10.556337356567383,
"step": 166
},
{
"epoch": 0.25245653817082386,
"grad_norm": 27.461366653442383,
"learning_rate": 4.6650635094610966e-07,
"logits/chosen": 0.2629314363002777,
"logits/rejected": 0.22724226117134094,
"logps/chosen": -72.44942474365234,
"logps/ref_chosen": -58.92427062988281,
"logps/ref_rejected": -72.97377014160156,
"logps/rejected": -96.02169036865234,
"loss": 0.8741,
"margin_dpo/margin_mean": 9.522764205932617,
"margin_dpo/margin_std": 10.78989028930664,
"step": 167
},
{
"epoch": 0.25396825396825395,
"grad_norm": 30.712583541870117,
"learning_rate": 4.6584221638904767e-07,
"logits/chosen": 0.24128466844558716,
"logits/rejected": 0.20569536089897156,
"logps/chosen": -81.29560089111328,
"logps/ref_chosen": -65.65138244628906,
"logps/ref_rejected": -79.71418762207031,
"logps/rejected": -102.78042602539062,
"loss": 1.0124,
"margin_dpo/margin_mean": 7.422019004821777,
"margin_dpo/margin_std": 10.804079055786133,
"step": 168
},
{
"epoch": 0.25547996976568405,
"grad_norm": 34.614925384521484,
"learning_rate": 4.651720442612075e-07,
"logits/chosen": 0.3499833941459656,
"logits/rejected": 0.31476855278015137,
"logps/chosen": -74.23861694335938,
"logps/ref_chosen": -61.425865173339844,
"logps/ref_rejected": -76.09590148925781,
"logps/rejected": -97.2747802734375,
"loss": 1.0498,
"margin_dpo/margin_mean": 8.366128921508789,
"margin_dpo/margin_std": 13.607866287231445,
"step": 169
},
{
"epoch": 0.25699168556311414,
"grad_norm": 28.536113739013672,
"learning_rate": 4.6449585330874425e-07,
"logits/chosen": 0.2757744789123535,
"logits/rejected": 0.2731916606426239,
"logps/chosen": -68.78987121582031,
"logps/ref_chosen": -56.65319061279297,
"logps/ref_rejected": -63.45965576171875,
"logps/rejected": -83.94721984863281,
"loss": 1.0727,
"margin_dpo/margin_mean": 8.3508882522583,
"margin_dpo/margin_std": 13.240031242370605,
"step": 170
},
{
"epoch": 0.2585034013605442,
"grad_norm": 31.635536193847656,
"learning_rate": 4.6381366244617224e-07,
"logits/chosen": 0.3446548283100128,
"logits/rejected": 0.2929956614971161,
"logps/chosen": -77.2772216796875,
"logps/ref_chosen": -63.734764099121094,
"logps/ref_rejected": -78.50328063964844,
"logps/rejected": -101.05764770507812,
"loss": 1.0595,
"margin_dpo/margin_mean": 9.011910438537598,
"margin_dpo/margin_std": 13.920629501342773,
"step": 171
},
{
"epoch": 0.2600151171579743,
"grad_norm": 30.46820640563965,
"learning_rate": 4.631254907558365e-07,
"logits/chosen": 0.3391519784927368,
"logits/rejected": 0.2795373201370239,
"logps/chosen": -66.91981506347656,
"logps/ref_chosen": -52.201759338378906,
"logps/ref_rejected": -82.85285949707031,
"logps/rejected": -105.961181640625,
"loss": 1.0236,
"margin_dpo/margin_mean": 8.390253067016602,
"margin_dpo/margin_std": 12.497659683227539,
"step": 172
},
{
"epoch": 0.2615268329554044,
"grad_norm": 32.092742919921875,
"learning_rate": 4.624313574873786e-07,
"logits/chosen": 0.33410799503326416,
"logits/rejected": 0.24394693970680237,
"logps/chosen": -69.11787414550781,
"logps/ref_chosen": -55.43472671508789,
"logps/ref_rejected": -77.8196792602539,
"logps/rejected": -100.34062194824219,
"loss": 1.1421,
"margin_dpo/margin_mean": 8.837798118591309,
"margin_dpo/margin_std": 15.246070861816406,
"step": 173
},
{
"epoch": 0.26303854875283444,
"grad_norm": 32.23822021484375,
"learning_rate": 4.61731282057198e-07,
"logits/chosen": 0.3190588355064392,
"logits/rejected": 0.25044068694114685,
"logps/chosen": -72.16645050048828,
"logps/ref_chosen": -57.17195129394531,
"logps/ref_rejected": -85.47578430175781,
"logps/rejected": -109.65440368652344,
"loss": 0.9885,
"margin_dpo/margin_mean": 9.184125900268555,
"margin_dpo/margin_std": 13.11090087890625,
"step": 174
},
{
"epoch": 0.26455026455026454,
"grad_norm": 31.132305145263672,
"learning_rate": 4.6102528404790965e-07,
"logits/chosen": 0.35476261377334595,
"logits/rejected": 0.3234173655509949,
"logps/chosen": -80.94275665283203,
"logps/ref_chosen": -67.6656265258789,
"logps/ref_rejected": -84.36767578125,
"logps/rejected": -107.86813354492188,
"loss": 0.9698,
"margin_dpo/margin_mean": 10.22334098815918,
"margin_dpo/margin_std": 14.124544143676758,
"step": 175
},
{
"epoch": 0.2660619803476946,
"grad_norm": 42.15481185913086,
"learning_rate": 4.603133832077953e-07,
"logits/chosen": 0.28306037187576294,
"logits/rejected": 0.2561969459056854,
"logps/chosen": -93.82209777832031,
"logps/ref_chosen": -77.8587646484375,
"logps/ref_rejected": -81.08732604980469,
"logps/rejected": -103.56967163085938,
"loss": 1.2243,
"margin_dpo/margin_mean": 6.519006729125977,
"margin_dpo/margin_std": 14.000919342041016,
"step": 176
},
{
"epoch": 0.2675736961451247,
"grad_norm": 33.34611511230469,
"learning_rate": 4.5959559945025183e-07,
"logits/chosen": 0.40190625190734863,
"logits/rejected": 0.3057857155799866,
"logps/chosen": -67.558837890625,
"logps/ref_chosen": -55.22039794921875,
"logps/ref_rejected": -92.54974365234375,
"logps/rejected": -117.42526245117188,
"loss": 0.7731,
"margin_dpo/margin_mean": 12.537084579467773,
"margin_dpo/margin_std": 12.930008888244629,
"step": 177
},
{
"epoch": 0.2690854119425548,
"grad_norm": 34.81660079956055,
"learning_rate": 4.588719528532341e-07,
"logits/chosen": 0.2513394057750702,
"logits/rejected": 0.2027273029088974,
"logps/chosen": -74.91337585449219,
"logps/ref_chosen": -60.81048583984375,
"logps/ref_rejected": -81.12973022460938,
"logps/rejected": -102.42813110351562,
"loss": 1.0615,
"margin_dpo/margin_mean": 7.1955084800720215,
"margin_dpo/margin_std": 11.241630554199219,
"step": 178
},
{
"epoch": 0.2705971277399849,
"grad_norm": 36.181095123291016,
"learning_rate": 4.581424636586928e-07,
"logits/chosen": 0.31768059730529785,
"logits/rejected": 0.30116671323776245,
"logps/chosen": -80.27589416503906,
"logps/ref_chosen": -65.67171478271484,
"logps/ref_rejected": -75.32586669921875,
"logps/rejected": -98.68672180175781,
"loss": 1.0505,
"margin_dpo/margin_mean": 8.756677627563477,
"margin_dpo/margin_std": 13.856939315795898,
"step": 179
},
{
"epoch": 0.272108843537415,
"grad_norm": 32.07508850097656,
"learning_rate": 4.5740715227200897e-07,
"logits/chosen": 0.1807074397802353,
"logits/rejected": 0.1605818122625351,
"logps/chosen": -67.91615295410156,
"logps/ref_chosen": -56.68280792236328,
"logps/ref_rejected": -64.94414520263672,
"logps/rejected": -83.39351654052734,
"loss": 1.1246,
"margin_dpo/margin_mean": 7.216020107269287,
"margin_dpo/margin_std": 13.130704879760742,
"step": 180
},
{
"epoch": 0.273620559334845,
"grad_norm": 28.916379928588867,
"learning_rate": 4.566660392614228e-07,
"logits/chosen": 0.3331793546676636,
"logits/rejected": 0.2920447587966919,
"logps/chosen": -70.67239379882812,
"logps/ref_chosen": -60.77604675292969,
"logps/ref_rejected": -83.98361206054688,
"logps/rejected": -104.28045654296875,
"loss": 0.8325,
"margin_dpo/margin_mean": 10.40049934387207,
"margin_dpo/margin_std": 11.44849967956543,
"step": 181
},
{
"epoch": 0.2751322751322751,
"grad_norm": 32.2235107421875,
"learning_rate": 4.5591914535745817e-07,
"logits/chosen": 0.2782343626022339,
"logits/rejected": 0.1968703716993332,
"logps/chosen": -72.92794036865234,
"logps/ref_chosen": -60.2537841796875,
"logps/ref_rejected": -89.7706298828125,
"logps/rejected": -112.99656677246094,
"loss": 0.9611,
"margin_dpo/margin_mean": 10.551786422729492,
"margin_dpo/margin_std": 14.136862754821777,
"step": 182
},
{
"epoch": 0.2766439909297052,
"grad_norm": 35.716888427734375,
"learning_rate": 4.551664914523433e-07,
"logits/chosen": 0.2787359952926636,
"logits/rejected": 0.25693994760513306,
"logps/chosen": -77.08547973632812,
"logps/ref_chosen": -61.76142120361328,
"logps/ref_rejected": -72.54627990722656,
"logps/rejected": -92.31723022460938,
"loss": 1.2769,
"margin_dpo/margin_mean": 4.446890830993652,
"margin_dpo/margin_std": 11.614776611328125,
"step": 183
},
{
"epoch": 0.2781557067271353,
"grad_norm": 26.859216690063477,
"learning_rate": 4.544080985994258e-07,
"logits/chosen": 0.3913435935974121,
"logits/rejected": 0.3254718780517578,
"logps/chosen": -56.61007308959961,
"logps/ref_chosen": -46.840721130371094,
"logps/ref_rejected": -69.3609390258789,
"logps/rejected": -86.7547378540039,
"loss": 0.9711,
"margin_dpo/margin_mean": 7.624444007873535,
"margin_dpo/margin_std": 10.045758247375488,
"step": 184
},
{
"epoch": 0.2796674225245654,
"grad_norm": 30.3480224609375,
"learning_rate": 4.5364398801258394e-07,
"logits/chosen": 0.3209272027015686,
"logits/rejected": 0.2729586958885193,
"logps/chosen": -64.24158477783203,
"logps/ref_chosen": -52.321136474609375,
"logps/ref_rejected": -68.3885726928711,
"logps/rejected": -88.5174560546875,
"loss": 1.1302,
"margin_dpo/margin_mean": 8.20844554901123,
"margin_dpo/margin_std": 14.266172409057617,
"step": 185
},
{
"epoch": 0.2811791383219955,
"grad_norm": 38.07506561279297,
"learning_rate": 4.5287418106563354e-07,
"logits/chosen": 0.2616552710533142,
"logits/rejected": 0.21807625889778137,
"logps/chosen": -77.84188079833984,
"logps/ref_chosen": -67.42012786865234,
"logps/ref_rejected": -82.50968933105469,
"logps/rejected": -101.93424987792969,
"loss": 1.0208,
"margin_dpo/margin_mean": 9.002808570861816,
"margin_dpo/margin_std": 13.614838600158691,
"step": 186
},
{
"epoch": 0.28269085411942557,
"grad_norm": 38.66781234741211,
"learning_rate": 4.520986992917297e-07,
"logits/chosen": 0.274710476398468,
"logits/rejected": 0.2203405201435089,
"logps/chosen": -87.93611907958984,
"logps/ref_chosen": -75.52549743652344,
"logps/ref_rejected": -94.76289367675781,
"logps/rejected": -115.23272705078125,
"loss": 1.0829,
"margin_dpo/margin_mean": 8.059209823608398,
"margin_dpo/margin_std": 13.422420501708984,
"step": 187
},
{
"epoch": 0.2842025699168556,
"grad_norm": 33.12516403198242,
"learning_rate": 4.5131756438276466e-07,
"logits/chosen": 0.2892908751964569,
"logits/rejected": 0.24562746286392212,
"logps/chosen": -82.09852600097656,
"logps/ref_chosen": -71.52333068847656,
"logps/ref_rejected": -78.29949951171875,
"logps/rejected": -97.32029724121094,
"loss": 1.0067,
"margin_dpo/margin_mean": 8.44560432434082,
"margin_dpo/margin_std": 12.50286865234375,
"step": 188
},
{
"epoch": 0.2857142857142857,
"grad_norm": 34.14213943481445,
"learning_rate": 4.5053079818876096e-07,
"logits/chosen": 0.29462122917175293,
"logits/rejected": 0.3075593411922455,
"logps/chosen": -82.23419189453125,
"logps/ref_chosen": -72.17626953125,
"logps/ref_rejected": -75.26313781738281,
"logps/rejected": -93.46210479736328,
"loss": 1.0092,
"margin_dpo/margin_mean": 8.141053199768066,
"margin_dpo/margin_std": 11.562788963317871,
"step": 189
},
{
"epoch": 0.2872260015117158,
"grad_norm": 36.18482208251953,
"learning_rate": 4.4973842271726024e-07,
"logits/chosen": 0.38970842957496643,
"logits/rejected": 0.2420949637889862,
"logps/chosen": -64.18421936035156,
"logps/ref_chosen": -54.624267578125,
"logps/ref_rejected": -101.47068786621094,
"logps/rejected": -120.8863525390625,
"loss": 0.8885,
"margin_dpo/margin_mean": 9.855717658996582,
"margin_dpo/margin_std": 11.999580383300781,
"step": 190
},
{
"epoch": 0.2887377173091459,
"grad_norm": 40.0859260559082,
"learning_rate": 4.48940460132708e-07,
"logits/chosen": 0.355924129486084,
"logits/rejected": 0.3264332413673401,
"logps/chosen": -85.68058776855469,
"logps/ref_chosen": -72.9325180053711,
"logps/ref_rejected": -89.95103454589844,
"logps/rejected": -110.76531982421875,
"loss": 1.0575,
"margin_dpo/margin_mean": 8.066211700439453,
"margin_dpo/margin_std": 12.901345252990723,
"step": 191
},
{
"epoch": 0.29024943310657597,
"grad_norm": 28.41532325744629,
"learning_rate": 4.481369327558329e-07,
"logits/chosen": 0.33127346634864807,
"logits/rejected": 0.30464470386505127,
"logps/chosen": -66.98524475097656,
"logps/ref_chosen": -54.001121520996094,
"logps/ref_rejected": -63.53154754638672,
"logps/rejected": -81.61679077148438,
"loss": 1.1598,
"margin_dpo/margin_mean": 5.101117134094238,
"margin_dpo/margin_std": 10.235267639160156,
"step": 192
},
{
"epoch": 0.29176114890400606,
"grad_norm": 29.145444869995117,
"learning_rate": 4.47327863063023e-07,
"logits/chosen": 0.27890801429748535,
"logits/rejected": 0.25508564710617065,
"logps/chosen": -67.446044921875,
"logps/ref_chosen": -56.74927520751953,
"logps/ref_rejected": -58.80628967285156,
"logps/rejected": -78.36878204345703,
"loss": 0.9339,
"margin_dpo/margin_mean": 8.865718841552734,
"margin_dpo/margin_std": 11.468514442443848,
"step": 193
},
{
"epoch": 0.29327286470143615,
"grad_norm": 28.73845863342285,
"learning_rate": 4.4651327368569684e-07,
"logits/chosen": 0.3396986722946167,
"logits/rejected": 0.30936378240585327,
"logps/chosen": -66.97239685058594,
"logps/ref_chosen": -56.649444580078125,
"logps/ref_rejected": -69.98954772949219,
"logps/rejected": -88.03752899169922,
"loss": 1.0814,
"margin_dpo/margin_mean": 7.725024700164795,
"margin_dpo/margin_std": 12.432638168334961,
"step": 194
},
{
"epoch": 0.2947845804988662,
"grad_norm": 35.004730224609375,
"learning_rate": 4.4569318740967043e-07,
"logits/chosen": 0.21706655621528625,
"logits/rejected": 0.21985185146331787,
"logps/chosen": -83.40870666503906,
"logps/ref_chosen": -70.40978240966797,
"logps/ref_rejected": -74.39448547363281,
"logps/rejected": -96.0592041015625,
"loss": 0.9816,
"margin_dpo/margin_mean": 8.665790557861328,
"margin_dpo/margin_std": 11.95964241027832,
"step": 195
},
{
"epoch": 0.2962962962962963,
"grad_norm": 27.6475830078125,
"learning_rate": 4.448676271745197e-07,
"logits/chosen": 0.3428490161895752,
"logits/rejected": 0.29989540576934814,
"logps/chosen": -71.03530883789062,
"logps/ref_chosen": -59.227577209472656,
"logps/ref_rejected": -83.54757690429688,
"logps/rejected": -102.79083251953125,
"loss": 1.0041,
"margin_dpo/margin_mean": 7.435524940490723,
"margin_dpo/margin_std": 10.328752517700195,
"step": 196
},
{
"epoch": 0.29780801209372637,
"grad_norm": 30.879732131958008,
"learning_rate": 4.440366160729392e-07,
"logits/chosen": 0.4135209619998932,
"logits/rejected": 0.36199289560317993,
"logps/chosen": -61.91220474243164,
"logps/ref_chosen": -51.52912902832031,
"logps/ref_rejected": -73.70631408691406,
"logps/rejected": -92.97248840332031,
"loss": 1.0631,
"margin_dpo/margin_mean": 8.88310432434082,
"margin_dpo/margin_std": 13.773177146911621,
"step": 197
},
{
"epoch": 0.29931972789115646,
"grad_norm": 29.16172981262207,
"learning_rate": 4.432001773500957e-07,
"logits/chosen": 0.3768477439880371,
"logits/rejected": 0.33511894941329956,
"logps/chosen": -70.8149642944336,
"logps/ref_chosen": -59.78268051147461,
"logps/ref_rejected": -72.24533081054688,
"logps/rejected": -92.16888427734375,
"loss": 0.9079,
"margin_dpo/margin_mean": 8.891267776489258,
"margin_dpo/margin_std": 10.59740161895752,
"step": 198
},
{
"epoch": 0.30083144368858655,
"grad_norm": 31.893878936767578,
"learning_rate": 4.4235833440297856e-07,
"logits/chosen": 0.33347535133361816,
"logits/rejected": 0.24467086791992188,
"logps/chosen": -69.01676940917969,
"logps/ref_chosen": -56.38677215576172,
"logps/ref_rejected": -74.56779479980469,
"logps/rejected": -94.29923248291016,
"loss": 1.1132,
"margin_dpo/margin_mean": 7.101439476013184,
"margin_dpo/margin_std": 12.053339958190918,
"step": 199
},
{
"epoch": 0.30234315948601664,
"grad_norm": 28.685178756713867,
"learning_rate": 4.415111107797445e-07,
"logits/chosen": 0.36256861686706543,
"logits/rejected": 0.289836585521698,
"logps/chosen": -67.39482116699219,
"logps/ref_chosen": -57.82432556152344,
"logps/ref_rejected": -89.28246307373047,
"logps/rejected": -109.56626892089844,
"loss": 0.9165,
"margin_dpo/margin_mean": 10.713313102722168,
"margin_dpo/margin_std": 13.711478233337402,
"step": 200
},
{
"epoch": 0.30234315948601664,
"eval_logits/chosen": 0.3345372676849365,
"eval_logits/rejected": 0.28681638836860657,
"eval_logps/chosen": -86.55070495605469,
"eval_logps/ref_chosen": -74.85946655273438,
"eval_logps/ref_rejected": -79.54898834228516,
"eval_logps/rejected": -98.71231079101562,
"eval_loss": 0.5447199940681458,
"eval_margin_dpo/margin_mean": 7.472083568572998,
"eval_margin_dpo/margin_std": 12.559971809387207,
"eval_runtime": 38.8204,
"eval_samples_per_second": 59.324,
"eval_steps_per_second": 1.855,
"step": 200
},
{
"epoch": 0.30385487528344673,
"grad_norm": 36.19288635253906,
"learning_rate": 4.4065853017905953e-07,
"logits/chosen": 0.37724757194519043,
"logits/rejected": 0.3313670754432678,
"logps/chosen": -72.42201232910156,
"logps/ref_chosen": -58.999759674072266,
"logps/ref_rejected": -84.67575073242188,
"logps/rejected": -105.98941802978516,
"loss": 1.0514,
"margin_dpo/margin_mean": 7.891414165496826,
"margin_dpo/margin_std": 12.470344543457031,
"step": 201
},
{
"epoch": 0.30536659108087677,
"grad_norm": 30.247343063354492,
"learning_rate": 4.3980061644943575e-07,
"logits/chosen": 0.30372416973114014,
"logits/rejected": 0.23039022088050842,
"logps/chosen": -58.71697235107422,
"logps/ref_chosen": -47.660648345947266,
"logps/ref_rejected": -73.63249206542969,
"logps/rejected": -93.1816635131836,
"loss": 1.0008,
"margin_dpo/margin_mean": 8.492842674255371,
"margin_dpo/margin_std": 12.252424240112305,
"step": 202
},
{
"epoch": 0.30687830687830686,
"grad_norm": 34.87704086303711,
"learning_rate": 4.3893739358856455e-07,
"logits/chosen": 0.34375059604644775,
"logits/rejected": 0.26960641145706177,
"logps/chosen": -75.08006286621094,
"logps/ref_chosen": -62.32553482055664,
"logps/ref_rejected": -99.37225341796875,
"logps/rejected": -119.60014343261719,
"loss": 1.0524,
"margin_dpo/margin_mean": 7.473361968994141,
"margin_dpo/margin_std": 11.997831344604492,
"step": 203
},
{
"epoch": 0.30839002267573695,
"grad_norm": 28.913715362548828,
"learning_rate": 4.380688857426449e-07,
"logits/chosen": 0.30120253562927246,
"logits/rejected": 0.23232108354568481,
"logps/chosen": -62.70631790161133,
"logps/ref_chosen": -50.62931442260742,
"logps/ref_rejected": -66.60475158691406,
"logps/rejected": -87.44784545898438,
"loss": 0.9799,
"margin_dpo/margin_mean": 8.766094207763672,
"margin_dpo/margin_std": 11.864765167236328,
"step": 204
},
{
"epoch": 0.30990173847316704,
"grad_norm": 41.798789978027344,
"learning_rate": 4.3719511720570814e-07,
"logits/chosen": 0.37054669857025146,
"logits/rejected": 0.30491408705711365,
"logps/chosen": -83.5518569946289,
"logps/ref_chosen": -70.35617065429688,
"logps/ref_rejected": -93.39848327636719,
"logps/rejected": -114.78820037841797,
"loss": 1.0827,
"margin_dpo/margin_mean": 8.19404411315918,
"margin_dpo/margin_std": 13.707481384277344,
"step": 205
},
{
"epoch": 0.31141345427059713,
"grad_norm": 35.05681610107422,
"learning_rate": 4.363161124189387e-07,
"logits/chosen": 0.3611377775669098,
"logits/rejected": 0.34499603509902954,
"logps/chosen": -81.56375122070312,
"logps/ref_chosen": -67.64547729492188,
"logps/ref_rejected": -79.89584350585938,
"logps/rejected": -100.60433959960938,
"loss": 1.1781,
"margin_dpo/margin_mean": 6.790228843688965,
"margin_dpo/margin_std": 13.328733444213867,
"step": 206
},
{
"epoch": 0.3129251700680272,
"grad_norm": 29.690143585205078,
"learning_rate": 4.3543189596998986e-07,
"logits/chosen": 0.287255197763443,
"logits/rejected": 0.22083953022956848,
"logps/chosen": -83.5641860961914,
"logps/ref_chosen": -67.66419219970703,
"logps/ref_rejected": -85.10249328613281,
"logps/rejected": -110.21287536621094,
"loss": 0.9536,
"margin_dpo/margin_mean": 9.210400581359863,
"margin_dpo/margin_std": 12.47515869140625,
"step": 207
},
{
"epoch": 0.3144368858654573,
"grad_norm": 35.294986724853516,
"learning_rate": 4.3454249259229664e-07,
"logits/chosen": 0.3147898316383362,
"logits/rejected": 0.28870663046836853,
"logps/chosen": -70.04411315917969,
"logps/ref_chosen": -57.731712341308594,
"logps/ref_rejected": -74.19276428222656,
"logps/rejected": -91.46826171875,
"loss": 1.2561,
"margin_dpo/margin_mean": 4.963096618652344,
"margin_dpo/margin_std": 11.9405517578125,
"step": 208
},
{
"epoch": 0.31594860166288735,
"grad_norm": 35.39768981933594,
"learning_rate": 4.336479271643833e-07,
"logits/chosen": 0.2884509563446045,
"logits/rejected": 0.23318856954574585,
"logps/chosen": -81.53282165527344,
"logps/ref_chosen": -68.55007934570312,
"logps/ref_rejected": -87.90542602539062,
"logps/rejected": -110.58944702148438,
"loss": 0.9925,
"margin_dpo/margin_mean": 9.701295852661133,
"margin_dpo/margin_std": 13.965337753295898,
"step": 209
},
{
"epoch": 0.31746031746031744,
"grad_norm": 29.025083541870117,
"learning_rate": 4.327482247091679e-07,
"logits/chosen": 0.36582332849502563,
"logits/rejected": 0.26993420720100403,
"logps/chosen": -70.67583465576172,
"logps/ref_chosen": -57.268272399902344,
"logps/ref_rejected": -85.72807312011719,
"logps/rejected": -108.93333435058594,
"loss": 0.9766,
"margin_dpo/margin_mean": 9.797691345214844,
"margin_dpo/margin_std": 13.741132736206055,
"step": 210
},
{
"epoch": 0.31897203325774753,
"grad_norm": 35.95180130004883,
"learning_rate": 4.3184341039326217e-07,
"logits/chosen": 0.373915433883667,
"logits/rejected": 0.28178757429122925,
"logps/chosen": -65.3688735961914,
"logps/ref_chosen": -53.640708923339844,
"logps/ref_rejected": -93.03880310058594,
"logps/rejected": -113.27926635742188,
"loss": 0.9593,
"margin_dpo/margin_mean": 8.512306213378906,
"margin_dpo/margin_std": 11.634811401367188,
"step": 211
},
{
"epoch": 0.3204837490551776,
"grad_norm": 30.75922966003418,
"learning_rate": 4.309335095262675e-07,
"logits/chosen": 0.35718491673469543,
"logits/rejected": 0.2856866717338562,
"logps/chosen": -71.02579498291016,
"logps/ref_chosen": -57.36674499511719,
"logps/ref_rejected": -79.89643096923828,
"logps/rejected": -102.65644836425781,
"loss": 0.9821,
"margin_dpo/margin_mean": 9.100960731506348,
"margin_dpo/margin_std": 12.861712455749512,
"step": 212
},
{
"epoch": 0.3219954648526077,
"grad_norm": 30.642589569091797,
"learning_rate": 4.3001854756006724e-07,
"logits/chosen": 0.3269196152687073,
"logits/rejected": 0.30273616313934326,
"logps/chosen": -75.70863342285156,
"logps/ref_chosen": -65.22111511230469,
"logps/ref_rejected": -80.1810302734375,
"logps/rejected": -100.77503967285156,
"loss": 0.9715,
"margin_dpo/margin_mean": 10.106489181518555,
"margin_dpo/margin_std": 14.099427223205566,
"step": 213
},
{
"epoch": 0.3235071806500378,
"grad_norm": 36.855289459228516,
"learning_rate": 4.290985500881143e-07,
"logits/chosen": 0.22347499430179596,
"logits/rejected": 0.2006388008594513,
"logps/chosen": -73.53646850585938,
"logps/ref_chosen": -61.292327880859375,
"logps/ref_rejected": -67.69841003417969,
"logps/rejected": -89.87754821777344,
"loss": 0.9768,
"margin_dpo/margin_mean": 9.935011863708496,
"margin_dpo/margin_std": 13.721136093139648,
"step": 214
},
{
"epoch": 0.3250188964474679,
"grad_norm": 32.00320053100586,
"learning_rate": 4.281735428447157e-07,
"logits/chosen": 0.25742843747138977,
"logits/rejected": 0.15232022106647491,
"logps/chosen": -77.86520385742188,
"logps/ref_chosen": -63.86913299560547,
"logps/ref_rejected": -98.7657241821289,
"logps/rejected": -123.19509887695312,
"loss": 0.9391,
"margin_dpo/margin_mean": 10.43331241607666,
"margin_dpo/margin_std": 13.504112243652344,
"step": 215
},
{
"epoch": 0.32653061224489793,
"grad_norm": 29.31924819946289,
"learning_rate": 4.2724355170431247e-07,
"logits/chosen": 0.3576727509498596,
"logits/rejected": 0.2741282284259796,
"logps/chosen": -80.46449279785156,
"logps/ref_chosen": -67.824951171875,
"logps/ref_rejected": -96.40231323242188,
"logps/rejected": -119.64827728271484,
"loss": 0.9112,
"margin_dpo/margin_mean": 10.606414794921875,
"margin_dpo/margin_std": 13.692004203796387,
"step": 216
},
{
"epoch": 0.328042328042328,
"grad_norm": 28.878402709960938,
"learning_rate": 4.26308602680756e-07,
"logits/chosen": 0.33748987317085266,
"logits/rejected": 0.23344279825687408,
"logps/chosen": -74.45117950439453,
"logps/ref_chosen": -60.50499725341797,
"logps/ref_rejected": -84.26618194580078,
"logps/rejected": -109.54447937011719,
"loss": 0.8522,
"margin_dpo/margin_mean": 11.332113265991211,
"margin_dpo/margin_std": 13.563920021057129,
"step": 217
},
{
"epoch": 0.3295540438397581,
"grad_norm": 35.3365592956543,
"learning_rate": 4.253687219265803e-07,
"logits/chosen": 0.2013692557811737,
"logits/rejected": 0.19567659497261047,
"logps/chosen": -85.214111328125,
"logps/ref_chosen": -70.59431457519531,
"logps/ref_rejected": -73.89038848876953,
"logps/rejected": -95.47663116455078,
"loss": 1.2315,
"margin_dpo/margin_mean": 6.9664506912231445,
"margin_dpo/margin_std": 14.436254501342773,
"step": 218
},
{
"epoch": 0.3310657596371882,
"grad_norm": 32.062076568603516,
"learning_rate": 4.2442393573227043e-07,
"logits/chosen": 0.2901223301887512,
"logits/rejected": 0.24815303087234497,
"logps/chosen": -73.95545959472656,
"logps/ref_chosen": -60.490943908691406,
"logps/ref_rejected": -75.85001373291016,
"logps/rejected": -96.14393615722656,
"loss": 1.0681,
"margin_dpo/margin_mean": 6.829412460327148,
"margin_dpo/margin_std": 11.092742919921875,
"step": 219
},
{
"epoch": 0.3325774754346183,
"grad_norm": 29.108675003051758,
"learning_rate": 4.234742705255272e-07,
"logits/chosen": 0.3480263352394104,
"logits/rejected": 0.28689658641815186,
"logps/chosen": -56.813446044921875,
"logps/ref_chosen": -45.013397216796875,
"logps/ref_rejected": -70.49369812011719,
"logps/rejected": -90.70354461669922,
"loss": 1.0393,
"margin_dpo/margin_mean": 8.409794807434082,
"margin_dpo/margin_std": 13.006105422973633,
"step": 220
},
{
"epoch": 0.3340891912320484,
"grad_norm": 30.952327728271484,
"learning_rate": 4.22519752870528e-07,
"logits/chosen": 0.33196067810058594,
"logits/rejected": 0.26177024841308594,
"logps/chosen": -70.25636291503906,
"logps/ref_chosen": -59.09584045410156,
"logps/ref_rejected": -88.64388275146484,
"logps/rejected": -108.89505004882812,
"loss": 1.0002,
"margin_dpo/margin_mean": 9.09065055847168,
"margin_dpo/margin_std": 13.789722442626953,
"step": 221
},
{
"epoch": 0.3356009070294785,
"grad_norm": 29.369138717651367,
"learning_rate": 4.2156040946718343e-07,
"logits/chosen": 0.3795938193798065,
"logits/rejected": 0.29519224166870117,
"logps/chosen": -67.63729858398438,
"logps/ref_chosen": -55.9976921081543,
"logps/ref_rejected": -111.94727325439453,
"logps/rejected": -135.4719696044922,
"loss": 0.8308,
"margin_dpo/margin_mean": 11.885089874267578,
"margin_dpo/margin_std": 13.283910751342773,
"step": 222
},
{
"epoch": 0.3371126228269085,
"grad_norm": 28.02263069152832,
"learning_rate": 4.2059626715039065e-07,
"logits/chosen": 0.3661195635795593,
"logits/rejected": 0.30957603454589844,
"logps/chosen": -72.39132690429688,
"logps/ref_chosen": -59.891422271728516,
"logps/ref_rejected": -86.28954315185547,
"logps/rejected": -109.10917663574219,
"loss": 0.8733,
"margin_dpo/margin_mean": 10.319726943969727,
"margin_dpo/margin_std": 12.016202926635742,
"step": 223
},
{
"epoch": 0.3386243386243386,
"grad_norm": 36.042808532714844,
"learning_rate": 4.1962735288928304e-07,
"logits/chosen": 0.3896036744117737,
"logits/rejected": 0.36814165115356445,
"logps/chosen": -77.76066589355469,
"logps/ref_chosen": -64.04463195800781,
"logps/ref_rejected": -75.05450439453125,
"logps/rejected": -94.96039581298828,
"loss": 1.123,
"margin_dpo/margin_mean": 6.189866065979004,
"margin_dpo/margin_std": 11.54200267791748,
"step": 224
},
{
"epoch": 0.3401360544217687,
"grad_norm": 39.249610900878906,
"learning_rate": 4.186536937864752e-07,
"logits/chosen": 0.3702365756034851,
"logits/rejected": 0.257682740688324,
"logps/chosen": -79.36314392089844,
"logps/ref_chosen": -66.0958251953125,
"logps/ref_rejected": -97.68675231933594,
"logps/rejected": -120.77241516113281,
"loss": 0.9436,
"margin_dpo/margin_mean": 9.818339347839355,
"margin_dpo/margin_std": 13.009382247924805,
"step": 225
},
{
"epoch": 0.3416477702191988,
"grad_norm": 30.140186309814453,
"learning_rate": 4.176753170773052e-07,
"logits/chosen": 0.3666003942489624,
"logits/rejected": 0.3200559914112091,
"logps/chosen": -63.04680633544922,
"logps/ref_chosen": -51.4168701171875,
"logps/ref_rejected": -66.30068969726562,
"logps/rejected": -85.97077941894531,
"loss": 1.067,
"margin_dpo/margin_mean": 8.040155410766602,
"margin_dpo/margin_std": 12.97111701965332,
"step": 226
},
{
"epoch": 0.3431594860166289,
"grad_norm": 37.78723907470703,
"learning_rate": 4.166922501290729e-07,
"logits/chosen": 0.4130534529685974,
"logits/rejected": 0.3729804754257202,
"logps/chosen": -70.11088562011719,
"logps/ref_chosen": -57.98978042602539,
"logps/ref_rejected": -75.05464172363281,
"logps/rejected": -95.43928527832031,
"loss": 1.0876,
"margin_dpo/margin_mean": 8.263538360595703,
"margin_dpo/margin_std": 13.883774757385254,
"step": 227
},
{
"epoch": 0.34467120181405897,
"grad_norm": 29.51940155029297,
"learning_rate": 4.1570452044027405e-07,
"logits/chosen": 0.36011219024658203,
"logits/rejected": 0.28398561477661133,
"logps/chosen": -68.40899658203125,
"logps/ref_chosen": -55.559364318847656,
"logps/ref_rejected": -77.02364349365234,
"logps/rejected": -98.55033874511719,
"loss": 1.0238,
"margin_dpo/margin_mean": 8.67706298828125,
"margin_dpo/margin_std": 13.047750473022461,
"step": 228
},
{
"epoch": 0.34618291761148906,
"grad_norm": 73.98247528076172,
"learning_rate": 4.147121556398312e-07,
"logits/chosen": 0.4597111940383911,
"logits/rejected": 0.3937884569168091,
"logps/chosen": -61.63206481933594,
"logps/ref_chosen": -50.79466247558594,
"logps/ref_rejected": -78.44740295410156,
"logps/rejected": -96.77717590332031,
"loss": 1.1032,
"margin_dpo/margin_mean": 7.492367744445801,
"margin_dpo/margin_std": 12.852076530456543,
"step": 229
},
{
"epoch": 0.3476946334089191,
"grad_norm": 31.33769989013672,
"learning_rate": 4.137151834863213e-07,
"logits/chosen": 0.33958911895751953,
"logits/rejected": 0.33543652296066284,
"logps/chosen": -68.4496841430664,
"logps/ref_chosen": -56.729225158691406,
"logps/ref_rejected": -62.99180603027344,
"logps/rejected": -83.51909637451172,
"loss": 0.979,
"margin_dpo/margin_mean": 8.806828498840332,
"margin_dpo/margin_std": 12.270458221435547,
"step": 230
},
{
"epoch": 0.3492063492063492,
"grad_norm": 29.804668426513672,
"learning_rate": 4.1271363186719835e-07,
"logits/chosen": 0.271785706281662,
"logits/rejected": 0.2568974196910858,
"logps/chosen": -85.11524200439453,
"logps/ref_chosen": -72.59710693359375,
"logps/ref_rejected": -86.2322998046875,
"logps/rejected": -111.23246002197266,
"loss": 0.7668,
"margin_dpo/margin_mean": 12.482011795043945,
"margin_dpo/margin_std": 12.127299308776855,
"step": 231
},
{
"epoch": 0.3507180650037793,
"grad_norm": 36.537750244140625,
"learning_rate": 4.1170752879801436e-07,
"logits/chosen": 0.32862627506256104,
"logits/rejected": 0.29829728603363037,
"logps/chosen": -80.80549621582031,
"logps/ref_chosen": -68.1185302734375,
"logps/ref_rejected": -83.79415893554688,
"logps/rejected": -105.02426147460938,
"loss": 1.0913,
"margin_dpo/margin_mean": 8.543130874633789,
"margin_dpo/margin_std": 14.806516647338867,
"step": 232
},
{
"epoch": 0.35222978080120937,
"grad_norm": 33.485225677490234,
"learning_rate": 4.106969024216348e-07,
"logits/chosen": 0.36027052998542786,
"logits/rejected": 0.302791953086853,
"logps/chosen": -69.43925476074219,
"logps/ref_chosen": -55.070152282714844,
"logps/ref_rejected": -66.61845397949219,
"logps/rejected": -88.03739166259766,
"loss": 1.1212,
"margin_dpo/margin_mean": 7.049837589263916,
"margin_dpo/margin_std": 12.219358444213867,
"step": 233
},
{
"epoch": 0.35374149659863946,
"grad_norm": 34.852638244628906,
"learning_rate": 4.09681781007452e-07,
"logits/chosen": 0.27572065591812134,
"logits/rejected": 0.26296335458755493,
"logps/chosen": -68.66838073730469,
"logps/ref_chosen": -55.92589569091797,
"logps/ref_rejected": -51.11608123779297,
"logps/rejected": -69.86701965332031,
"loss": 1.2086,
"margin_dpo/margin_mean": 6.008461952209473,
"margin_dpo/margin_std": 12.386802673339844,
"step": 234
},
{
"epoch": 0.35525321239606955,
"grad_norm": 27.907825469970703,
"learning_rate": 4.08662192950594e-07,
"logits/chosen": 0.3932538628578186,
"logits/rejected": 0.3740085959434509,
"logps/chosen": -74.77420806884766,
"logps/ref_chosen": -64.53972625732422,
"logps/ref_rejected": -77.69151306152344,
"logps/rejected": -98.98680877685547,
"loss": 0.7748,
"margin_dpo/margin_mean": 11.060816764831543,
"margin_dpo/margin_std": 10.737753868103027,
"step": 235
},
{
"epoch": 0.35676492819349964,
"grad_norm": 33.75122833251953,
"learning_rate": 4.076381667711306e-07,
"logits/chosen": 0.3117947280406952,
"logits/rejected": 0.2975386381149292,
"logps/chosen": -86.66970825195312,
"logps/ref_chosen": -71.15473937988281,
"logps/ref_rejected": -84.88542175292969,
"logps/rejected": -109.42959594726562,
"loss": 1.0202,
"margin_dpo/margin_mean": 9.029207229614258,
"margin_dpo/margin_std": 13.528522491455078,
"step": 236
},
{
"epoch": 0.35827664399092973,
"grad_norm": 35.89513397216797,
"learning_rate": 4.066097311132753e-07,
"logits/chosen": 0.37658143043518066,
"logits/rejected": 0.3630974590778351,
"logps/chosen": -88.70964050292969,
"logps/ref_chosen": -76.14201354980469,
"logps/ref_rejected": -80.88479614257812,
"logps/rejected": -102.75782775878906,
"loss": 1.012,
"margin_dpo/margin_mean": 9.305397033691406,
"margin_dpo/margin_std": 13.216224670410156,
"step": 237
},
{
"epoch": 0.35978835978835977,
"grad_norm": 40.203250885009766,
"learning_rate": 4.0557691474458414e-07,
"logits/chosen": 0.3156086802482605,
"logits/rejected": 0.30020976066589355,
"logps/chosen": -80.70606231689453,
"logps/ref_chosen": -68.88484954833984,
"logps/ref_rejected": -75.8946304321289,
"logps/rejected": -97.01763916015625,
"loss": 0.9747,
"margin_dpo/margin_mean": 9.301795959472656,
"margin_dpo/margin_std": 12.781771659851074,
"step": 238
},
{
"epoch": 0.36130007558578986,
"grad_norm": 30.92405128479004,
"learning_rate": 4.045397465551513e-07,
"logits/chosen": 0.4430525302886963,
"logits/rejected": 0.32228922843933105,
"logps/chosen": -71.54067993164062,
"logps/ref_chosen": -56.771827697753906,
"logps/ref_rejected": -116.23049926757812,
"logps/rejected": -141.01368713378906,
"loss": 0.952,
"margin_dpo/margin_mean": 10.014330863952637,
"margin_dpo/margin_std": 13.373289108276367,
"step": 239
},
{
"epoch": 0.36281179138321995,
"grad_norm": 24.328351974487305,
"learning_rate": 4.0349825555680045e-07,
"logits/chosen": 0.3731088638305664,
"logits/rejected": 0.28009554743766785,
"logps/chosen": -67.007080078125,
"logps/ref_chosen": -53.35411071777344,
"logps/ref_rejected": -80.12019348144531,
"logps/rejected": -105.83534240722656,
"loss": 0.8245,
"margin_dpo/margin_mean": 12.062172889709473,
"margin_dpo/margin_std": 13.12973403930664,
"step": 240
},
{
"epoch": 0.36432350718065004,
"grad_norm": 36.700382232666016,
"learning_rate": 4.0245247088227377e-07,
"logits/chosen": 0.3141937851905823,
"logits/rejected": 0.2778211832046509,
"logps/chosen": -85.28544616699219,
"logps/ref_chosen": -71.89541625976562,
"logps/ref_rejected": -83.03492736816406,
"logps/rejected": -103.01599884033203,
"loss": 1.1318,
"margin_dpo/margin_mean": 6.591043472290039,
"margin_dpo/margin_std": 12.219178199768066,
"step": 241
},
{
"epoch": 0.36583522297808013,
"grad_norm": 26.824167251586914,
"learning_rate": 4.0140242178441665e-07,
"logits/chosen": 0.3006839156150818,
"logits/rejected": 0.2773579955101013,
"logps/chosen": -70.70115661621094,
"logps/ref_chosen": -57.927433013916016,
"logps/ref_rejected": -67.83861541748047,
"logps/rejected": -91.4476318359375,
"loss": 0.9275,
"margin_dpo/margin_mean": 10.835296630859375,
"margin_dpo/margin_std": 14.33999252319336,
"step": 242
},
{
"epoch": 0.3673469387755102,
"grad_norm": 32.75960922241211,
"learning_rate": 4.003481376353596e-07,
"logits/chosen": 0.3232348561286926,
"logits/rejected": 0.32300877571105957,
"logps/chosen": -86.814453125,
"logps/ref_chosen": -74.27667236328125,
"logps/ref_rejected": -73.24340057373047,
"logps/rejected": -95.0753402709961,
"loss": 0.9749,
"margin_dpo/margin_mean": 9.294158935546875,
"margin_dpo/margin_std": 12.90059757232666,
"step": 243
},
{
"epoch": 0.3688586545729403,
"grad_norm": 23.66632843017578,
"learning_rate": 3.9928964792569654e-07,
"logits/chosen": 0.36699485778808594,
"logits/rejected": 0.2831515073776245,
"logps/chosen": -66.01345825195312,
"logps/ref_chosen": -53.36390686035156,
"logps/ref_rejected": -71.10276794433594,
"logps/rejected": -95.97030639648438,
"loss": 0.7451,
"margin_dpo/margin_mean": 12.21798324584961,
"margin_dpo/margin_std": 11.687814712524414,
"step": 244
},
{
"epoch": 0.37037037037037035,
"grad_norm": 80.20890808105469,
"learning_rate": 3.982269822636601e-07,
"logits/chosen": 0.3938656449317932,
"logits/rejected": 0.36316561698913574,
"logps/chosen": -84.29216003417969,
"logps/ref_chosen": -71.19510650634766,
"logps/ref_rejected": -80.76235961914062,
"logps/rejected": -107.34429168701172,
"loss": 0.7235,
"margin_dpo/margin_mean": 13.484872817993164,
"margin_dpo/margin_std": 12.965568542480469,
"step": 245
},
{
"epoch": 0.37188208616780044,
"grad_norm": 36.538177490234375,
"learning_rate": 3.971601703742932e-07,
"logits/chosen": 0.3998722434043884,
"logits/rejected": 0.33893048763275146,
"logps/chosen": -87.93953704833984,
"logps/ref_chosen": -71.62104797363281,
"logps/ref_rejected": -94.03392028808594,
"logps/rejected": -121.22987365722656,
"loss": 1.0117,
"margin_dpo/margin_mean": 10.877462387084961,
"margin_dpo/margin_std": 15.941274642944336,
"step": 246
},
{
"epoch": 0.37339380196523053,
"grad_norm": 37.14095687866211,
"learning_rate": 3.960892420986177e-07,
"logits/chosen": 0.36805200576782227,
"logits/rejected": 0.3567737936973572,
"logps/chosen": -96.70210266113281,
"logps/ref_chosen": -80.02254486083984,
"logps/ref_rejected": -89.22705078125,
"logps/rejected": -112.31900024414062,
"loss": 1.169,
"margin_dpo/margin_mean": 6.412394046783447,
"margin_dpo/margin_std": 12.312080383300781,
"step": 247
},
{
"epoch": 0.3749055177626606,
"grad_norm": 39.509544372558594,
"learning_rate": 3.9501422739279953e-07,
"logits/chosen": 0.34632980823516846,
"logits/rejected": 0.38421913981437683,
"logps/chosen": -80.1478271484375,
"logps/ref_chosen": -65.37796020507812,
"logps/ref_rejected": -61.36579132080078,
"logps/rejected": -86.59133911132812,
"loss": 1.0258,
"margin_dpo/margin_mean": 10.455678939819336,
"margin_dpo/margin_std": 15.734882354736328,
"step": 248
},
{
"epoch": 0.3764172335600907,
"grad_norm": 47.325138092041016,
"learning_rate": 3.9393515632731094e-07,
"logits/chosen": 0.34551554918289185,
"logits/rejected": 0.3774801194667816,
"logps/chosen": -92.70658874511719,
"logps/ref_chosen": -74.60145568847656,
"logps/ref_rejected": -63.79338455200195,
"logps/rejected": -86.26434326171875,
"loss": 1.3957,
"margin_dpo/margin_mean": 4.365830421447754,
"margin_dpo/margin_std": 13.425724029541016,
"step": 249
},
{
"epoch": 0.3779289493575208,
"grad_norm": 30.818462371826172,
"learning_rate": 3.9285205908608934e-07,
"logits/chosen": 0.4377121031284332,
"logits/rejected": 0.39241671562194824,
"logps/chosen": -77.27799224853516,
"logps/ref_chosen": -61.93821334838867,
"logps/ref_rejected": -72.21602630615234,
"logps/rejected": -98.83741760253906,
"loss": 0.9205,
"margin_dpo/margin_mean": 11.281606674194336,
"margin_dpo/margin_std": 14.549354553222656,
"step": 250
},
{
"epoch": 0.3794406651549509,
"grad_norm": 38.55635452270508,
"learning_rate": 3.9176496596569265e-07,
"logits/chosen": 0.41947078704833984,
"logits/rejected": 0.37922918796539307,
"logps/chosen": -83.08955383300781,
"logps/ref_chosen": -66.85694122314453,
"logps/ref_rejected": -84.83396911621094,
"logps/rejected": -108.45306396484375,
"loss": 1.1422,
"margin_dpo/margin_mean": 7.386477470397949,
"margin_dpo/margin_std": 13.823336601257324,
"step": 251
},
{
"epoch": 0.38095238095238093,
"grad_norm": 37.60855484008789,
"learning_rate": 3.9067390737445254e-07,
"logits/chosen": 0.3326997756958008,
"logits/rejected": 0.2792121469974518,
"logps/chosen": -71.88542938232422,
"logps/ref_chosen": -56.22393035888672,
"logps/ref_rejected": -77.1136245727539,
"logps/rejected": -99.84173583984375,
"loss": 1.2423,
"margin_dpo/margin_mean": 7.06661319732666,
"margin_dpo/margin_std": 14.898574829101562,
"step": 252
},
{
"epoch": 0.382464096749811,
"grad_norm": 30.489540100097656,
"learning_rate": 3.8957891383162304e-07,
"logits/chosen": 0.40060853958129883,
"logits/rejected": 0.3583984971046448,
"logps/chosen": -67.68878173828125,
"logps/ref_chosen": -52.21001434326172,
"logps/ref_rejected": -58.75764465332031,
"logps/rejected": -81.91943359375,
"loss": 1.0707,
"margin_dpo/margin_mean": 7.683013439178467,
"margin_dpo/margin_std": 12.151586532592773,
"step": 253
},
{
"epoch": 0.3839758125472411,
"grad_norm": 35.47545623779297,
"learning_rate": 3.884800159665276e-07,
"logits/chosen": 0.33447444438934326,
"logits/rejected": 0.28286612033843994,
"logps/chosen": -81.57742309570312,
"logps/ref_chosen": -65.63632202148438,
"logps/ref_rejected": -82.34425354003906,
"logps/rejected": -106.67283630371094,
"loss": 1.0607,
"margin_dpo/margin_mean": 8.387495040893555,
"margin_dpo/margin_std": 13.46995735168457,
"step": 254
},
{
"epoch": 0.3854875283446712,
"grad_norm": 30.077173233032227,
"learning_rate": 3.873772445177015e-07,
"logits/chosen": 0.33370327949523926,
"logits/rejected": 0.301089346408844,
"logps/chosen": -81.6827392578125,
"logps/ref_chosen": -67.91109466552734,
"logps/ref_rejected": -83.89114379882812,
"logps/rejected": -108.51498413085938,
"loss": 0.9371,
"margin_dpo/margin_mean": 10.852192878723145,
"margin_dpo/margin_std": 14.53363037109375,
"step": 255
},
{
"epoch": 0.3869992441421013,
"grad_norm": 35.74851989746094,
"learning_rate": 3.862706303320329e-07,
"logits/chosen": 0.3504742980003357,
"logits/rejected": 0.28951138257980347,
"logps/chosen": -80.06915283203125,
"logps/ref_chosen": -63.49998474121094,
"logps/ref_rejected": -90.77104187011719,
"logps/rejected": -116.5390625,
"loss": 1.0522,
"margin_dpo/margin_mean": 9.198848724365234,
"margin_dpo/margin_std": 14.691661834716797,
"step": 256
},
{
"epoch": 0.3885109599395314,
"grad_norm": 34.2720947265625,
"learning_rate": 3.851602043638994e-07,
"logits/chosen": 0.354351282119751,
"logits/rejected": 0.2911589741706848,
"logps/chosen": -88.06474304199219,
"logps/ref_chosen": -70.60064697265625,
"logps/ref_rejected": -108.5831298828125,
"logps/rejected": -138.23046875,
"loss": 0.9146,
"margin_dpo/margin_mean": 12.183237075805664,
"margin_dpo/margin_std": 16.567211151123047,
"step": 257
},
{
"epoch": 0.3900226757369615,
"grad_norm": 33.438987731933594,
"learning_rate": 3.840459976743023e-07,
"logits/chosen": 0.3662058711051941,
"logits/rejected": 0.31560778617858887,
"logps/chosen": -76.77505493164062,
"logps/ref_chosen": -59.25416564941406,
"logps/ref_rejected": -85.58709716796875,
"logps/rejected": -112.08135986328125,
"loss": 0.9038,
"margin_dpo/margin_mean": 8.97337818145752,
"margin_dpo/margin_std": 10.622298240661621,
"step": 258
},
{
"epoch": 0.3915343915343915,
"grad_norm": 29.57775115966797,
"learning_rate": 3.8292804142999796e-07,
"logits/chosen": 0.32210099697113037,
"logits/rejected": 0.22113582491874695,
"logps/chosen": -79.4823989868164,
"logps/ref_chosen": -65.43487548828125,
"logps/ref_rejected": -95.41731262207031,
"logps/rejected": -123.13349151611328,
"loss": 0.7817,
"margin_dpo/margin_mean": 13.668659210205078,
"margin_dpo/margin_std": 14.191095352172852,
"step": 259
},
{
"epoch": 0.3930461073318216,
"grad_norm": 32.224021911621094,
"learning_rate": 3.818063669026256e-07,
"logits/chosen": 0.36096808314323425,
"logits/rejected": 0.274898886680603,
"logps/chosen": -64.27941131591797,
"logps/ref_chosen": -49.08958435058594,
"logps/ref_rejected": -79.01708221435547,
"logps/rejected": -104.52508544921875,
"loss": 0.9997,
"margin_dpo/margin_mean": 10.318174362182617,
"margin_dpo/margin_std": 14.65306568145752,
"step": 260
},
{
"epoch": 0.3945578231292517,
"grad_norm": 40.651424407958984,
"learning_rate": 3.806810054678331e-07,
"logits/chosen": 0.2242593765258789,
"logits/rejected": 0.24470031261444092,
"logps/chosen": -86.40238952636719,
"logps/ref_chosen": -70.87239074707031,
"logps/ref_rejected": -65.01522064208984,
"logps/rejected": -88.8436279296875,
"loss": 1.0641,
"margin_dpo/margin_mean": 8.298402786254883,
"margin_dpo/margin_std": 13.585638999938965,
"step": 261
},
{
"epoch": 0.3960695389266818,
"grad_norm": 33.025047302246094,
"learning_rate": 3.7955198860439887e-07,
"logits/chosen": 0.4122297167778015,
"logits/rejected": 0.3500533998012543,
"logps/chosen": -83.70536804199219,
"logps/ref_chosen": -67.87063598632812,
"logps/ref_rejected": -88.7205810546875,
"logps/rejected": -114.37922668457031,
"loss": 0.9181,
"margin_dpo/margin_mean": 9.823917388916016,
"margin_dpo/margin_std": 12.397453308105469,
"step": 262
},
{
"epoch": 0.3975812547241119,
"grad_norm": 30.91775894165039,
"learning_rate": 3.784193478933516e-07,
"logits/chosen": 0.342675119638443,
"logits/rejected": 0.24015334248542786,
"logps/chosen": -70.62643432617188,
"logps/ref_chosen": -55.194580078125,
"logps/ref_rejected": -80.54048156738281,
"logps/rejected": -103.75779724121094,
"loss": 1.0746,
"margin_dpo/margin_mean": 7.785467147827148,
"margin_dpo/margin_std": 12.66977596282959,
"step": 263
},
{
"epoch": 0.39909297052154197,
"grad_norm": 37.161075592041016,
"learning_rate": 3.7728311501708674e-07,
"logits/chosen": 0.2615211606025696,
"logits/rejected": 0.21496251225471497,
"logps/chosen": -99.06900024414062,
"logps/ref_chosen": -83.17068481445312,
"logps/ref_rejected": -88.33625793457031,
"logps/rejected": -113.83183288574219,
"loss": 1.0343,
"margin_dpo/margin_mean": 9.59725570678711,
"margin_dpo/margin_std": 14.79243278503418,
"step": 264
},
{
"epoch": 0.40060468631897206,
"grad_norm": 38.972530364990234,
"learning_rate": 3.7614332175848027e-07,
"logits/chosen": 0.4157707691192627,
"logits/rejected": 0.3489346504211426,
"logps/chosen": -67.16413879394531,
"logps/ref_chosen": -51.66284942626953,
"logps/ref_rejected": -67.1720962524414,
"logps/rejected": -92.74415588378906,
"loss": 1.0821,
"margin_dpo/margin_mean": 10.07077407836914,
"margin_dpo/margin_std": 15.829389572143555,
"step": 265
},
{
"epoch": 0.4021164021164021,
"grad_norm": 33.671661376953125,
"learning_rate": 3.75e-07,
"logits/chosen": 0.3509487807750702,
"logits/rejected": 0.2787778973579407,
"logps/chosen": -72.02786254882812,
"logps/ref_chosen": -57.45049285888672,
"logps/ref_rejected": -77.60826110839844,
"logps/rejected": -101.64306640625,
"loss": 0.9936,
"margin_dpo/margin_mean": 9.457446098327637,
"margin_dpo/margin_std": 13.773505210876465,
"step": 266
},
{
"epoch": 0.4036281179138322,
"grad_norm": 31.01226234436035,
"learning_rate": 3.738531817228131e-07,
"logits/chosen": 0.39176273345947266,
"logits/rejected": 0.3712225556373596,
"logps/chosen": -68.095703125,
"logps/ref_chosen": -55.03534698486328,
"logps/ref_rejected": -66.0953369140625,
"logps/rejected": -86.55458068847656,
"loss": 1.1479,
"margin_dpo/margin_mean": 7.398888111114502,
"margin_dpo/margin_std": 13.48221206665039,
"step": 267
},
{
"epoch": 0.4051398337112623,
"grad_norm": 28.24346160888672,
"learning_rate": 3.7270289900589204e-07,
"logits/chosen": 0.25299277901649475,
"logits/rejected": 0.23204386234283447,
"logps/chosen": -77.99839782714844,
"logps/ref_chosen": -65.07174682617188,
"logps/ref_rejected": -71.42486572265625,
"logps/rejected": -93.17605590820312,
"loss": 0.9574,
"margin_dpo/margin_mean": 8.824535369873047,
"margin_dpo/margin_std": 11.893662452697754,
"step": 268
},
{
"epoch": 0.40665154950869237,
"grad_norm": 27.909160614013672,
"learning_rate": 3.7154918402511714e-07,
"logits/chosen": 0.4610741436481476,
"logits/rejected": 0.41174769401550293,
"logps/chosen": -81.76616668701172,
"logps/ref_chosen": -67.1362075805664,
"logps/ref_rejected": -82.55778503417969,
"logps/rejected": -106.98124694824219,
"loss": 0.9278,
"margin_dpo/margin_mean": 9.793498992919922,
"margin_dpo/margin_std": 12.174901962280273,
"step": 269
},
{
"epoch": 0.40816326530612246,
"grad_norm": 34.2803840637207,
"learning_rate": 3.7039206905237656e-07,
"logits/chosen": 0.3973391056060791,
"logits/rejected": 0.321039617061615,
"logps/chosen": -80.73213195800781,
"logps/ref_chosen": -66.6886978149414,
"logps/ref_rejected": -85.16129302978516,
"logps/rejected": -107.2292709350586,
"loss": 1.0113,
"margin_dpo/margin_mean": 8.024534225463867,
"margin_dpo/margin_std": 11.851795196533203,
"step": 270
},
{
"epoch": 0.40967498110355255,
"grad_norm": 38.43772888183594,
"learning_rate": 3.692315864546635e-07,
"logits/chosen": 0.3932819962501526,
"logits/rejected": 0.33234933018684387,
"logps/chosen": -85.6281967163086,
"logps/ref_chosen": -72.40754699707031,
"logps/ref_rejected": -92.0631103515625,
"logps/rejected": -112.34785461425781,
"loss": 1.2241,
"margin_dpo/margin_mean": 7.064090728759766,
"margin_dpo/margin_std": 15.192045211791992,
"step": 271
},
{
"epoch": 0.41118669690098264,
"grad_norm": 27.955270767211914,
"learning_rate": 3.6806776869317067e-07,
"logits/chosen": 0.36552393436431885,
"logits/rejected": 0.372539758682251,
"logps/chosen": -77.8504867553711,
"logps/ref_chosen": -66.60140228271484,
"logps/ref_rejected": -67.74339294433594,
"logps/rejected": -90.815185546875,
"loss": 0.7762,
"margin_dpo/margin_mean": 11.822696685791016,
"margin_dpo/margin_std": 12.071565628051758,
"step": 272
},
{
"epoch": 0.4126984126984127,
"grad_norm": 33.07332229614258,
"learning_rate": 3.669006483223828e-07,
"logits/chosen": 0.3804330825805664,
"logits/rejected": 0.3152013123035431,
"logps/chosen": -71.91949462890625,
"logps/ref_chosen": -57.35487365722656,
"logps/ref_rejected": -84.17168426513672,
"logps/rejected": -108.44322204589844,
"loss": 1.0126,
"margin_dpo/margin_mean": 9.70692253112793,
"margin_dpo/margin_std": 14.580245018005371,
"step": 273
},
{
"epoch": 0.41421012849584277,
"grad_norm": 28.571704864501953,
"learning_rate": 3.657302579891656e-07,
"logits/chosen": 0.24608616530895233,
"logits/rejected": 0.22161783277988434,
"logps/chosen": -73.17286682128906,
"logps/ref_chosen": -59.64149475097656,
"logps/ref_rejected": -68.29348754882812,
"logps/rejected": -91.27055358886719,
"loss": 0.98,
"margin_dpo/margin_mean": 9.445707321166992,
"margin_dpo/margin_std": 13.195257186889648,
"step": 274
},
{
"epoch": 0.41572184429327286,
"grad_norm": 28.784631729125977,
"learning_rate": 3.645566304318526e-07,
"logits/chosen": 0.339069664478302,
"logits/rejected": 0.25784239172935486,
"logps/chosen": -66.6173095703125,
"logps/ref_chosen": -53.26664733886719,
"logps/ref_rejected": -73.84062194824219,
"logps/rejected": -97.59869384765625,
"loss": 0.9142,
"margin_dpo/margin_mean": 10.407403945922852,
"margin_dpo/margin_std": 13.310070037841797,
"step": 275
},
{
"epoch": 0.41723356009070295,
"grad_norm": 27.40554428100586,
"learning_rate": 3.633797984793294e-07,
"logits/chosen": 0.29572436213493347,
"logits/rejected": 0.26014089584350586,
"logps/chosen": -65.08883666992188,
"logps/ref_chosen": -53.02079772949219,
"logps/ref_rejected": -61.56678771972656,
"logps/rejected": -83.73110961914062,
"loss": 0.8431,
"margin_dpo/margin_mean": 10.096281051635742,
"margin_dpo/margin_std": 11.02065658569336,
"step": 276
},
{
"epoch": 0.41874527588813304,
"grad_norm": 40.223724365234375,
"learning_rate": 3.6219979505011555e-07,
"logits/chosen": 0.4081869125366211,
"logits/rejected": 0.42862510681152344,
"logps/chosen": -85.9726333618164,
"logps/ref_chosen": -71.43299102783203,
"logps/ref_rejected": -67.65852355957031,
"logps/rejected": -88.13728332519531,
"loss": 1.1897,
"margin_dpo/margin_mean": 5.939116477966309,
"margin_dpo/margin_std": 12.596590995788574,
"step": 277
},
{
"epoch": 0.42025699168556313,
"grad_norm": 33.17136001586914,
"learning_rate": 3.6101665315144353e-07,
"logits/chosen": 0.26065516471862793,
"logits/rejected": 0.21105234324932098,
"logps/chosen": -80.96624755859375,
"logps/ref_chosen": -67.11076354980469,
"logps/ref_rejected": -88.74851989746094,
"logps/rejected": -112.27589416503906,
"loss": 0.959,
"margin_dpo/margin_mean": 9.671895980834961,
"margin_dpo/margin_std": 12.61258316040039,
"step": 278
},
{
"epoch": 0.4217687074829932,
"grad_norm": 25.243629455566406,
"learning_rate": 3.5983040587833563e-07,
"logits/chosen": 0.31561681628227234,
"logits/rejected": 0.2724509835243225,
"logps/chosen": -64.49711608886719,
"logps/ref_chosen": -54.49748611450195,
"logps/ref_rejected": -70.4237289428711,
"logps/rejected": -92.49717712402344,
"loss": 0.769,
"margin_dpo/margin_mean": 12.073814392089844,
"margin_dpo/margin_std": 11.994571685791016,
"step": 279
},
{
"epoch": 0.42328042328042326,
"grad_norm": 24.20424461364746,
"learning_rate": 3.586410864126781e-07,
"logits/chosen": 0.33959245681762695,
"logits/rejected": 0.2953893542289734,
"logps/chosen": -71.14170837402344,
"logps/ref_chosen": -60.43281173706055,
"logps/ref_rejected": -78.39051818847656,
"logps/rejected": -101.32532501220703,
"loss": 0.724,
"margin_dpo/margin_mean": 12.225910186767578,
"margin_dpo/margin_std": 11.65771484375,
"step": 280
},
{
"epoch": 0.42479213907785335,
"grad_norm": 26.259754180908203,
"learning_rate": 3.574487280222929e-07,
"logits/chosen": 0.32685786485671997,
"logits/rejected": 0.33631807565689087,
"logps/chosen": -72.1130142211914,
"logps/ref_chosen": -60.2820930480957,
"logps/ref_rejected": -62.04009246826172,
"logps/rejected": -84.68453979492188,
"loss": 0.8761,
"margin_dpo/margin_mean": 10.813521385192871,
"margin_dpo/margin_std": 12.737310409545898,
"step": 281
},
{
"epoch": 0.42630385487528344,
"grad_norm": 31.841053009033203,
"learning_rate": 3.562533640600075e-07,
"logits/chosen": 0.2697226107120514,
"logits/rejected": 0.21670247614383698,
"logps/chosen": -73.82437133789062,
"logps/ref_chosen": -60.623924255371094,
"logps/ref_rejected": -68.67400360107422,
"logps/rejected": -92.24623107910156,
"loss": 0.9296,
"margin_dpo/margin_mean": 10.371776580810547,
"margin_dpo/margin_std": 13.209760665893555,
"step": 282
},
{
"epoch": 0.42781557067271353,
"grad_norm": 37.73383712768555,
"learning_rate": 3.550550279627215e-07,
"logits/chosen": 0.33592864871025085,
"logits/rejected": 0.24044758081436157,
"logps/chosen": -81.87095642089844,
"logps/ref_chosen": -67.64775085449219,
"logps/ref_rejected": -99.96835327148438,
"logps/rejected": -122.95320129394531,
"loss": 1.0706,
"margin_dpo/margin_mean": 8.761629104614258,
"margin_dpo/margin_std": 14.105447769165039,
"step": 283
},
{
"epoch": 0.4293272864701436,
"grad_norm": 28.0395450592041,
"learning_rate": 3.5385375325047163e-07,
"logits/chosen": 0.3895118832588196,
"logits/rejected": 0.325203537940979,
"logps/chosen": -70.05673217773438,
"logps/ref_chosen": -56.967430114746094,
"logps/ref_rejected": -86.36236572265625,
"logps/rejected": -109.27890014648438,
"loss": 0.9022,
"margin_dpo/margin_mean": 9.827235221862793,
"margin_dpo/margin_std": 12.691057205200195,
"step": 284
},
{
"epoch": 0.4308390022675737,
"grad_norm": 34.68986129760742,
"learning_rate": 3.5264957352549375e-07,
"logits/chosen": 0.39354732632637024,
"logits/rejected": 0.3635793924331665,
"logps/chosen": -87.55171203613281,
"logps/ref_chosen": -71.65611267089844,
"logps/ref_rejected": -81.63829803466797,
"logps/rejected": -105.88143920898438,
"loss": 1.0097,
"margin_dpo/margin_mean": 8.347532272338867,
"margin_dpo/margin_std": 12.33486557006836,
"step": 285
},
{
"epoch": 0.4323507180650038,
"grad_norm": 27.683141708374023,
"learning_rate": 3.514425224712835e-07,
"logits/chosen": 0.3177586793899536,
"logits/rejected": 0.22109857201576233,
"logps/chosen": -75.74806213378906,
"logps/ref_chosen": -61.07952117919922,
"logps/ref_rejected": -91.28128051757812,
"logps/rejected": -119.27302551269531,
"loss": 0.7605,
"margin_dpo/margin_mean": 13.323205947875977,
"margin_dpo/margin_std": 13.627128601074219,
"step": 286
},
{
"epoch": 0.43386243386243384,
"grad_norm": 28.568553924560547,
"learning_rate": 3.502326338516534e-07,
"logits/chosen": 0.36465156078338623,
"logits/rejected": 0.3220062255859375,
"logps/chosen": -58.37489318847656,
"logps/ref_chosen": -46.035789489746094,
"logps/ref_rejected": -59.95293426513672,
"logps/rejected": -85.12361907958984,
"loss": 0.7988,
"margin_dpo/margin_mean": 12.83158016204834,
"margin_dpo/margin_std": 14.233297348022461,
"step": 287
},
{
"epoch": 0.43537414965986393,
"grad_norm": 38.184391021728516,
"learning_rate": 3.490199415097892e-07,
"logits/chosen": 0.2519870400428772,
"logits/rejected": 0.1951802521944046,
"logps/chosen": -81.74073791503906,
"logps/ref_chosen": -65.3908462524414,
"logps/ref_rejected": -88.53607177734375,
"logps/rejected": -113.48297119140625,
"loss": 1.0718,
"margin_dpo/margin_mean": 8.597015380859375,
"margin_dpo/margin_std": 14.126516342163086,
"step": 288
},
{
"epoch": 0.436885865457294,
"grad_norm": 31.642969131469727,
"learning_rate": 3.4780447936730247e-07,
"logits/chosen": 0.42887574434280396,
"logits/rejected": 0.38934090733528137,
"logps/chosen": -71.02117156982422,
"logps/ref_chosen": -54.5936279296875,
"logps/ref_rejected": -67.20855712890625,
"logps/rejected": -92.87224578857422,
"loss": 1.0588,
"margin_dpo/margin_mean": 9.236154556274414,
"margin_dpo/margin_std": 14.629898071289062,
"step": 289
},
{
"epoch": 0.4383975812547241,
"grad_norm": 38.783477783203125,
"learning_rate": 3.465862814232821e-07,
"logits/chosen": 0.42857229709625244,
"logits/rejected": 0.3584096431732178,
"logps/chosen": -79.49140930175781,
"logps/ref_chosen": -61.38457489013672,
"logps/ref_rejected": -91.92778015136719,
"logps/rejected": -121.74188232421875,
"loss": 0.9043,
"margin_dpo/margin_mean": 11.707275390625,
"margin_dpo/margin_std": 14.761146545410156,
"step": 290
},
{
"epoch": 0.4399092970521542,
"grad_norm": 33.59368133544922,
"learning_rate": 3.4536538175334343e-07,
"logits/chosen": 0.5070397853851318,
"logits/rejected": 0.4380100965499878,
"logps/chosen": -67.4765853881836,
"logps/ref_chosen": -50.863037109375,
"logps/ref_rejected": -82.20868682861328,
"logps/rejected": -110.57408142089844,
"loss": 0.9453,
"margin_dpo/margin_mean": 11.751852035522461,
"margin_dpo/margin_std": 15.614330291748047,
"step": 291
},
{
"epoch": 0.4414210128495843,
"grad_norm": 37.523006439208984,
"learning_rate": 3.4414181450867465e-07,
"logits/chosen": 0.3803209662437439,
"logits/rejected": 0.32875359058380127,
"logps/chosen": -80.26577758789062,
"logps/ref_chosen": -64.34888458251953,
"logps/ref_rejected": -72.86434936523438,
"logps/rejected": -98.49430847167969,
"loss": 0.9875,
"margin_dpo/margin_mean": 9.71307373046875,
"margin_dpo/margin_std": 14.086446762084961,
"step": 292
},
{
"epoch": 0.4429327286470144,
"grad_norm": 25.93706512451172,
"learning_rate": 3.4291561391508185e-07,
"logits/chosen": 0.4410402476787567,
"logits/rejected": 0.3540686368942261,
"logps/chosen": -72.45366668701172,
"logps/ref_chosen": -54.86946487426758,
"logps/ref_rejected": -81.858642578125,
"logps/rejected": -112.58206176757812,
"loss": 0.8776,
"margin_dpo/margin_mean": 13.139215469360352,
"margin_dpo/margin_std": 15.717092514038086,
"step": 293
},
{
"epoch": 0.4444444444444444,
"grad_norm": 26.823040008544922,
"learning_rate": 3.4168681427203153e-07,
"logits/chosen": 0.3839040994644165,
"logits/rejected": 0.3379971981048584,
"logps/chosen": -72.74606323242188,
"logps/ref_chosen": -56.6708984375,
"logps/ref_rejected": -70.32819366455078,
"logps/rejected": -97.80169677734375,
"loss": 0.865,
"margin_dpo/margin_mean": 11.398344039916992,
"margin_dpo/margin_std": 14.199283599853516,
"step": 294
},
{
"epoch": 0.4459561602418745,
"grad_norm": 34.710105895996094,
"learning_rate": 3.4045544995169125e-07,
"logits/chosen": 0.3906914293766022,
"logits/rejected": 0.2945278286933899,
"logps/chosen": -68.47222900390625,
"logps/ref_chosen": -50.40088653564453,
"logps/ref_rejected": -83.43521881103516,
"logps/rejected": -110.99464416503906,
"loss": 1.0035,
"margin_dpo/margin_mean": 9.488086700439453,
"margin_dpo/margin_std": 14.339176177978516,
"step": 295
},
{
"epoch": 0.4474678760393046,
"grad_norm": 36.2674560546875,
"learning_rate": 3.392215553979679e-07,
"logits/chosen": 0.3322068750858307,
"logits/rejected": 0.2882440686225891,
"logps/chosen": -87.16343688964844,
"logps/ref_chosen": -69.15034484863281,
"logps/ref_rejected": -89.60166931152344,
"logps/rejected": -119.3226318359375,
"loss": 0.8998,
"margin_dpo/margin_mean": 11.70787239074707,
"margin_dpo/margin_std": 14.860249519348145,
"step": 296
},
{
"epoch": 0.4489795918367347,
"grad_norm": 28.834030151367188,
"learning_rate": 3.3798516512554485e-07,
"logits/chosen": 0.36532461643218994,
"logits/rejected": 0.30405259132385254,
"logps/chosen": -77.24577331542969,
"logps/ref_chosen": -58.01630401611328,
"logps/ref_rejected": -69.95780944824219,
"logps/rejected": -101.18505859375,
"loss": 0.8016,
"margin_dpo/margin_mean": 11.997785568237305,
"margin_dpo/margin_std": 12.494903564453125,
"step": 297
},
{
"epoch": 0.4504913076341648,
"grad_norm": 32.918025970458984,
"learning_rate": 3.367463137189156e-07,
"logits/chosen": 0.5060819387435913,
"logits/rejected": 0.44333118200302124,
"logps/chosen": -75.56437683105469,
"logps/ref_chosen": -56.1693115234375,
"logps/ref_rejected": -68.55052185058594,
"logps/rejected": -97.68669891357422,
"loss": 1.0316,
"margin_dpo/margin_mean": 9.741113662719727,
"margin_dpo/margin_std": 14.936502456665039,
"step": 298
},
{
"epoch": 0.4520030234315949,
"grad_norm": 34.70051574707031,
"learning_rate": 3.355050358314172e-07,
"logits/chosen": 0.27476340532302856,
"logits/rejected": 0.24570544064044952,
"logps/chosen": -80.7921142578125,
"logps/ref_chosen": -62.31780242919922,
"logps/ref_rejected": -72.60028839111328,
"logps/rejected": -100.0041732788086,
"loss": 1.1003,
"margin_dpo/margin_mean": 8.929572105407715,
"margin_dpo/margin_std": 15.097801208496094,
"step": 299
},
{
"epoch": 0.45351473922902497,
"grad_norm": 33.734275817871094,
"learning_rate": 3.3426136618426043e-07,
"logits/chosen": 0.3949470818042755,
"logits/rejected": 0.3290703594684601,
"logps/chosen": -80.29096984863281,
"logps/ref_chosen": -60.38157653808594,
"logps/ref_rejected": -75.45442199707031,
"logps/rejected": -105.5843734741211,
"loss": 0.9692,
"margin_dpo/margin_mean": 10.220561027526855,
"margin_dpo/margin_std": 13.958076477050781,
"step": 300
},
{
"epoch": 0.45351473922902497,
"eval_logits/chosen": 0.4017273187637329,
"eval_logits/rejected": 0.35065409541130066,
"eval_logps/chosen": -93.17941284179688,
"eval_logps/ref_chosen": -74.85946655273438,
"eval_logps/ref_rejected": -79.54898834228516,
"eval_logps/rejected": -107.24836730957031,
"eval_loss": 0.5345103144645691,
"eval_margin_dpo/margin_mean": 9.37943172454834,
"eval_margin_dpo/margin_std": 14.973759651184082,
"eval_runtime": 38.6788,
"eval_samples_per_second": 59.542,
"eval_steps_per_second": 1.861,
"step": 300
},
{
"epoch": 0.455026455026455,
"grad_norm": 31.209848403930664,
"learning_rate": 3.3301533956555885e-07,
"logits/chosen": 0.42101189494132996,
"logits/rejected": 0.38934606313705444,
"logps/chosen": -71.96283721923828,
"logps/ref_chosen": -52.85089111328125,
"logps/ref_rejected": -69.97584533691406,
"logps/rejected": -98.6317138671875,
"loss": 1.0621,
"margin_dpo/margin_mean": 9.543924331665039,
"margin_dpo/margin_std": 15.364656448364258,
"step": 301
},
{
"epoch": 0.4565381708238851,
"grad_norm": 37.35209274291992,
"learning_rate": 3.317669908293554e-07,
"logits/chosen": 0.2666228413581848,
"logits/rejected": 0.21263065934181213,
"logps/chosen": -88.26080322265625,
"logps/ref_chosen": -66.96651458740234,
"logps/ref_rejected": -88.0951156616211,
"logps/rejected": -115.92124938964844,
"loss": 1.1827,
"margin_dpo/margin_mean": 6.531844139099121,
"margin_dpo/margin_std": 13.561601638793945,
"step": 302
},
{
"epoch": 0.4580498866213152,
"grad_norm": 28.165313720703125,
"learning_rate": 3.3051635489464793e-07,
"logits/chosen": 0.3595314621925354,
"logits/rejected": 0.2943815290927887,
"logps/chosen": -80.27766418457031,
"logps/ref_chosen": -62.12152862548828,
"logps/ref_rejected": -90.31204223632812,
"logps/rejected": -121.2498779296875,
"loss": 0.9179,
"margin_dpo/margin_mean": 12.781694412231445,
"margin_dpo/margin_std": 16.26740264892578,
"step": 303
},
{
"epoch": 0.4595616024187453,
"grad_norm": 28.933210372924805,
"learning_rate": 3.292634667444117e-07,
"logits/chosen": 0.38219448924064636,
"logits/rejected": 0.3254969120025635,
"logps/chosen": -78.41160583496094,
"logps/ref_chosen": -60.69508361816406,
"logps/ref_rejected": -78.25254821777344,
"logps/rejected": -108.52848815917969,
"loss": 0.7588,
"margin_dpo/margin_mean": 12.559429168701172,
"margin_dpo/margin_std": 12.42584228515625,
"step": 304
},
{
"epoch": 0.46107331821617537,
"grad_norm": 42.87114334106445,
"learning_rate": 3.280083614246217e-07,
"logits/chosen": 0.32686659693717957,
"logits/rejected": 0.3448963165283203,
"logps/chosen": -93.79513549804688,
"logps/ref_chosen": -72.69914245605469,
"logps/ref_rejected": -65.65670776367188,
"logps/rejected": -96.12918090820312,
"loss": 1.0819,
"margin_dpo/margin_mean": 9.376477241516113,
"margin_dpo/margin_std": 15.228221893310547,
"step": 305
},
{
"epoch": 0.46258503401360546,
"grad_norm": 28.915489196777344,
"learning_rate": 3.267510740432719e-07,
"logits/chosen": 0.4310336410999298,
"logits/rejected": 0.31829434633255005,
"logps/chosen": -73.72835540771484,
"logps/ref_chosen": -53.97052764892578,
"logps/ref_rejected": -71.02423095703125,
"logps/rejected": -99.50276184082031,
"loss": 0.9685,
"margin_dpo/margin_mean": 8.720699310302734,
"margin_dpo/margin_std": 11.995311737060547,
"step": 306
},
{
"epoch": 0.46409674981103555,
"grad_norm": 35.531272888183594,
"learning_rate": 3.2549163976939285e-07,
"logits/chosen": 0.4187588095664978,
"logits/rejected": 0.37263208627700806,
"logps/chosen": -74.16632080078125,
"logps/ref_chosen": -57.413108825683594,
"logps/ref_rejected": -68.68011474609375,
"logps/rejected": -92.7537612915039,
"loss": 1.2241,
"margin_dpo/margin_mean": 7.320440292358398,
"margin_dpo/margin_std": 15.137401580810547,
"step": 307
},
{
"epoch": 0.4656084656084656,
"grad_norm": 30.61981773376465,
"learning_rate": 3.2423009383206874e-07,
"logits/chosen": 0.36663228273391724,
"logits/rejected": 0.34980136156082153,
"logps/chosen": -84.69308471679688,
"logps/ref_chosen": -66.59878540039062,
"logps/ref_rejected": -74.337158203125,
"logps/rejected": -101.78118133544922,
"loss": 1.0013,
"margin_dpo/margin_mean": 9.3497314453125,
"margin_dpo/margin_std": 13.311956405639648,
"step": 308
},
{
"epoch": 0.4671201814058957,
"grad_norm": 42.480690002441406,
"learning_rate": 3.229664715194511e-07,
"logits/chosen": 0.4416660666465759,
"logits/rejected": 0.3822864294052124,
"logps/chosen": -86.4417724609375,
"logps/ref_chosen": -65.39474487304688,
"logps/ref_rejected": -75.70930480957031,
"logps/rejected": -105.63099670410156,
"loss": 1.0418,
"margin_dpo/margin_mean": 8.874654769897461,
"margin_dpo/margin_std": 13.672810554504395,
"step": 309
},
{
"epoch": 0.46863189720332576,
"grad_norm": 38.71232223510742,
"learning_rate": 3.2170080817777257e-07,
"logits/chosen": 0.42346304655075073,
"logits/rejected": 0.4100581407546997,
"logps/chosen": -95.80581665039062,
"logps/ref_chosen": -74.66827392578125,
"logps/ref_rejected": -80.5689697265625,
"logps/rejected": -107.26795959472656,
"loss": 1.2398,
"margin_dpo/margin_mean": 5.561444282531738,
"margin_dpo/margin_std": 13.021998405456543,
"step": 310
},
{
"epoch": 0.47014361300075586,
"grad_norm": 32.81990432739258,
"learning_rate": 3.204331392103574e-07,
"logits/chosen": 0.3420078754425049,
"logits/rejected": 0.21063414216041565,
"logps/chosen": -76.67539978027344,
"logps/ref_chosen": -59.73802947998047,
"logps/ref_rejected": -93.60757446289062,
"logps/rejected": -119.02238464355469,
"loss": 1.072,
"margin_dpo/margin_mean": 8.477436065673828,
"margin_dpo/margin_std": 14.32551383972168,
"step": 311
},
{
"epoch": 0.47165532879818595,
"grad_norm": 26.659669876098633,
"learning_rate": 3.1916350007663176e-07,
"logits/chosen": 0.38971781730651855,
"logits/rejected": 0.2917502522468567,
"logps/chosen": -71.44837951660156,
"logps/ref_chosen": -53.816436767578125,
"logps/ref_rejected": -68.6575698852539,
"logps/rejected": -97.73336791992188,
"loss": 0.8242,
"margin_dpo/margin_mean": 11.443855285644531,
"margin_dpo/margin_std": 12.68366813659668,
"step": 312
},
{
"epoch": 0.47316704459561604,
"grad_norm": 34.83692932128906,
"learning_rate": 3.178919262911314e-07,
"logits/chosen": 0.45790788531303406,
"logits/rejected": 0.43460702896118164,
"logps/chosen": -77.33392333984375,
"logps/ref_chosen": -59.957359313964844,
"logps/ref_rejected": -69.31729888916016,
"logps/rejected": -94.20777893066406,
"loss": 1.158,
"margin_dpo/margin_mean": 7.513920783996582,
"margin_dpo/margin_std": 14.714834213256836,
"step": 313
},
{
"epoch": 0.47467876039304613,
"grad_norm": 26.88656234741211,
"learning_rate": 3.166184534225087e-07,
"logits/chosen": 0.3927513360977173,
"logits/rejected": 0.41145119071006775,
"logps/chosen": -87.16189575195312,
"logps/ref_chosen": -70.26815795898438,
"logps/ref_rejected": -69.23971557617188,
"logps/rejected": -97.7408447265625,
"loss": 0.8053,
"margin_dpo/margin_mean": 11.607396125793457,
"margin_dpo/margin_std": 12.333427429199219,
"step": 314
},
{
"epoch": 0.47619047619047616,
"grad_norm": 28.846769332885742,
"learning_rate": 3.1534311709253723e-07,
"logits/chosen": 0.3296147286891937,
"logits/rejected": 0.28625988960266113,
"logps/chosen": -85.16159057617188,
"logps/ref_chosen": -67.79469299316406,
"logps/ref_rejected": -74.55148315429688,
"logps/rejected": -101.44076538085938,
"loss": 0.9432,
"margin_dpo/margin_mean": 9.52238655090332,
"margin_dpo/margin_std": 11.987009048461914,
"step": 315
},
{
"epoch": 0.47770219198790626,
"grad_norm": 29.60664939880371,
"learning_rate": 3.1406595297511564e-07,
"logits/chosen": 0.3064402639865875,
"logits/rejected": 0.18259896337985992,
"logps/chosen": -71.98179626464844,
"logps/ref_chosen": -55.288482666015625,
"logps/ref_rejected": -96.15723419189453,
"logps/rejected": -123.8666000366211,
"loss": 0.8814,
"margin_dpo/margin_mean": 11.016056060791016,
"margin_dpo/margin_std": 12.859323501586914,
"step": 316
},
{
"epoch": 0.47921390778533635,
"grad_norm": 27.596532821655273,
"learning_rate": 3.1278699679526975e-07,
"logits/chosen": 0.4423693120479584,
"logits/rejected": 0.3917258381843567,
"logps/chosen": -68.88323974609375,
"logps/ref_chosen": -54.58137512207031,
"logps/ref_rejected": -72.77232360839844,
"logps/rejected": -100.27496337890625,
"loss": 0.7309,
"margin_dpo/margin_mean": 13.200772285461426,
"margin_dpo/margin_std": 12.760543823242188,
"step": 317
},
{
"epoch": 0.48072562358276644,
"grad_norm": 37.705631256103516,
"learning_rate": 3.1150628432815336e-07,
"logits/chosen": 0.42552927136421204,
"logits/rejected": 0.35953375697135925,
"logps/chosen": -70.18506622314453,
"logps/ref_chosen": -52.88822937011719,
"logps/ref_rejected": -80.63988494873047,
"logps/rejected": -105.99900817871094,
"loss": 1.2682,
"margin_dpo/margin_mean": 8.06229019165039,
"margin_dpo/margin_std": 16.971851348876953,
"step": 318
},
{
"epoch": 0.48223733938019653,
"grad_norm": 29.031652450561523,
"learning_rate": 3.1022385139804707e-07,
"logits/chosen": 0.3563545346260071,
"logits/rejected": 0.33486121892929077,
"logps/chosen": -79.50239562988281,
"logps/ref_chosen": -64.36333465576172,
"logps/ref_rejected": -79.47296142578125,
"logps/rejected": -106.15999603271484,
"loss": 0.8888,
"margin_dpo/margin_mean": 11.5479736328125,
"margin_dpo/margin_std": 14.676193237304688,
"step": 319
},
{
"epoch": 0.4837490551776266,
"grad_norm": 32.68266296386719,
"learning_rate": 3.0893973387735683e-07,
"logits/chosen": 0.2867361307144165,
"logits/rejected": 0.24268287420272827,
"logps/chosen": -64.48707580566406,
"logps/ref_chosen": -49.558746337890625,
"logps/ref_rejected": -71.23444366455078,
"logps/rejected": -94.714599609375,
"loss": 1.1087,
"margin_dpo/margin_mean": 8.551826477050781,
"margin_dpo/margin_std": 14.7891206741333,
"step": 320
},
{
"epoch": 0.4852607709750567,
"grad_norm": 29.052352905273438,
"learning_rate": 3.0765396768561004e-07,
"logits/chosen": 0.3400368392467499,
"logits/rejected": 0.3190291225910187,
"logps/chosen": -68.13433837890625,
"logps/ref_chosen": -52.085269927978516,
"logps/ref_rejected": -55.58674621582031,
"logps/rejected": -80.85602569580078,
"loss": 1.0443,
"margin_dpo/margin_mean": 9.220213890075684,
"margin_dpo/margin_std": 13.879295349121094,
"step": 321
},
{
"epoch": 0.48677248677248675,
"grad_norm": 26.432174682617188,
"learning_rate": 3.063665887884511e-07,
"logits/chosen": 0.4483844041824341,
"logits/rejected": 0.36465245485305786,
"logps/chosen": -63.66038513183594,
"logps/ref_chosen": -47.404109954833984,
"logps/ref_rejected": -73.4260025024414,
"logps/rejected": -102.2420654296875,
"loss": 0.7555,
"margin_dpo/margin_mean": 12.559789657592773,
"margin_dpo/margin_std": 12.248069763183594,
"step": 322
},
{
"epoch": 0.48828420256991684,
"grad_norm": 36.914634704589844,
"learning_rate": 3.0507763319663517e-07,
"logits/chosen": 0.32914960384368896,
"logits/rejected": 0.25222885608673096,
"logps/chosen": -86.92568969726562,
"logps/ref_chosen": -70.00630187988281,
"logps/ref_rejected": -86.96690368652344,
"logps/rejected": -112.45706176757812,
"loss": 1.1786,
"margin_dpo/margin_mean": 8.570770263671875,
"margin_dpo/margin_std": 16.58449363708496,
"step": 323
},
{
"epoch": 0.4897959183673469,
"grad_norm": 28.488908767700195,
"learning_rate": 3.0378713696502097e-07,
"logits/chosen": 0.4177021384239197,
"logits/rejected": 0.35851603746414185,
"logps/chosen": -69.78921508789062,
"logps/ref_chosen": -55.88882064819336,
"logps/ref_rejected": -75.23088073730469,
"logps/rejected": -99.71137237548828,
"loss": 0.8159,
"margin_dpo/margin_mean": 10.580099105834961,
"margin_dpo/margin_std": 11.190553665161133,
"step": 324
},
{
"epoch": 0.491307634164777,
"grad_norm": 31.288358688354492,
"learning_rate": 3.0249513619156206e-07,
"logits/chosen": 0.3787188231945038,
"logits/rejected": 0.3136671781539917,
"logps/chosen": -81.69401550292969,
"logps/ref_chosen": -64.14701843261719,
"logps/ref_rejected": -79.91143035888672,
"logps/rejected": -107.18046569824219,
"loss": 0.9959,
"margin_dpo/margin_mean": 9.72203254699707,
"margin_dpo/margin_std": 14.314732551574707,
"step": 325
},
{
"epoch": 0.4928193499622071,
"grad_norm": 42.53952407836914,
"learning_rate": 3.012016670162977e-07,
"logits/chosen": 0.32631510496139526,
"logits/rejected": 0.32735714316368103,
"logps/chosen": -95.90467834472656,
"logps/ref_chosen": -75.53131103515625,
"logps/ref_rejected": -76.5898666381836,
"logps/rejected": -102.08723449707031,
"loss": 1.2879,
"margin_dpo/margin_mean": 5.123997688293457,
"margin_dpo/margin_std": 12.83140754699707,
"step": 326
},
{
"epoch": 0.4943310657596372,
"grad_norm": 34.51028060913086,
"learning_rate": 2.99906765620341e-07,
"logits/chosen": 0.2692107856273651,
"logits/rejected": 0.22894282639026642,
"logps/chosen": -87.27745056152344,
"logps/ref_chosen": -69.337158203125,
"logps/ref_rejected": -73.37751770019531,
"logps/rejected": -99.87676239013672,
"loss": 1.1168,
"margin_dpo/margin_mean": 8.558960914611816,
"margin_dpo/margin_std": 15.341290473937988,
"step": 327
},
{
"epoch": 0.4958427815570673,
"grad_norm": 29.792686462402344,
"learning_rate": 2.9861046822486766e-07,
"logits/chosen": 0.3397352695465088,
"logits/rejected": 0.3053368926048279,
"logps/chosen": -76.30783081054688,
"logps/ref_chosen": -61.70623016357422,
"logps/ref_rejected": -83.73808288574219,
"logps/rejected": -107.90557861328125,
"loss": 0.8958,
"margin_dpo/margin_mean": 9.565893173217773,
"margin_dpo/margin_std": 11.804071426391602,
"step": 328
},
{
"epoch": 0.4973544973544973,
"grad_norm": 35.93716049194336,
"learning_rate": 2.9731281109010253e-07,
"logits/chosen": 0.41245976090431213,
"logits/rejected": 0.3557360768318176,
"logps/chosen": -82.2403564453125,
"logps/ref_chosen": -64.4984130859375,
"logps/ref_rejected": -83.6591796875,
"logps/rejected": -111.41017150878906,
"loss": 0.9671,
"margin_dpo/margin_mean": 10.009061813354492,
"margin_dpo/margin_std": 14.135910987854004,
"step": 329
},
{
"epoch": 0.4988662131519274,
"grad_norm": 29.736265182495117,
"learning_rate": 2.9601383051430505e-07,
"logits/chosen": 0.36417415738105774,
"logits/rejected": 0.2935639023780823,
"logps/chosen": -69.07024383544922,
"logps/ref_chosen": -54.80464172363281,
"logps/ref_rejected": -75.31942749023438,
"logps/rejected": -99.49536895751953,
"loss": 1.0317,
"margin_dpo/margin_mean": 9.9103364944458,
"margin_dpo/margin_std": 14.863494873046875,
"step": 330
},
{
"epoch": 0.5003779289493575,
"grad_norm": 27.832918167114258,
"learning_rate": 2.947135628327544e-07,
"logits/chosen": 0.46214789152145386,
"logits/rejected": 0.42954060435295105,
"logps/chosen": -74.25878143310547,
"logps/ref_chosen": -59.242576599121094,
"logps/ref_rejected": -69.87483215332031,
"logps/rejected": -98.79541778564453,
"loss": 0.8271,
"margin_dpo/margin_mean": 13.904380798339844,
"margin_dpo/margin_std": 16.01378631591797,
"step": 331
},
{
"epoch": 0.5018896447467877,
"grad_norm": 32.08829116821289,
"learning_rate": 2.934120444167326e-07,
"logits/chosen": 0.32679808139801025,
"logits/rejected": 0.2778639793395996,
"logps/chosen": -83.60955810546875,
"logps/ref_chosen": -67.10975646972656,
"logps/ref_rejected": -77.11839294433594,
"logps/rejected": -103.84635925292969,
"loss": 0.9184,
"margin_dpo/margin_mean": 10.228164672851562,
"margin_dpo/margin_std": 13.449007034301758,
"step": 332
},
{
"epoch": 0.5034013605442177,
"grad_norm": 26.578086853027344,
"learning_rate": 2.921093116725076e-07,
"logits/chosen": 0.3908570408821106,
"logits/rejected": 0.3197840452194214,
"logps/chosen": -74.55105590820312,
"logps/ref_chosen": -58.381126403808594,
"logps/ref_rejected": -85.02839660644531,
"logps/rejected": -114.37781524658203,
"loss": 0.7668,
"margin_dpo/margin_mean": 13.17949104309082,
"margin_dpo/margin_std": 14.035030364990234,
"step": 333
},
{
"epoch": 0.5049130763416477,
"grad_norm": 37.4295539855957,
"learning_rate": 2.9080540104031484e-07,
"logits/chosen": 0.4013047218322754,
"logits/rejected": 0.35357779264450073,
"logps/chosen": -83.73133850097656,
"logps/ref_chosen": -66.89199829101562,
"logps/ref_rejected": -91.83695220947266,
"logps/rejected": -117.2813491821289,
"loss": 1.1505,
"margin_dpo/margin_mean": 8.60505485534668,
"margin_dpo/margin_std": 15.63020133972168,
"step": 334
},
{
"epoch": 0.5064247921390779,
"grad_norm": 31.525114059448242,
"learning_rate": 2.895003489933375e-07,
"logits/chosen": 0.4016830623149872,
"logits/rejected": 0.3621571660041809,
"logps/chosen": -77.64753723144531,
"logps/ref_chosen": -61.51445770263672,
"logps/ref_rejected": -75.68916320800781,
"logps/rejected": -102.18275451660156,
"loss": 0.9979,
"margin_dpo/margin_mean": 10.360504150390625,
"margin_dpo/margin_std": 15.029167175292969,
"step": 335
},
{
"epoch": 0.5079365079365079,
"grad_norm": 37.780757904052734,
"learning_rate": 2.8819419203668675e-07,
"logits/chosen": 0.3117416501045227,
"logits/rejected": 0.2896695137023926,
"logps/chosen": -89.01457214355469,
"logps/ref_chosen": -68.85006713867188,
"logps/ref_rejected": -92.99603271484375,
"logps/rejected": -122.92210388183594,
"loss": 1.0539,
"margin_dpo/margin_mean": 9.76156997680664,
"margin_dpo/margin_std": 16.293773651123047,
"step": 336
},
{
"epoch": 0.509448223733938,
"grad_norm": 34.2430419921875,
"learning_rate": 2.8688696670638053e-07,
"logits/chosen": 0.2840282917022705,
"logits/rejected": 0.24821636080741882,
"logps/chosen": -92.56698608398438,
"logps/ref_chosen": -73.18783569335938,
"logps/ref_rejected": -86.89118957519531,
"logps/rejected": -114.37174987792969,
"loss": 1.0807,
"margin_dpo/margin_mean": 8.101402282714844,
"margin_dpo/margin_std": 13.815885543823242,
"step": 337
},
{
"epoch": 0.5109599395313681,
"grad_norm": 33.25615692138672,
"learning_rate": 2.8557870956832133e-07,
"logits/chosen": 0.33388757705688477,
"logits/rejected": 0.3053157329559326,
"logps/chosen": -83.34864044189453,
"logps/ref_chosen": -63.939613342285156,
"logps/ref_rejected": -75.34243774414062,
"logps/rejected": -103.55175018310547,
"loss": 1.0324,
"margin_dpo/margin_mean": 8.800281524658203,
"margin_dpo/margin_std": 13.543548583984375,
"step": 338
},
{
"epoch": 0.5124716553287982,
"grad_norm": 29.72187614440918,
"learning_rate": 2.842694572172736e-07,
"logits/chosen": 0.47476959228515625,
"logits/rejected": 0.3901880979537964,
"logps/chosen": -61.99988555908203,
"logps/ref_chosen": -45.54913330078125,
"logps/ref_rejected": -67.0482177734375,
"logps/rejected": -93.39913940429688,
"loss": 0.9283,
"margin_dpo/margin_mean": 9.900163650512695,
"margin_dpo/margin_std": 12.933232307434082,
"step": 339
},
{
"epoch": 0.5139833711262283,
"grad_norm": 33.46787643432617,
"learning_rate": 2.8295924627584004e-07,
"logits/chosen": 0.36157703399658203,
"logits/rejected": 0.3349456489086151,
"logps/chosen": -72.63715362548828,
"logps/ref_chosen": -54.00564956665039,
"logps/ref_rejected": -61.314430236816406,
"logps/rejected": -90.69908142089844,
"loss": 1.0838,
"margin_dpo/margin_mean": 10.753141403198242,
"margin_dpo/margin_std": 16.928892135620117,
"step": 340
},
{
"epoch": 0.5154950869236583,
"grad_norm": 36.91661834716797,
"learning_rate": 2.816481133934373e-07,
"logits/chosen": 0.3951025605201721,
"logits/rejected": 0.34404298663139343,
"logps/chosen": -80.79064178466797,
"logps/ref_chosen": -63.39509582519531,
"logps/ref_rejected": -76.20973205566406,
"logps/rejected": -104.360107421875,
"loss": 1.0046,
"margin_dpo/margin_mean": 10.754829406738281,
"margin_dpo/margin_std": 14.602158546447754,
"step": 341
},
{
"epoch": 0.5170068027210885,
"grad_norm": 25.358030319213867,
"learning_rate": 2.8033609524527046e-07,
"logits/chosen": 0.4110845625400543,
"logits/rejected": 0.3650810718536377,
"logps/chosen": -70.498779296875,
"logps/ref_chosen": -53.047813415527344,
"logps/ref_rejected": -68.2854232788086,
"logps/rejected": -98.16246795654297,
"loss": 0.8504,
"margin_dpo/margin_mean": 12.426074981689453,
"margin_dpo/margin_std": 15.092041015625,
"step": 342
},
{
"epoch": 0.5185185185185185,
"grad_norm": 32.1805534362793,
"learning_rate": 2.7902322853130753e-07,
"logits/chosen": 0.27944275736808777,
"logits/rejected": 0.2696824073791504,
"logps/chosen": -87.056640625,
"logps/ref_chosen": -70.57853698730469,
"logps/ref_rejected": -84.73873901367188,
"logps/rejected": -108.2984848022461,
"loss": 1.1287,
"margin_dpo/margin_mean": 7.081644058227539,
"margin_dpo/margin_std": 12.546673774719238,
"step": 343
},
{
"epoch": 0.5200302343159486,
"grad_norm": 33.30143737792969,
"learning_rate": 2.7770954997525274e-07,
"logits/chosen": 0.37141865491867065,
"logits/rejected": 0.3081679344177246,
"logps/chosen": -75.291015625,
"logps/ref_chosen": -55.811004638671875,
"logps/ref_rejected": -84.7763671875,
"logps/rejected": -114.64897918701172,
"loss": 0.9329,
"margin_dpo/margin_mean": 10.392600059509277,
"margin_dpo/margin_std": 13.607931137084961,
"step": 344
},
{
"epoch": 0.5215419501133787,
"grad_norm": 28.18509292602539,
"learning_rate": 2.7639509632351927e-07,
"logits/chosen": 0.461821585893631,
"logits/rejected": 0.41458916664123535,
"logps/chosen": -72.05027770996094,
"logps/ref_chosen": -57.786094665527344,
"logps/ref_rejected": -78.91847229003906,
"logps/rejected": -103.32426452636719,
"loss": 0.9874,
"margin_dpo/margin_mean": 10.14161491394043,
"margin_dpo/margin_std": 14.554746627807617,
"step": 345
},
{
"epoch": 0.5230536659108088,
"grad_norm": 31.265893936157227,
"learning_rate": 2.7507990434420123e-07,
"logits/chosen": 0.43421846628189087,
"logits/rejected": 0.3566897511482239,
"logps/chosen": -72.08039855957031,
"logps/ref_chosen": -56.285125732421875,
"logps/ref_rejected": -91.15303039550781,
"logps/rejected": -118.93693542480469,
"loss": 0.9291,
"margin_dpo/margin_mean": 11.988627433776855,
"margin_dpo/margin_std": 15.381285667419434,
"step": 346
},
{
"epoch": 0.5245653817082389,
"grad_norm": 31.849092483520508,
"learning_rate": 2.737640108260456e-07,
"logits/chosen": 0.49194493889808655,
"logits/rejected": 0.4413068890571594,
"logps/chosen": -71.65328979492188,
"logps/ref_chosen": -53.499542236328125,
"logps/ref_rejected": -72.52565002441406,
"logps/rejected": -99.69212341308594,
"loss": 1.0635,
"margin_dpo/margin_mean": 9.012718200683594,
"margin_dpo/margin_std": 15.008731842041016,
"step": 347
},
{
"epoch": 0.5260770975056689,
"grad_norm": 27.368101119995117,
"learning_rate": 2.724474525774229e-07,
"logits/chosen": 0.5060254335403442,
"logits/rejected": 0.47323817014694214,
"logps/chosen": -65.67312622070312,
"logps/ref_chosen": -50.78684997558594,
"logps/ref_rejected": -68.63732147216797,
"logps/rejected": -96.16216278076172,
"loss": 0.9281,
"margin_dpo/margin_mean": 12.63856315612793,
"margin_dpo/margin_std": 17.71712875366211,
"step": 348
},
{
"epoch": 0.527588813303099,
"grad_norm": 29.27608299255371,
"learning_rate": 2.711302664252973e-07,
"logits/chosen": 0.43929237127304077,
"logits/rejected": 0.34969234466552734,
"logps/chosen": -68.72640991210938,
"logps/ref_chosen": -53.32501220703125,
"logps/ref_rejected": -83.21235656738281,
"logps/rejected": -110.11225891113281,
"loss": 0.8832,
"margin_dpo/margin_mean": 11.49850082397461,
"margin_dpo/margin_std": 14.258995056152344,
"step": 349
},
{
"epoch": 0.5291005291005291,
"grad_norm": 32.149715423583984,
"learning_rate": 2.698124892141971e-07,
"logits/chosen": 0.3800434470176697,
"logits/rejected": 0.3004031777381897,
"logps/chosen": -78.58746337890625,
"logps/ref_chosen": -61.62577438354492,
"logps/ref_rejected": -87.63627624511719,
"logps/rejected": -118.50624084472656,
"loss": 0.7994,
"margin_dpo/margin_mean": 13.908271789550781,
"margin_dpo/margin_std": 15.544957160949707,
"step": 350
},
{
"epoch": 0.5306122448979592,
"grad_norm": 28.493574142456055,
"learning_rate": 2.6849415780518357e-07,
"logits/chosen": 0.33453550934791565,
"logits/rejected": 0.25856178998947144,
"logps/chosen": -71.84654235839844,
"logps/ref_chosen": -56.2563362121582,
"logps/ref_rejected": -79.11589813232422,
"logps/rejected": -105.61781311035156,
"loss": 1.0,
"margin_dpo/margin_mean": 10.911720275878906,
"margin_dpo/margin_std": 15.10361099243164,
"step": 351
},
{
"epoch": 0.5321239606953893,
"grad_norm": 31.185754776000977,
"learning_rate": 2.6717530907482024e-07,
"logits/chosen": 0.41737568378448486,
"logits/rejected": 0.36074936389923096,
"logps/chosen": -78.22322082519531,
"logps/ref_chosen": -63.05195617675781,
"logps/ref_rejected": -85.52035522460938,
"logps/rejected": -110.64308166503906,
"loss": 0.9394,
"margin_dpo/margin_mean": 9.951467514038086,
"margin_dpo/margin_std": 13.785094261169434,
"step": 352
},
{
"epoch": 0.5336356764928194,
"grad_norm": 27.732799530029297,
"learning_rate": 2.658559799141411e-07,
"logits/chosen": 0.3916947841644287,
"logits/rejected": 0.38925182819366455,
"logps/chosen": -82.88592529296875,
"logps/ref_chosen": -69.00918579101562,
"logps/ref_rejected": -72.65840148925781,
"logps/rejected": -97.80364990234375,
"loss": 0.9485,
"margin_dpo/margin_mean": 11.268506050109863,
"margin_dpo/margin_std": 15.41522216796875,
"step": 353
},
{
"epoch": 0.5351473922902494,
"grad_norm": 31.069217681884766,
"learning_rate": 2.6453620722761895e-07,
"logits/chosen": 0.4487117528915405,
"logits/rejected": 0.31808778643608093,
"logps/chosen": -54.345149993896484,
"logps/ref_chosen": -39.78833770751953,
"logps/ref_rejected": -69.56885528564453,
"logps/rejected": -96.2296371459961,
"loss": 0.8994,
"margin_dpo/margin_mean": 12.10396957397461,
"margin_dpo/margin_std": 15.011777877807617,
"step": 354
},
{
"epoch": 0.5366591080876795,
"grad_norm": 34.739498138427734,
"learning_rate": 2.632160279321328e-07,
"logits/chosen": 0.3989719748497009,
"logits/rejected": 0.2739609479904175,
"logps/chosen": -62.557098388671875,
"logps/ref_chosen": -46.25537872314453,
"logps/ref_rejected": -78.20236206054688,
"logps/rejected": -106.247802734375,
"loss": 0.9684,
"margin_dpo/margin_mean": 11.743728637695312,
"margin_dpo/margin_std": 15.869745254516602,
"step": 355
},
{
"epoch": 0.5381708238851096,
"grad_norm": 33.271121978759766,
"learning_rate": 2.618954789559356e-07,
"logits/chosen": 0.3909762501716614,
"logits/rejected": 0.31542685627937317,
"logps/chosen": -62.46385955810547,
"logps/ref_chosen": -47.906158447265625,
"logps/ref_rejected": -74.29397583007812,
"logps/rejected": -98.53050994873047,
"loss": 1.2069,
"margin_dpo/margin_mean": 9.678831100463867,
"margin_dpo/margin_std": 17.947500228881836,
"step": 356
},
{
"epoch": 0.5396825396825397,
"grad_norm": 36.11610412597656,
"learning_rate": 2.6057459723762076e-07,
"logits/chosen": 0.3990505039691925,
"logits/rejected": 0.3684351444244385,
"logps/chosen": -79.40770721435547,
"logps/ref_chosen": -62.63499450683594,
"logps/ref_rejected": -65.11400604248047,
"logps/rejected": -90.53868103027344,
"loss": 1.1123,
"margin_dpo/margin_mean": 8.651975631713867,
"margin_dpo/margin_std": 14.43873405456543,
"step": 357
},
{
"epoch": 0.5411942554799698,
"grad_norm": 29.094547271728516,
"learning_rate": 2.5925341972508954e-07,
"logits/chosen": 0.3368394374847412,
"logits/rejected": 0.3385453522205353,
"logps/chosen": -82.24810791015625,
"logps/ref_chosen": -67.20960998535156,
"logps/ref_rejected": -69.34715270996094,
"logps/rejected": -96.86444091796875,
"loss": 0.8898,
"margin_dpo/margin_mean": 12.47878646850586,
"margin_dpo/margin_std": 15.653668403625488,
"step": 358
},
{
"epoch": 0.5427059712773998,
"grad_norm": 36.99692153930664,
"learning_rate": 2.579319833745169e-07,
"logits/chosen": 0.3435346782207489,
"logits/rejected": 0.3147648572921753,
"logps/chosen": -79.03623962402344,
"logps/ref_chosen": -62.52578353881836,
"logps/ref_rejected": -76.63114929199219,
"logps/rejected": -100.06315612792969,
"loss": 1.1669,
"margin_dpo/margin_mean": 6.921560764312744,
"margin_dpo/margin_std": 12.897404670715332,
"step": 359
},
{
"epoch": 0.54421768707483,
"grad_norm": 32.149078369140625,
"learning_rate": 2.5661032514931834e-07,
"logits/chosen": 0.281266987323761,
"logits/rejected": 0.18974991142749786,
"logps/chosen": -79.66699981689453,
"logps/ref_chosen": -63.48772048950195,
"logps/ref_rejected": -90.6891098022461,
"logps/rejected": -116.84323120117188,
"loss": 0.964,
"margin_dpo/margin_mean": 9.97484016418457,
"margin_dpo/margin_std": 14.54484748840332,
"step": 360
},
{
"epoch": 0.54572940287226,
"grad_norm": 29.79141616821289,
"learning_rate": 2.552884820191154e-07,
"logits/chosen": 0.4189506769180298,
"logits/rejected": 0.3658691644668579,
"logps/chosen": -72.93891143798828,
"logps/ref_chosen": -57.917144775390625,
"logps/ref_rejected": -72.39089965820312,
"logps/rejected": -99.44670104980469,
"loss": 0.8476,
"margin_dpo/margin_mean": 12.034027099609375,
"margin_dpo/margin_std": 14.130279541015625,
"step": 361
},
{
"epoch": 0.54724111866969,
"grad_norm": 28.21626853942871,
"learning_rate": 2.53966490958702e-07,
"logits/chosen": 0.41983890533447266,
"logits/rejected": 0.31845760345458984,
"logps/chosen": -78.23361206054688,
"logps/ref_chosen": -63.4434700012207,
"logps/ref_rejected": -103.45516967773438,
"logps/rejected": -129.9938201904297,
"loss": 0.8654,
"margin_dpo/margin_mean": 11.748510360717773,
"margin_dpo/margin_std": 13.821972846984863,
"step": 362
},
{
"epoch": 0.5487528344671202,
"grad_norm": 26.874298095703125,
"learning_rate": 2.526443889470099e-07,
"logits/chosen": 0.4121060371398926,
"logits/rejected": 0.2874605059623718,
"logps/chosen": -65.51070404052734,
"logps/ref_chosen": -48.65182876586914,
"logps/ref_rejected": -88.65904235839844,
"logps/rejected": -117.357421875,
"loss": 0.8606,
"margin_dpo/margin_mean": 11.839500427246094,
"margin_dpo/margin_std": 14.284195899963379,
"step": 363
},
{
"epoch": 0.5502645502645502,
"grad_norm": 27.861297607421875,
"learning_rate": 2.513222129660744e-07,
"logits/chosen": 0.2716137170791626,
"logits/rejected": 0.17793412506580353,
"logps/chosen": -72.13548278808594,
"logps/ref_chosen": -57.87107467651367,
"logps/ref_rejected": -80.95502471923828,
"logps/rejected": -107.41427612304688,
"loss": 0.9838,
"margin_dpo/margin_mean": 12.194845199584961,
"margin_dpo/margin_std": 17.369735717773438,
"step": 364
},
{
"epoch": 0.5517762660619804,
"grad_norm": 24.756385803222656,
"learning_rate": 2.5e-07,
"logits/chosen": 0.3685954213142395,
"logits/rejected": 0.35689201951026917,
"logps/chosen": -77.160888671875,
"logps/ref_chosen": -64.94217681884766,
"logps/ref_rejected": -74.8599853515625,
"logps/rejected": -98.56990051269531,
"loss": 0.8058,
"margin_dpo/margin_mean": 11.4912109375,
"margin_dpo/margin_std": 12.382892608642578,
"step": 365
},
{
"epoch": 0.5532879818594104,
"grad_norm": 31.192968368530273,
"learning_rate": 2.486777870339255e-07,
"logits/chosen": 0.314043253660202,
"logits/rejected": 0.29208898544311523,
"logps/chosen": -66.73348999023438,
"logps/ref_chosen": -55.165985107421875,
"logps/ref_rejected": -65.2612075805664,
"logps/rejected": -86.15950012207031,
"loss": 1.0801,
"margin_dpo/margin_mean": 9.330782890319824,
"margin_dpo/margin_std": 15.033758163452148,
"step": 366
},
{
"epoch": 0.5547996976568406,
"grad_norm": 28.845701217651367,
"learning_rate": 2.4735561105299014e-07,
"logits/chosen": 0.33946436643600464,
"logits/rejected": 0.23690305650234222,
"logps/chosen": -70.849365234375,
"logps/ref_chosen": -56.010467529296875,
"logps/ref_rejected": -77.31010437011719,
"logps/rejected": -101.05453491210938,
"loss": 0.9823,
"margin_dpo/margin_mean": 8.905525207519531,
"margin_dpo/margin_std": 12.392328262329102,
"step": 367
},
{
"epoch": 0.5563114134542706,
"grad_norm": 31.565885543823242,
"learning_rate": 2.46033509041298e-07,
"logits/chosen": 0.21036341786384583,
"logits/rejected": 0.20005691051483154,
"logps/chosen": -90.7908935546875,
"logps/ref_chosen": -74.82928466796875,
"logps/ref_rejected": -76.11680603027344,
"logps/rejected": -100.37398529052734,
"loss": 1.0675,
"margin_dpo/margin_mean": 8.295562744140625,
"margin_dpo/margin_std": 13.27570915222168,
"step": 368
},
{
"epoch": 0.5578231292517006,
"grad_norm": 27.81777572631836,
"learning_rate": 2.447115179808846e-07,
"logits/chosen": 0.3587338328361511,
"logits/rejected": 0.3078988790512085,
"logps/chosen": -73.69477081298828,
"logps/ref_chosen": -58.32621765136719,
"logps/ref_rejected": -80.92184448242188,
"logps/rejected": -104.86062622070312,
"loss": 1.0359,
"margin_dpo/margin_mean": 8.570233345031738,
"margin_dpo/margin_std": 12.987571716308594,
"step": 369
},
{
"epoch": 0.5593348450491308,
"grad_norm": 31.120532989501953,
"learning_rate": 2.4338967485068164e-07,
"logits/chosen": 0.4456462264060974,
"logits/rejected": 0.37752196192741394,
"logps/chosen": -67.46039581298828,
"logps/ref_chosen": -52.88372039794922,
"logps/ref_rejected": -79.43692016601562,
"logps/rejected": -105.49575805664062,
"loss": 1.0011,
"margin_dpo/margin_mean": 11.482163429260254,
"margin_dpo/margin_std": 16.541217803955078,
"step": 370
},
{
"epoch": 0.5608465608465608,
"grad_norm": 28.67278480529785,
"learning_rate": 2.420680166254831e-07,
"logits/chosen": 0.46257519721984863,
"logits/rejected": 0.42657119035720825,
"logps/chosen": -63.5814094543457,
"logps/ref_chosen": -49.224212646484375,
"logps/ref_rejected": -63.348472595214844,
"logps/rejected": -88.49848937988281,
"loss": 0.9804,
"margin_dpo/margin_mean": 10.792819023132324,
"margin_dpo/margin_std": 14.90372085571289,
"step": 371
},
{
"epoch": 0.562358276643991,
"grad_norm": 35.98383331298828,
"learning_rate": 2.4074658027491044e-07,
"logits/chosen": 0.39039164781570435,
"logits/rejected": 0.30279237031936646,
"logps/chosen": -67.90390014648438,
"logps/ref_chosen": -52.26955032348633,
"logps/ref_rejected": -72.99522399902344,
"logps/rejected": -95.57124328613281,
"loss": 1.3022,
"margin_dpo/margin_mean": 6.941670894622803,
"margin_dpo/margin_std": 16.000974655151367,
"step": 372
},
{
"epoch": 0.563869992441421,
"grad_norm": 39.37256622314453,
"learning_rate": 2.394254027623792e-07,
"logits/chosen": 0.37247511744499207,
"logits/rejected": 0.3031313419342041,
"logps/chosen": -77.99808502197266,
"logps/ref_chosen": -61.112998962402344,
"logps/ref_rejected": -76.24851989746094,
"logps/rejected": -102.57091522216797,
"loss": 1.1096,
"margin_dpo/margin_mean": 9.437311172485352,
"margin_dpo/margin_std": 16.313791275024414,
"step": 373
},
{
"epoch": 0.5653817082388511,
"grad_norm": 34.07474136352539,
"learning_rate": 2.381045210440644e-07,
"logits/chosen": 0.27694612741470337,
"logits/rejected": 0.2643893361091614,
"logps/chosen": -87.940673828125,
"logps/ref_chosen": -72.66920471191406,
"logps/ref_rejected": -76.83158874511719,
"logps/rejected": -104.74732208251953,
"loss": 0.7677,
"margin_dpo/margin_mean": 12.644261360168457,
"margin_dpo/margin_std": 12.625534057617188,
"step": 374
},
{
"epoch": 0.5668934240362812,
"grad_norm": 36.280582427978516,
"learning_rate": 2.3678397206786715e-07,
"logits/chosen": 0.4151899218559265,
"logits/rejected": 0.35815227031707764,
"logps/chosen": -72.33064270019531,
"logps/ref_chosen": -57.68330383300781,
"logps/ref_rejected": -79.34097290039062,
"logps/rejected": -101.60260009765625,
"loss": 1.0982,
"margin_dpo/margin_mean": 7.614285469055176,
"margin_dpo/margin_std": 13.181623458862305,
"step": 375
},
{
"epoch": 0.5684051398337112,
"grad_norm": 28.2639217376709,
"learning_rate": 2.3546379277238103e-07,
"logits/chosen": 0.4273167848587036,
"logits/rejected": 0.35925012826919556,
"logps/chosen": -67.07015228271484,
"logps/ref_chosen": -51.674072265625,
"logps/ref_rejected": -75.69713592529297,
"logps/rejected": -104.54731750488281,
"loss": 0.8639,
"margin_dpo/margin_mean": 13.45411205291748,
"margin_dpo/margin_std": 16.179851531982422,
"step": 376
},
{
"epoch": 0.5699168556311414,
"grad_norm": 28.088367462158203,
"learning_rate": 2.3414402008585886e-07,
"logits/chosen": 0.3443489670753479,
"logits/rejected": 0.31910431385040283,
"logps/chosen": -62.237247467041016,
"logps/ref_chosen": -46.17853546142578,
"logps/ref_rejected": -57.756500244140625,
"logps/rejected": -83.15531921386719,
"loss": 0.9812,
"margin_dpo/margin_mean": 9.340099334716797,
"margin_dpo/margin_std": 12.798967361450195,
"step": 377
},
{
"epoch": 0.5714285714285714,
"grad_norm": 32.793365478515625,
"learning_rate": 2.3282469092517977e-07,
"logits/chosen": 0.42863214015960693,
"logits/rejected": 0.37801802158355713,
"logps/chosen": -75.05768585205078,
"logps/ref_chosen": -59.21887969970703,
"logps/ref_rejected": -71.2481918334961,
"logps/rejected": -95.27156066894531,
"loss": 1.0859,
"margin_dpo/margin_mean": 8.184557914733887,
"margin_dpo/margin_std": 13.956628799438477,
"step": 378
},
{
"epoch": 0.5729402872260015,
"grad_norm": 35.66096496582031,
"learning_rate": 2.3150584219481643e-07,
"logits/chosen": 0.3971962034702301,
"logits/rejected": 0.330384761095047,
"logps/chosen": -91.02781677246094,
"logps/ref_chosen": -76.31658935546875,
"logps/ref_rejected": -104.26200866699219,
"logps/rejected": -129.94598388671875,
"loss": 0.9102,
"margin_dpo/margin_mean": 10.972752571105957,
"margin_dpo/margin_std": 14.204010009765625,
"step": 379
},
{
"epoch": 0.5744520030234316,
"grad_norm": 28.557945251464844,
"learning_rate": 2.3018751078580283e-07,
"logits/chosen": 0.40566498041152954,
"logits/rejected": 0.3627912998199463,
"logps/chosen": -74.88333129882812,
"logps/ref_chosen": -61.283164978027344,
"logps/ref_rejected": -72.38892364501953,
"logps/rejected": -98.32823181152344,
"loss": 0.8653,
"margin_dpo/margin_mean": 12.339146614074707,
"margin_dpo/margin_std": 14.47273063659668,
"step": 380
},
{
"epoch": 0.5759637188208617,
"grad_norm": 36.389286041259766,
"learning_rate": 2.288697335747027e-07,
"logits/chosen": 0.34177953004837036,
"logits/rejected": 0.31463104486465454,
"logps/chosen": -75.02536010742188,
"logps/ref_chosen": -58.2139892578125,
"logps/ref_rejected": -60.78669357299805,
"logps/rejected": -82.91322326660156,
"loss": 1.2548,
"margin_dpo/margin_mean": 5.315165996551514,
"margin_dpo/margin_std": 12.719392776489258,
"step": 381
},
{
"epoch": 0.5774754346182918,
"grad_norm": 29.557825088500977,
"learning_rate": 2.2755254742257706e-07,
"logits/chosen": 0.39128854870796204,
"logits/rejected": 0.3420262336730957,
"logps/chosen": -79.03041076660156,
"logps/ref_chosen": -61.82532501220703,
"logps/ref_rejected": -83.0452880859375,
"logps/rejected": -108.56074523925781,
"loss": 0.9752,
"margin_dpo/margin_mean": 8.310381889343262,
"margin_dpo/margin_std": 11.493642807006836,
"step": 382
},
{
"epoch": 0.5789871504157218,
"grad_norm": 37.190284729003906,
"learning_rate": 2.2623598917395436e-07,
"logits/chosen": 0.2815375328063965,
"logits/rejected": 0.29732605814933777,
"logps/chosen": -96.2820053100586,
"logps/ref_chosen": -80.56326293945312,
"logps/ref_rejected": -74.62922668457031,
"logps/rejected": -100.91593933105469,
"loss": 0.9926,
"margin_dpo/margin_mean": 10.567968368530273,
"margin_dpo/margin_std": 15.280168533325195,
"step": 383
},
{
"epoch": 0.5804988662131519,
"grad_norm": 31.593162536621094,
"learning_rate": 2.2492009565579875e-07,
"logits/chosen": 0.37002235651016235,
"logits/rejected": 0.3238743543624878,
"logps/chosen": -82.05486297607422,
"logps/ref_chosen": -65.47514343261719,
"logps/ref_rejected": -79.67378234863281,
"logps/rejected": -106.28813171386719,
"loss": 0.8973,
"margin_dpo/margin_mean": 10.034626007080078,
"margin_dpo/margin_std": 12.21096420288086,
"step": 384
},
{
"epoch": 0.582010582010582,
"grad_norm": 31.312774658203125,
"learning_rate": 2.2360490367648084e-07,
"logits/chosen": 0.31117817759513855,
"logits/rejected": 0.2695065438747406,
"logps/chosen": -82.10106658935547,
"logps/ref_chosen": -66.0565185546875,
"logps/ref_rejected": -86.68023681640625,
"logps/rejected": -113.80641174316406,
"loss": 0.8523,
"margin_dpo/margin_mean": 11.08163070678711,
"margin_dpo/margin_std": 12.792245864868164,
"step": 385
},
{
"epoch": 0.5835222978080121,
"grad_norm": 31.37440299987793,
"learning_rate": 2.2229045002474724e-07,
"logits/chosen": 0.2849174737930298,
"logits/rejected": 0.2268781214952469,
"logps/chosen": -93.83451843261719,
"logps/ref_chosen": -75.6236572265625,
"logps/ref_rejected": -92.62330627441406,
"logps/rejected": -119.64865112304688,
"loss": 1.0064,
"margin_dpo/margin_mean": 8.81447982788086,
"margin_dpo/margin_std": 12.923885345458984,
"step": 386
},
{
"epoch": 0.5850340136054422,
"grad_norm": 25.713747024536133,
"learning_rate": 2.209767714686924e-07,
"logits/chosen": 0.412885844707489,
"logits/rejected": 0.31256186962127686,
"logps/chosen": -62.97633743286133,
"logps/ref_chosen": -47.22170639038086,
"logps/ref_rejected": -87.33814239501953,
"logps/rejected": -114.74981689453125,
"loss": 0.7702,
"margin_dpo/margin_mean": 11.657045364379883,
"margin_dpo/margin_std": 11.923395156860352,
"step": 387
},
{
"epoch": 0.5865457294028723,
"grad_norm": 32.77546310424805,
"learning_rate": 2.1966390475472954e-07,
"logits/chosen": 0.39520263671875,
"logits/rejected": 0.3797769546508789,
"logps/chosen": -90.493408203125,
"logps/ref_chosen": -74.5794677734375,
"logps/ref_rejected": -79.92558288574219,
"logps/rejected": -104.2813720703125,
"loss": 1.1101,
"margin_dpo/margin_mean": 8.441862106323242,
"margin_dpo/margin_std": 14.776924133300781,
"step": 388
},
{
"epoch": 0.5880574452003023,
"grad_norm": 45.588809967041016,
"learning_rate": 2.1835188660656265e-07,
"logits/chosen": 0.3767646551132202,
"logits/rejected": 0.3376935124397278,
"logps/chosen": -79.09071350097656,
"logps/ref_chosen": -61.624366760253906,
"logps/ref_rejected": -76.50978088378906,
"logps/rejected": -104.31814575195312,
"loss": 0.9563,
"margin_dpo/margin_mean": 10.34201431274414,
"margin_dpo/margin_std": 13.894803047180176,
"step": 389
},
{
"epoch": 0.5895691609977324,
"grad_norm": 26.247488021850586,
"learning_rate": 2.170407537241599e-07,
"logits/chosen": 0.4315715730190277,
"logits/rejected": 0.3645378351211548,
"logps/chosen": -61.070472717285156,
"logps/ref_chosen": -45.871864318847656,
"logps/ref_rejected": -61.305999755859375,
"logps/rejected": -86.49710083007812,
"loss": 0.8893,
"margin_dpo/margin_mean": 9.992494583129883,
"margin_dpo/margin_std": 12.295760154724121,
"step": 390
},
{
"epoch": 0.5910808767951625,
"grad_norm": 29.182357788085938,
"learning_rate": 2.1573054278272636e-07,
"logits/chosen": 0.3792092204093933,
"logits/rejected": 0.31141602993011475,
"logps/chosen": -73.98717498779297,
"logps/ref_chosen": -58.18701171875,
"logps/ref_rejected": -83.63443756103516,
"logps/rejected": -110.81365966796875,
"loss": 0.98,
"margin_dpo/margin_mean": 11.379058837890625,
"margin_dpo/margin_std": 15.968740463256836,
"step": 391
},
{
"epoch": 0.5925925925925926,
"grad_norm": 36.35900115966797,
"learning_rate": 2.1442129043167873e-07,
"logits/chosen": 0.48609426617622375,
"logits/rejected": 0.4238324761390686,
"logps/chosen": -83.52938842773438,
"logps/ref_chosen": -69.74452209472656,
"logps/ref_rejected": -94.05877685546875,
"logps/rejected": -120.82524871826172,
"loss": 0.9217,
"margin_dpo/margin_mean": 12.981613159179688,
"margin_dpo/margin_std": 17.47749137878418,
"step": 392
},
{
"epoch": 0.5941043083900227,
"grad_norm": 27.474313735961914,
"learning_rate": 2.131130332936195e-07,
"logits/chosen": 0.41157129406929016,
"logits/rejected": 0.36979395151138306,
"logps/chosen": -69.56451416015625,
"logps/ref_chosen": -52.33489990234375,
"logps/ref_rejected": -74.33810424804688,
"logps/rejected": -103.74046325683594,
"loss": 0.8095,
"margin_dpo/margin_mean": 12.172752380371094,
"margin_dpo/margin_std": 13.218109130859375,
"step": 393
},
{
"epoch": 0.5956160241874527,
"grad_norm": 29.2933349609375,
"learning_rate": 2.1180580796331323e-07,
"logits/chosen": 0.44927114248275757,
"logits/rejected": 0.41426438093185425,
"logps/chosen": -76.46290588378906,
"logps/ref_chosen": -60.6761360168457,
"logps/ref_rejected": -71.36075592041016,
"logps/rejected": -97.13763427734375,
"loss": 0.8872,
"margin_dpo/margin_mean": 9.990107536315918,
"margin_dpo/margin_std": 12.077360153198242,
"step": 394
},
{
"epoch": 0.5971277399848829,
"grad_norm": 31.02004051208496,
"learning_rate": 2.104996510066625e-07,
"logits/chosen": 0.4154667258262634,
"logits/rejected": 0.3240908682346344,
"logps/chosen": -66.60691833496094,
"logps/ref_chosen": -50.60432434082031,
"logps/ref_rejected": -77.08731079101562,
"logps/rejected": -102.46751403808594,
"loss": 0.9324,
"margin_dpo/margin_mean": 9.377607345581055,
"margin_dpo/margin_std": 12.179868698120117,
"step": 395
},
{
"epoch": 0.5986394557823129,
"grad_norm": 30.178119659423828,
"learning_rate": 2.0919459895968517e-07,
"logits/chosen": 0.38538146018981934,
"logits/rejected": 0.2900884747505188,
"logps/chosen": -66.71278381347656,
"logps/ref_chosen": -51.35961151123047,
"logps/ref_rejected": -79.89360046386719,
"logps/rejected": -106.4371337890625,
"loss": 0.8753,
"margin_dpo/margin_mean": 11.190366744995117,
"margin_dpo/margin_std": 12.479292869567871,
"step": 396
},
{
"epoch": 0.600151171579743,
"grad_norm": 46.14140701293945,
"learning_rate": 2.078906883274924e-07,
"logits/chosen": 0.3269771933555603,
"logits/rejected": 0.2809567153453827,
"logps/chosen": -84.61856842041016,
"logps/ref_chosen": -66.45622253417969,
"logps/ref_rejected": -85.74736022949219,
"logps/rejected": -109.89884948730469,
"loss": 1.3166,
"margin_dpo/margin_mean": 5.98914909362793,
"margin_dpo/margin_std": 14.911293029785156,
"step": 397
},
{
"epoch": 0.6016628873771731,
"grad_norm": 27.392459869384766,
"learning_rate": 2.065879555832674e-07,
"logits/chosen": 0.3728417158126831,
"logits/rejected": 0.3049323558807373,
"logps/chosen": -64.94625091552734,
"logps/ref_chosen": -49.244239807128906,
"logps/ref_rejected": -75.18949127197266,
"logps/rejected": -102.7054672241211,
"loss": 0.8319,
"margin_dpo/margin_mean": 11.813968658447266,
"margin_dpo/margin_std": 13.981929779052734,
"step": 398
},
{
"epoch": 0.6031746031746031,
"grad_norm": 30.352577209472656,
"learning_rate": 2.052864371672457e-07,
"logits/chosen": 0.3121348023414612,
"logits/rejected": 0.1789398044347763,
"logps/chosen": -85.18453979492188,
"logps/ref_chosen": -68.30679321289062,
"logps/ref_rejected": -113.2708511352539,
"logps/rejected": -143.73336791992188,
"loss": 0.7977,
"margin_dpo/margin_mean": 13.584760665893555,
"margin_dpo/margin_std": 16.35565185546875,
"step": 399
},
{
"epoch": 0.6046863189720333,
"grad_norm": 40.554786682128906,
"learning_rate": 2.0398616948569493e-07,
"logits/chosen": 0.397554486989975,
"logits/rejected": 0.3408077657222748,
"logps/chosen": -91.72099304199219,
"logps/ref_chosen": -71.62649536132812,
"logps/ref_rejected": -90.98765563964844,
"logps/rejected": -119.32801055908203,
"loss": 1.084,
"margin_dpo/margin_mean": 8.245853424072266,
"margin_dpo/margin_std": 14.028858184814453,
"step": 400
},
{
"epoch": 0.6046863189720333,
"eval_logits/chosen": 0.391160786151886,
"eval_logits/rejected": 0.3394322395324707,
"eval_logps/chosen": -91.2626724243164,
"eval_logps/ref_chosen": -74.85946655273438,
"eval_logps/ref_rejected": -79.54898834228516,
"eval_logps/rejected": -104.81573486328125,
"eval_loss": 0.5336794853210449,
"eval_margin_dpo/margin_mean": 8.86353588104248,
"eval_margin_dpo/margin_std": 14.35659122467041,
"eval_runtime": 38.6454,
"eval_samples_per_second": 59.593,
"eval_steps_per_second": 1.863,
"step": 400
},
{
"epoch": 0.6061980347694633,
"grad_norm": 26.281349182128906,
"learning_rate": 2.0268718890989752e-07,
"logits/chosen": 0.4084246754646301,
"logits/rejected": 0.31023353338241577,
"logps/chosen": -68.73127746582031,
"logps/ref_chosen": -53.72496032714844,
"logps/ref_rejected": -75.06304931640625,
"logps/rejected": -103.77824401855469,
"loss": 0.7719,
"margin_dpo/margin_mean": 13.708871841430664,
"margin_dpo/margin_std": 14.911130905151367,
"step": 401
},
{
"epoch": 0.6077097505668935,
"grad_norm": 32.728675842285156,
"learning_rate": 2.013895317751323e-07,
"logits/chosen": 0.40241509675979614,
"logits/rejected": 0.3629066050052643,
"logps/chosen": -77.94585418701172,
"logps/ref_chosen": -61.873931884765625,
"logps/ref_rejected": -66.1519775390625,
"logps/rejected": -92.5583724975586,
"loss": 0.9704,
"margin_dpo/margin_mean": 10.334466934204102,
"margin_dpo/margin_std": 14.114884376525879,
"step": 402
},
{
"epoch": 0.6092214663643235,
"grad_norm": 26.115026473999023,
"learning_rate": 2.0009323437965898e-07,
"logits/chosen": 0.5070240497589111,
"logits/rejected": 0.42596685886383057,
"logps/chosen": -68.05751037597656,
"logps/ref_chosen": -51.321502685546875,
"logps/ref_rejected": -86.54010772705078,
"logps/rejected": -117.83511352539062,
"loss": 0.7949,
"margin_dpo/margin_mean": 14.55899715423584,
"margin_dpo/margin_std": 16.20275115966797,
"step": 403
},
{
"epoch": 0.6107331821617535,
"grad_norm": 34.61177444458008,
"learning_rate": 1.9879833298370237e-07,
"logits/chosen": 0.3717535734176636,
"logits/rejected": 0.2808607518672943,
"logps/chosen": -77.73711395263672,
"logps/ref_chosen": -62.26288604736328,
"logps/ref_rejected": -95.19029998779297,
"logps/rejected": -122.16180419921875,
"loss": 0.925,
"margin_dpo/margin_mean": 11.497272491455078,
"margin_dpo/margin_std": 14.764816284179688,
"step": 404
},
{
"epoch": 0.6122448979591837,
"grad_norm": 26.180768966674805,
"learning_rate": 1.975048638084379e-07,
"logits/chosen": 0.44532907009124756,
"logits/rejected": 0.39605429768562317,
"logps/chosen": -66.79632568359375,
"logps/ref_chosen": -50.58434295654297,
"logps/ref_rejected": -65.43156433105469,
"logps/rejected": -92.28169250488281,
"loss": 0.9244,
"margin_dpo/margin_mean": 10.638147354125977,
"margin_dpo/margin_std": 12.895776748657227,
"step": 405
},
{
"epoch": 0.6137566137566137,
"grad_norm": 26.834482192993164,
"learning_rate": 1.9621286303497914e-07,
"logits/chosen": 0.4308337867259979,
"logits/rejected": 0.2796477973461151,
"logps/chosen": -64.9953842163086,
"logps/ref_chosen": -48.99560546875,
"logps/ref_rejected": -92.47773742675781,
"logps/rejected": -120.5987777709961,
"loss": 0.8967,
"margin_dpo/margin_mean": 12.121261596679688,
"margin_dpo/margin_std": 14.868024826049805,
"step": 406
},
{
"epoch": 0.6152683295540439,
"grad_norm": 41.65665817260742,
"learning_rate": 1.9492236680336483e-07,
"logits/chosen": 0.29304084181785583,
"logits/rejected": 0.21816039085388184,
"logps/chosen": -109.28121948242188,
"logps/ref_chosen": -89.40056610107422,
"logps/ref_rejected": -99.28775024414062,
"logps/rejected": -128.31704711914062,
"loss": 1.0474,
"margin_dpo/margin_mean": 9.148632049560547,
"margin_dpo/margin_std": 14.953045845031738,
"step": 407
},
{
"epoch": 0.6167800453514739,
"grad_norm": 24.90476417541504,
"learning_rate": 1.9363341121154895e-07,
"logits/chosen": 0.38886088132858276,
"logits/rejected": 0.3072519898414612,
"logps/chosen": -69.03826904296875,
"logps/ref_chosen": -54.70391845703125,
"logps/ref_rejected": -73.98648834228516,
"logps/rejected": -101.77029418945312,
"loss": 0.7863,
"margin_dpo/margin_mean": 13.449457168579102,
"margin_dpo/margin_std": 14.428291320800781,
"step": 408
},
{
"epoch": 0.618291761148904,
"grad_norm": 33.50522232055664,
"learning_rate": 1.9234603231438994e-07,
"logits/chosen": 0.3977552652359009,
"logits/rejected": 0.397859662771225,
"logps/chosen": -79.27891540527344,
"logps/ref_chosen": -62.11822509765625,
"logps/ref_rejected": -61.933509826660156,
"logps/rejected": -87.280517578125,
"loss": 1.1203,
"margin_dpo/margin_mean": 8.186317443847656,
"margin_dpo/margin_std": 14.839698791503906,
"step": 409
},
{
"epoch": 0.6198034769463341,
"grad_norm": 29.58933448791504,
"learning_rate": 1.9106026612264315e-07,
"logits/chosen": 0.4006178379058838,
"logits/rejected": 0.3718770742416382,
"logps/chosen": -77.56334686279297,
"logps/ref_chosen": -61.80265808105469,
"logps/ref_rejected": -76.60001373291016,
"logps/rejected": -102.92352294921875,
"loss": 0.809,
"margin_dpo/margin_mean": 10.562816619873047,
"margin_dpo/margin_std": 10.718137741088867,
"step": 410
},
{
"epoch": 0.6213151927437641,
"grad_norm": 37.61003875732422,
"learning_rate": 1.8977614860195296e-07,
"logits/chosen": 0.38806748390197754,
"logits/rejected": 0.3275112211704254,
"logps/chosen": -73.58920288085938,
"logps/ref_chosen": -54.445396423339844,
"logps/ref_rejected": -74.56507873535156,
"logps/rejected": -105.17032623291016,
"loss": 1.0566,
"margin_dpo/margin_mean": 11.461427688598633,
"margin_dpo/margin_std": 17.844844818115234,
"step": 411
},
{
"epoch": 0.6228269085411943,
"grad_norm": 30.01692008972168,
"learning_rate": 1.8849371567184662e-07,
"logits/chosen": 0.39307522773742676,
"logits/rejected": 0.32400524616241455,
"logps/chosen": -74.90364837646484,
"logps/ref_chosen": -55.248085021972656,
"logps/ref_rejected": -68.96623229980469,
"logps/rejected": -99.93446350097656,
"loss": 0.8748,
"margin_dpo/margin_mean": 11.312671661376953,
"margin_dpo/margin_std": 14.340400695800781,
"step": 412
},
{
"epoch": 0.6243386243386243,
"grad_norm": 37.489013671875,
"learning_rate": 1.872130032047302e-07,
"logits/chosen": 0.2387150228023529,
"logits/rejected": 0.20277327299118042,
"logps/chosen": -88.35789489746094,
"logps/ref_chosen": -68.72074890136719,
"logps/ref_rejected": -78.76539611816406,
"logps/rejected": -108.86207580566406,
"loss": 1.0298,
"margin_dpo/margin_mean": 10.459529876708984,
"margin_dpo/margin_std": 16.46000099182129,
"step": 413
},
{
"epoch": 0.6258503401360545,
"grad_norm": 30.536802291870117,
"learning_rate": 1.8593404702488436e-07,
"logits/chosen": 0.38499364256858826,
"logits/rejected": 0.32137370109558105,
"logps/chosen": -72.59126281738281,
"logps/ref_chosen": -54.13821792602539,
"logps/ref_rejected": -74.65741729736328,
"logps/rejected": -105.45807647705078,
"loss": 0.8818,
"margin_dpo/margin_mean": 12.347614288330078,
"margin_dpo/margin_std": 15.455865859985352,
"step": 414
},
{
"epoch": 0.6273620559334845,
"grad_norm": 33.240840911865234,
"learning_rate": 1.846568829074628e-07,
"logits/chosen": 0.39433568716049194,
"logits/rejected": 0.3705742359161377,
"logps/chosen": -73.94824981689453,
"logps/ref_chosen": -55.91856002807617,
"logps/ref_rejected": -61.747703552246094,
"logps/rejected": -90.88080596923828,
"loss": 1.0148,
"margin_dpo/margin_mean": 11.103410720825195,
"margin_dpo/margin_std": 16.630678176879883,
"step": 415
},
{
"epoch": 0.6288737717309146,
"grad_norm": 39.10771179199219,
"learning_rate": 1.8338154657749128e-07,
"logits/chosen": 0.36848437786102295,
"logits/rejected": 0.316191703081131,
"logps/chosen": -73.673095703125,
"logps/ref_chosen": -54.72308349609375,
"logps/ref_rejected": -69.17388916015625,
"logps/rejected": -97.229248046875,
"loss": 1.118,
"margin_dpo/margin_mean": 9.105339050292969,
"margin_dpo/margin_std": 14.868558883666992,
"step": 416
},
{
"epoch": 0.6303854875283447,
"grad_norm": 35.6463508605957,
"learning_rate": 1.8210807370886849e-07,
"logits/chosen": 0.4883832335472107,
"logits/rejected": 0.4186071753501892,
"logps/chosen": -77.90934753417969,
"logps/ref_chosen": -56.791259765625,
"logps/ref_rejected": -68.7791748046875,
"logps/rejected": -101.26014709472656,
"loss": 0.9539,
"margin_dpo/margin_mean": 11.362879753112793,
"margin_dpo/margin_std": 15.073711395263672,
"step": 417
},
{
"epoch": 0.6318972033257747,
"grad_norm": 48.182716369628906,
"learning_rate": 1.8083649992336825e-07,
"logits/chosen": 0.39419084787368774,
"logits/rejected": 0.39882832765579224,
"logps/chosen": -91.99015808105469,
"logps/ref_chosen": -69.10798645019531,
"logps/ref_rejected": -75.09132385253906,
"logps/rejected": -105.7783203125,
"loss": 1.2277,
"margin_dpo/margin_mean": 7.804815292358398,
"margin_dpo/margin_std": 16.62636375427246,
"step": 418
},
{
"epoch": 0.6334089191232048,
"grad_norm": 29.898523330688477,
"learning_rate": 1.7956686078964255e-07,
"logits/chosen": 0.2999052107334137,
"logits/rejected": 0.23983854055404663,
"logps/chosen": -74.1788558959961,
"logps/ref_chosen": -58.1717643737793,
"logps/ref_rejected": -71.67066955566406,
"logps/rejected": -100.13961791992188,
"loss": 0.8586,
"margin_dpo/margin_mean": 12.461854934692383,
"margin_dpo/margin_std": 14.932546615600586,
"step": 419
},
{
"epoch": 0.6349206349206349,
"grad_norm": 41.883358001708984,
"learning_rate": 1.782991918222275e-07,
"logits/chosen": 0.3532024621963501,
"logits/rejected": 0.30437958240509033,
"logps/chosen": -79.4993896484375,
"logps/ref_chosen": -57.05351257324219,
"logps/ref_rejected": -62.670982360839844,
"logps/rejected": -92.65522766113281,
"loss": 1.2803,
"margin_dpo/margin_mean": 7.538368225097656,
"margin_dpo/margin_std": 16.784019470214844,
"step": 420
},
{
"epoch": 0.636432350718065,
"grad_norm": 41.40155792236328,
"learning_rate": 1.7703352848054887e-07,
"logits/chosen": 0.339724600315094,
"logits/rejected": 0.27951130270957947,
"logps/chosen": -78.44889068603516,
"logps/ref_chosen": -57.32324981689453,
"logps/ref_rejected": -75.33782958984375,
"logps/rejected": -105.40597534179688,
"loss": 1.2636,
"margin_dpo/margin_mean": 8.942495346069336,
"margin_dpo/margin_std": 18.20132064819336,
"step": 421
},
{
"epoch": 0.6379440665154951,
"grad_norm": 32.80634689331055,
"learning_rate": 1.7576990616793137e-07,
"logits/chosen": 0.37151938676834106,
"logits/rejected": 0.34015408158302307,
"logps/chosen": -84.34286499023438,
"logps/ref_chosen": -67.05757904052734,
"logps/ref_rejected": -72.12803649902344,
"logps/rejected": -100.574951171875,
"loss": 0.8976,
"margin_dpo/margin_mean": 11.161640167236328,
"margin_dpo/margin_std": 14.631818771362305,
"step": 422
},
{
"epoch": 0.6394557823129252,
"grad_norm": 30.248695373535156,
"learning_rate": 1.745083602306071e-07,
"logits/chosen": 0.4215930104255676,
"logits/rejected": 0.35043540596961975,
"logps/chosen": -72.92449951171875,
"logps/ref_chosen": -54.061668395996094,
"logps/ref_rejected": -76.64092254638672,
"logps/rejected": -107.888671875,
"loss": 0.9148,
"margin_dpo/margin_mean": 12.38492202758789,
"margin_dpo/margin_std": 16.511720657348633,
"step": 423
},
{
"epoch": 0.6409674981103552,
"grad_norm": 32.00240707397461,
"learning_rate": 1.7324892595672804e-07,
"logits/chosen": 0.29672929644584656,
"logits/rejected": 0.2571881115436554,
"logps/chosen": -71.3875503540039,
"logps/ref_chosen": -53.60887145996094,
"logps/ref_rejected": -79.2139892578125,
"logps/rejected": -109.96659851074219,
"loss": 0.8388,
"margin_dpo/margin_mean": 12.973922729492188,
"margin_dpo/margin_std": 15.422065734863281,
"step": 424
},
{
"epoch": 0.6424792139077853,
"grad_norm": 29.581701278686523,
"learning_rate": 1.7199163857537824e-07,
"logits/chosen": 0.4177130460739136,
"logits/rejected": 0.38378778100013733,
"logps/chosen": -77.00273895263672,
"logps/ref_chosen": -58.41468048095703,
"logps/ref_rejected": -66.59054565429688,
"logps/rejected": -95.47838592529297,
"loss": 0.9742,
"margin_dpo/margin_mean": 10.299787521362305,
"margin_dpo/margin_std": 14.727434158325195,
"step": 425
},
{
"epoch": 0.6439909297052154,
"grad_norm": 44.239986419677734,
"learning_rate": 1.7073653325558828e-07,
"logits/chosen": 0.34781593084335327,
"logits/rejected": 0.3494877219200134,
"logps/chosen": -93.83209228515625,
"logps/ref_chosen": -71.70822143554688,
"logps/ref_rejected": -73.57725524902344,
"logps/rejected": -102.10546875,
"loss": 1.2957,
"margin_dpo/margin_mean": 6.4043288230896,
"margin_dpo/margin_std": 16.000539779663086,
"step": 426
},
{
"epoch": 0.6455026455026455,
"grad_norm": 30.25602149963379,
"learning_rate": 1.6948364510535218e-07,
"logits/chosen": 0.4051620662212372,
"logits/rejected": 0.3498576283454895,
"logps/chosen": -78.96176147460938,
"logps/ref_chosen": -58.64276885986328,
"logps/ref_rejected": -86.25437927246094,
"logps/rejected": -117.43075561523438,
"loss": 0.9869,
"margin_dpo/margin_mean": 10.857380867004395,
"margin_dpo/margin_std": 16.25821304321289,
"step": 427
},
{
"epoch": 0.6470143613000756,
"grad_norm": 35.63125991821289,
"learning_rate": 1.6823300917064458e-07,
"logits/chosen": 0.34857386350631714,
"logits/rejected": 0.29594945907592773,
"logps/chosen": -86.02629089355469,
"logps/ref_chosen": -66.5960464477539,
"logps/ref_rejected": -82.3941650390625,
"logps/rejected": -114.90159606933594,
"loss": 0.9136,
"margin_dpo/margin_mean": 13.07719612121582,
"margin_dpo/margin_std": 17.15648651123047,
"step": 428
},
{
"epoch": 0.6485260770975056,
"grad_norm": 32.7097282409668,
"learning_rate": 1.669846604344412e-07,
"logits/chosen": 0.3295377492904663,
"logits/rejected": 0.33729058504104614,
"logps/chosen": -77.25363159179688,
"logps/ref_chosen": -57.009700775146484,
"logps/ref_rejected": -59.86549377441406,
"logps/rejected": -90.89605712890625,
"loss": 0.9879,
"margin_dpo/margin_mean": 10.786637306213379,
"margin_dpo/margin_std": 15.325776100158691,
"step": 429
},
{
"epoch": 0.6500377928949358,
"grad_norm": 29.613990783691406,
"learning_rate": 1.6573863381573954e-07,
"logits/chosen": 0.25784656405448914,
"logits/rejected": 0.24630099534988403,
"logps/chosen": -78.46931457519531,
"logps/ref_chosen": -59.563194274902344,
"logps/ref_rejected": -70.52289581298828,
"logps/rejected": -101.87786865234375,
"loss": 0.8551,
"margin_dpo/margin_mean": 12.448851585388184,
"margin_dpo/margin_std": 15.234640121459961,
"step": 430
},
{
"epoch": 0.6515495086923658,
"grad_norm": 31.293846130371094,
"learning_rate": 1.6449496416858282e-07,
"logits/chosen": 0.3527390658855438,
"logits/rejected": 0.3057120442390442,
"logps/chosen": -67.58097839355469,
"logps/ref_chosen": -50.20032501220703,
"logps/ref_rejected": -77.81680297851562,
"logps/rejected": -105.64657592773438,
"loss": 0.9619,
"margin_dpo/margin_mean": 10.449111938476562,
"margin_dpo/margin_std": 15.20313549041748,
"step": 431
},
{
"epoch": 0.6530612244897959,
"grad_norm": 28.854806900024414,
"learning_rate": 1.632536862810844e-07,
"logits/chosen": 0.40595024824142456,
"logits/rejected": 0.35618919134140015,
"logps/chosen": -79.57620239257812,
"logps/ref_chosen": -61.662757873535156,
"logps/ref_rejected": -83.94496154785156,
"logps/rejected": -112.93707275390625,
"loss": 0.9282,
"margin_dpo/margin_mean": 11.078676223754883,
"margin_dpo/margin_std": 14.486236572265625,
"step": 432
},
{
"epoch": 0.654572940287226,
"grad_norm": 28.713123321533203,
"learning_rate": 1.6201483487445515e-07,
"logits/chosen": 0.468322217464447,
"logits/rejected": 0.45534592866897583,
"logps/chosen": -82.63919067382812,
"logps/ref_chosen": -63.72918701171875,
"logps/ref_rejected": -65.8391342163086,
"logps/rejected": -98.62335205078125,
"loss": 0.8754,
"margin_dpo/margin_mean": 13.874216079711914,
"margin_dpo/margin_std": 17.385021209716797,
"step": 433
},
{
"epoch": 0.656084656084656,
"grad_norm": 29.267993927001953,
"learning_rate": 1.6077844460203204e-07,
"logits/chosen": 0.4577040672302246,
"logits/rejected": 0.3909740447998047,
"logps/chosen": -64.70407104492188,
"logps/ref_chosen": -47.97331619262695,
"logps/ref_rejected": -72.51132202148438,
"logps/rejected": -101.47706604003906,
"loss": 0.988,
"margin_dpo/margin_mean": 12.234983444213867,
"margin_dpo/margin_std": 16.952194213867188,
"step": 434
},
{
"epoch": 0.6575963718820862,
"grad_norm": 32.02671813964844,
"learning_rate": 1.5954455004830878e-07,
"logits/chosen": 0.4733126759529114,
"logits/rejected": 0.43098342418670654,
"logps/chosen": -76.56661987304688,
"logps/ref_chosen": -57.06024932861328,
"logps/ref_rejected": -71.69146728515625,
"logps/rejected": -101.79170227050781,
"loss": 0.9515,
"margin_dpo/margin_mean": 10.593864440917969,
"margin_dpo/margin_std": 14.23061752319336,
"step": 435
},
{
"epoch": 0.6591080876795162,
"grad_norm": 30.705045700073242,
"learning_rate": 1.5831318572796847e-07,
"logits/chosen": 0.3775298297405243,
"logits/rejected": 0.31887999176979065,
"logps/chosen": -74.879150390625,
"logps/ref_chosen": -56.158050537109375,
"logps/ref_rejected": -67.63787841796875,
"logps/rejected": -97.3807373046875,
"loss": 0.9631,
"margin_dpo/margin_mean": 11.02175521850586,
"margin_dpo/margin_std": 14.938774108886719,
"step": 436
},
{
"epoch": 0.6606198034769464,
"grad_norm": 35.06089782714844,
"learning_rate": 1.5708438608491815e-07,
"logits/chosen": 0.36842063069343567,
"logits/rejected": 0.2398551106452942,
"logps/chosen": -77.53937530517578,
"logps/ref_chosen": -56.98578643798828,
"logps/ref_rejected": -85.61524963378906,
"logps/rejected": -116.64567565917969,
"loss": 1.1462,
"margin_dpo/margin_mean": 10.476823806762695,
"margin_dpo/margin_std": 18.057945251464844,
"step": 437
},
{
"epoch": 0.6621315192743764,
"grad_norm": 27.175416946411133,
"learning_rate": 1.558581854913253e-07,
"logits/chosen": 0.4072152376174927,
"logits/rejected": 0.33879750967025757,
"logps/chosen": -59.64152145385742,
"logps/ref_chosen": -41.27777862548828,
"logps/ref_rejected": -65.33840942382812,
"logps/rejected": -96.728515625,
"loss": 0.8997,
"margin_dpo/margin_mean": 13.026363372802734,
"margin_dpo/margin_std": 16.39688491821289,
"step": 438
},
{
"epoch": 0.6636432350718064,
"grad_norm": 34.17799758911133,
"learning_rate": 1.5463461824665658e-07,
"logits/chosen": 0.3313141465187073,
"logits/rejected": 0.29224908351898193,
"logps/chosen": -99.81672668457031,
"logps/ref_chosen": -81.41764831542969,
"logps/ref_rejected": -94.72309875488281,
"logps/rejected": -124.81297302246094,
"loss": 0.8734,
"margin_dpo/margin_mean": 11.690802574157715,
"margin_dpo/margin_std": 14.000129699707031,
"step": 439
},
{
"epoch": 0.6651549508692366,
"grad_norm": 35.477378845214844,
"learning_rate": 1.534137185767178e-07,
"logits/chosen": 0.3310989737510681,
"logits/rejected": 0.23319561779499054,
"logps/chosen": -59.91465759277344,
"logps/ref_chosen": -42.538185119628906,
"logps/ref_rejected": -69.78813934326172,
"logps/rejected": -99.70895385742188,
"loss": 0.89,
"margin_dpo/margin_mean": 12.544342041015625,
"margin_dpo/margin_std": 15.960411071777344,
"step": 440
},
{
"epoch": 0.6666666666666666,
"grad_norm": 25.895111083984375,
"learning_rate": 1.521955206326976e-07,
"logits/chosen": 0.32143065333366394,
"logits/rejected": 0.2299826741218567,
"logps/chosen": -73.61805725097656,
"logps/ref_chosen": -57.593223571777344,
"logps/ref_rejected": -84.82878875732422,
"logps/rejected": -114.32113647460938,
"loss": 0.7072,
"margin_dpo/margin_mean": 13.467504501342773,
"margin_dpo/margin_std": 12.755413055419922,
"step": 441
},
{
"epoch": 0.6681783824640968,
"grad_norm": 34.46872329711914,
"learning_rate": 1.5098005849021078e-07,
"logits/chosen": 0.38055524230003357,
"logits/rejected": 0.3312191963195801,
"logps/chosen": -88.37391662597656,
"logps/ref_chosen": -67.46121978759766,
"logps/ref_rejected": -89.0693588256836,
"logps/rejected": -121.82472229003906,
"loss": 0.9047,
"margin_dpo/margin_mean": 11.842670440673828,
"margin_dpo/margin_std": 15.179086685180664,
"step": 442
},
{
"epoch": 0.6696900982615268,
"grad_norm": 27.29175567626953,
"learning_rate": 1.4976736614834662e-07,
"logits/chosen": 0.3942459225654602,
"logits/rejected": 0.31970757246017456,
"logps/chosen": -71.87895202636719,
"logps/ref_chosen": -54.79609680175781,
"logps/ref_rejected": -77.80782318115234,
"logps/rejected": -110.1363525390625,
"loss": 0.8146,
"margin_dpo/margin_mean": 15.245677947998047,
"margin_dpo/margin_std": 17.69448471069336,
"step": 443
},
{
"epoch": 0.671201814058957,
"grad_norm": 40.294551849365234,
"learning_rate": 1.4855747752871654e-07,
"logits/chosen": 0.384095162153244,
"logits/rejected": 0.30130642652511597,
"logps/chosen": -80.62181091308594,
"logps/ref_chosen": -58.749061584472656,
"logps/ref_rejected": -86.87397003173828,
"logps/rejected": -114.7911376953125,
"loss": 1.3101,
"margin_dpo/margin_mean": 6.04442024230957,
"margin_dpo/margin_std": 14.843099594116211,
"step": 444
},
{
"epoch": 0.672713529856387,
"grad_norm": 34.12118148803711,
"learning_rate": 1.473504264745062e-07,
"logits/chosen": 0.3511345088481903,
"logits/rejected": 0.3304353356361389,
"logps/chosen": -83.09930419921875,
"logps/ref_chosen": -60.91743850708008,
"logps/ref_rejected": -71.56373596191406,
"logps/rejected": -105.06558990478516,
"loss": 0.9353,
"margin_dpo/margin_mean": 11.31997299194336,
"margin_dpo/margin_std": 14.875,
"step": 445
},
{
"epoch": 0.674225245653817,
"grad_norm": 23.161922454833984,
"learning_rate": 1.461462467495284e-07,
"logits/chosen": 0.4186308979988098,
"logits/rejected": 0.3374045491218567,
"logps/chosen": -66.56710052490234,
"logps/ref_chosen": -48.79924774169922,
"logps/ref_rejected": -71.87195587158203,
"logps/rejected": -105.11751556396484,
"loss": 0.6502,
"margin_dpo/margin_mean": 15.477707862854004,
"margin_dpo/margin_std": 13.376574516296387,
"step": 446
},
{
"epoch": 0.6757369614512472,
"grad_norm": 27.70553207397461,
"learning_rate": 1.4494497203727843e-07,
"logits/chosen": 0.3202866315841675,
"logits/rejected": 0.21970438957214355,
"logps/chosen": -70.60762023925781,
"logps/ref_chosen": -53.682716369628906,
"logps/ref_rejected": -88.17315673828125,
"logps/rejected": -120.407470703125,
"loss": 0.7725,
"margin_dpo/margin_mean": 15.309407234191895,
"margin_dpo/margin_std": 16.02827262878418,
"step": 447
},
{
"epoch": 0.6772486772486772,
"grad_norm": 26.229351043701172,
"learning_rate": 1.4374663593999256e-07,
"logits/chosen": 0.39596807956695557,
"logits/rejected": 0.34193187952041626,
"logps/chosen": -72.67399597167969,
"logps/ref_chosen": -53.75125503540039,
"logps/ref_rejected": -77.17623901367188,
"logps/rejected": -108.64974975585938,
"loss": 0.8676,
"margin_dpo/margin_mean": 12.550762176513672,
"margin_dpo/margin_std": 15.388051986694336,
"step": 448
},
{
"epoch": 0.6787603930461074,
"grad_norm": 36.61587142944336,
"learning_rate": 1.4255127197770707e-07,
"logits/chosen": 0.2539058327674866,
"logits/rejected": 0.24261929094791412,
"logps/chosen": -99.29594421386719,
"logps/ref_chosen": -75.82737731933594,
"logps/ref_rejected": -82.20687103271484,
"logps/rejected": -113.31431579589844,
"loss": 1.0785,
"margin_dpo/margin_mean": 7.638876914978027,
"margin_dpo/margin_std": 12.820967674255371,
"step": 449
},
{
"epoch": 0.6802721088435374,
"grad_norm": 33.98078536987305,
"learning_rate": 1.4135891358732205e-07,
"logits/chosen": 0.4526249170303345,
"logits/rejected": 0.34345030784606934,
"logps/chosen": -65.86204528808594,
"logps/ref_chosen": -47.11572265625,
"logps/ref_rejected": -78.7546615600586,
"logps/rejected": -105.94377136230469,
"loss": 1.0763,
"margin_dpo/margin_mean": 8.44278335571289,
"margin_dpo/margin_std": 14.431180953979492,
"step": 450
},
{
"epoch": 0.6817838246409675,
"grad_norm": 35.303871154785156,
"learning_rate": 1.4016959412166437e-07,
"logits/chosen": 0.3524070382118225,
"logits/rejected": 0.3001144528388977,
"logps/chosen": -82.66523742675781,
"logps/ref_chosen": -63.350440979003906,
"logps/ref_rejected": -76.28530883789062,
"logps/rejected": -104.85352325439453,
"loss": 1.0359,
"margin_dpo/margin_mean": 9.253421783447266,
"margin_dpo/margin_std": 14.614404678344727,
"step": 451
},
{
"epoch": 0.6832955404383976,
"grad_norm": 33.860225677490234,
"learning_rate": 1.3898334684855645e-07,
"logits/chosen": 0.34719571471214294,
"logits/rejected": 0.26927071809768677,
"logps/chosen": -74.74221801757812,
"logps/ref_chosen": -55.585838317871094,
"logps/ref_rejected": -77.68738555908203,
"logps/rejected": -107.97444152832031,
"loss": 0.9814,
"margin_dpo/margin_mean": 11.130671501159668,
"margin_dpo/margin_std": 15.483327865600586,
"step": 452
},
{
"epoch": 0.6848072562358276,
"grad_norm": 32.40834045410156,
"learning_rate": 1.3780020494988445e-07,
"logits/chosen": 0.34208589792251587,
"logits/rejected": 0.3065118193626404,
"logps/chosen": -79.8968505859375,
"logps/ref_chosen": -61.778202056884766,
"logps/ref_rejected": -71.51402282714844,
"logps/rejected": -100.97367858886719,
"loss": 0.9782,
"margin_dpo/margin_mean": 11.34100341796875,
"margin_dpo/margin_std": 16.038299560546875,
"step": 453
},
{
"epoch": 0.6863189720332578,
"grad_norm": 29.024457931518555,
"learning_rate": 1.366202015206706e-07,
"logits/chosen": 0.3748947083950043,
"logits/rejected": 0.3294043242931366,
"logps/chosen": -68.87191009521484,
"logps/ref_chosen": -51.59515380859375,
"logps/ref_rejected": -63.967323303222656,
"logps/rejected": -93.42787170410156,
"loss": 0.9702,
"margin_dpo/margin_mean": 12.183794021606445,
"margin_dpo/margin_std": 16.67722511291504,
"step": 454
},
{
"epoch": 0.6878306878306878,
"grad_norm": 31.36719512939453,
"learning_rate": 1.354433695681474e-07,
"logits/chosen": 0.2365991175174713,
"logits/rejected": 0.196326345205307,
"logps/chosen": -90.68863677978516,
"logps/ref_chosen": -70.65170288085938,
"logps/ref_rejected": -77.44276428222656,
"logps/rejected": -109.77125549316406,
"loss": 0.8471,
"margin_dpo/margin_mean": 12.291560173034668,
"margin_dpo/margin_std": 15.06167984008789,
"step": 455
},
{
"epoch": 0.6893424036281179,
"grad_norm": 31.750394821166992,
"learning_rate": 1.3426974201083439e-07,
"logits/chosen": 0.29783540964126587,
"logits/rejected": 0.23817452788352966,
"logps/chosen": -76.01242065429688,
"logps/ref_chosen": -56.398284912109375,
"logps/ref_rejected": -82.61642456054688,
"logps/rejected": -112.9671630859375,
"loss": 0.924,
"margin_dpo/margin_mean": 10.736598014831543,
"margin_dpo/margin_std": 14.164466857910156,
"step": 456
},
{
"epoch": 0.690854119425548,
"grad_norm": 29.68619728088379,
"learning_rate": 1.3309935167761717e-07,
"logits/chosen": 0.42319661378860474,
"logits/rejected": 0.35049009323120117,
"logps/chosen": -65.1966552734375,
"logps/ref_chosen": -44.72057342529297,
"logps/ref_rejected": -68.11585998535156,
"logps/rejected": -98.49119567871094,
"loss": 0.9288,
"margin_dpo/margin_mean": 9.899250984191895,
"margin_dpo/margin_std": 12.776674270629883,
"step": 457
},
{
"epoch": 0.6923658352229781,
"grad_norm": 32.18935775756836,
"learning_rate": 1.3193223130682936e-07,
"logits/chosen": 0.3539985418319702,
"logits/rejected": 0.24194201827049255,
"logps/chosen": -68.06039428710938,
"logps/ref_chosen": -50.00569152832031,
"logps/ref_rejected": -87.50015258789062,
"logps/rejected": -117.12113189697266,
"loss": 0.9265,
"margin_dpo/margin_mean": 11.566278457641602,
"margin_dpo/margin_std": 14.978096008300781,
"step": 458
},
{
"epoch": 0.6938775510204082,
"grad_norm": 29.448610305786133,
"learning_rate": 1.3076841354533658e-07,
"logits/chosen": 0.4100639820098877,
"logits/rejected": 0.3683810532093048,
"logps/chosen": -83.21324157714844,
"logps/ref_chosen": -65.37794494628906,
"logps/ref_rejected": -88.19244384765625,
"logps/rejected": -119.49298095703125,
"loss": 0.819,
"margin_dpo/margin_mean": 13.465246200561523,
"margin_dpo/margin_std": 14.571124076843262,
"step": 459
},
{
"epoch": 0.6953892668178382,
"grad_norm": 31.59938621520996,
"learning_rate": 1.2960793094762345e-07,
"logits/chosen": 0.4093438982963562,
"logits/rejected": 0.2837139070034027,
"logps/chosen": -83.06444549560547,
"logps/ref_chosen": -64.5616683959961,
"logps/ref_rejected": -88.67889404296875,
"logps/rejected": -122.12751770019531,
"loss": 0.7529,
"margin_dpo/margin_mean": 14.945836067199707,
"margin_dpo/margin_std": 16.06540298461914,
"step": 460
},
{
"epoch": 0.6969009826152683,
"grad_norm": 27.819725036621094,
"learning_rate": 1.2845081597488286e-07,
"logits/chosen": 0.49810439348220825,
"logits/rejected": 0.4051767587661743,
"logps/chosen": -65.88737487792969,
"logps/ref_chosen": -49.4779167175293,
"logps/ref_rejected": -72.65262603759766,
"logps/rejected": -101.95249938964844,
"loss": 0.8445,
"margin_dpo/margin_mean": 12.89040756225586,
"margin_dpo/margin_std": 14.886733055114746,
"step": 461
},
{
"epoch": 0.6984126984126984,
"grad_norm": 28.74095344543457,
"learning_rate": 1.27297100994108e-07,
"logits/chosen": 0.34951984882354736,
"logits/rejected": 0.2860267162322998,
"logps/chosen": -78.25369262695312,
"logps/ref_chosen": -60.4951171875,
"logps/ref_rejected": -74.82137298583984,
"logps/rejected": -107.18858337402344,
"loss": 0.7492,
"margin_dpo/margin_mean": 14.608634948730469,
"margin_dpo/margin_std": 15.107083320617676,
"step": 462
},
{
"epoch": 0.6999244142101285,
"grad_norm": 32.04073715209961,
"learning_rate": 1.2614681827718695e-07,
"logits/chosen": 0.37092894315719604,
"logits/rejected": 0.3595684766769409,
"logps/chosen": -87.26763916015625,
"logps/ref_chosen": -67.68511962890625,
"logps/ref_rejected": -71.32196044921875,
"logps/rejected": -100.9955062866211,
"loss": 0.9124,
"margin_dpo/margin_mean": 10.091035842895508,
"margin_dpo/margin_std": 12.723739624023438,
"step": 463
},
{
"epoch": 0.7014361300075586,
"grad_norm": 35.85340881347656,
"learning_rate": 1.2500000000000005e-07,
"logits/chosen": 0.34320521354675293,
"logits/rejected": 0.31048744916915894,
"logps/chosen": -79.22843933105469,
"logps/ref_chosen": -59.16564178466797,
"logps/ref_rejected": -69.56146240234375,
"logps/rejected": -101.13890838623047,
"loss": 0.9691,
"margin_dpo/margin_mean": 11.5146484375,
"margin_dpo/margin_std": 15.515382766723633,
"step": 464
},
{
"epoch": 0.7029478458049887,
"grad_norm": 34.352787017822266,
"learning_rate": 1.238566782415197e-07,
"logits/chosen": 0.44564637541770935,
"logits/rejected": 0.3805859386920929,
"logps/chosen": -79.09562683105469,
"logps/ref_chosen": -58.513671875,
"logps/ref_rejected": -84.31745910644531,
"logps/rejected": -115.3167724609375,
"loss": 1.0309,
"margin_dpo/margin_mean": 10.417366981506348,
"margin_dpo/margin_std": 15.731843948364258,
"step": 465
},
{
"epoch": 0.7044595616024187,
"grad_norm": 42.33492660522461,
"learning_rate": 1.2271688498291334e-07,
"logits/chosen": 0.37938642501831055,
"logits/rejected": 0.37962764501571655,
"logps/chosen": -95.0290756225586,
"logps/ref_chosen": -73.26580810546875,
"logps/ref_rejected": -74.83621215820312,
"logps/rejected": -103.54035949707031,
"loss": 1.1424,
"margin_dpo/margin_mean": 6.940883636474609,
"margin_dpo/margin_std": 13.032196998596191,
"step": 466
},
{
"epoch": 0.7059712773998488,
"grad_norm": 27.393531799316406,
"learning_rate": 1.2158065210664848e-07,
"logits/chosen": 0.41466957330703735,
"logits/rejected": 0.2669060230255127,
"logps/chosen": -67.52658081054688,
"logps/ref_chosen": -47.57947540283203,
"logps/ref_rejected": -78.68522644042969,
"logps/rejected": -110.11405181884766,
"loss": 0.8437,
"margin_dpo/margin_mean": 11.481725692749023,
"margin_dpo/margin_std": 13.719661712646484,
"step": 467
},
{
"epoch": 0.7074829931972789,
"grad_norm": 30.220720291137695,
"learning_rate": 1.204480113956011e-07,
"logits/chosen": 0.3812910318374634,
"logits/rejected": 0.3669815957546234,
"logps/chosen": -80.45675659179688,
"logps/ref_chosen": -63.92778778076172,
"logps/ref_rejected": -76.51626586914062,
"logps/rejected": -109.25390625,
"loss": 0.6942,
"margin_dpo/margin_mean": 16.208662033081055,
"margin_dpo/margin_std": 16.242408752441406,
"step": 468
},
{
"epoch": 0.708994708994709,
"grad_norm": 28.700984954833984,
"learning_rate": 1.1931899453216697e-07,
"logits/chosen": 0.45047247409820557,
"logits/rejected": 0.43260282278060913,
"logps/chosen": -76.3576889038086,
"logps/ref_chosen": -59.05818176269531,
"logps/ref_rejected": -75.67672729492188,
"logps/rejected": -105.44158172607422,
"loss": 0.8573,
"margin_dpo/margin_mean": 12.465351104736328,
"margin_dpo/margin_std": 14.824824333190918,
"step": 469
},
{
"epoch": 0.7105064247921391,
"grad_norm": 27.37070083618164,
"learning_rate": 1.1819363309737438e-07,
"logits/chosen": 0.38742589950561523,
"logits/rejected": 0.31535792350769043,
"logps/chosen": -67.01020812988281,
"logps/ref_chosen": -47.86743927001953,
"logps/ref_rejected": -65.96858978271484,
"logps/rejected": -96.77906036376953,
"loss": 0.9114,
"margin_dpo/margin_mean": 11.667705535888672,
"margin_dpo/margin_std": 14.78006362915039,
"step": 470
},
{
"epoch": 0.7120181405895691,
"grad_norm": 31.069612503051758,
"learning_rate": 1.1707195857000215e-07,
"logits/chosen": 0.38362210988998413,
"logits/rejected": 0.31821680068969727,
"logps/chosen": -74.50727081298828,
"logps/ref_chosen": -57.77785110473633,
"logps/ref_rejected": -73.81172180175781,
"logps/rejected": -104.50235748291016,
"loss": 0.8414,
"margin_dpo/margin_mean": 13.961222648620605,
"margin_dpo/margin_std": 16.028059005737305,
"step": 471
},
{
"epoch": 0.7135298563869993,
"grad_norm": 32.531490325927734,
"learning_rate": 1.1595400232569768e-07,
"logits/chosen": 0.40967637300491333,
"logits/rejected": 0.3588043749332428,
"logps/chosen": -72.53768157958984,
"logps/ref_chosen": -55.908668518066406,
"logps/ref_rejected": -74.70294189453125,
"logps/rejected": -103.69931030273438,
"loss": 0.9363,
"margin_dpo/margin_mean": 12.367351531982422,
"margin_dpo/margin_std": 16.488346099853516,
"step": 472
},
{
"epoch": 0.7150415721844293,
"grad_norm": 32.29591369628906,
"learning_rate": 1.1483979563610069e-07,
"logits/chosen": 0.46595680713653564,
"logits/rejected": 0.356813907623291,
"logps/chosen": -70.86893463134766,
"logps/ref_chosen": -54.16088104248047,
"logps/ref_rejected": -92.76789855957031,
"logps/rejected": -121.89593505859375,
"loss": 1.0278,
"margin_dpo/margin_mean": 12.419994354248047,
"margin_dpo/margin_std": 18.459060668945312,
"step": 473
},
{
"epoch": 0.7165532879818595,
"grad_norm": 38.99330520629883,
"learning_rate": 1.1372936966796709e-07,
"logits/chosen": 0.4462272524833679,
"logits/rejected": 0.37020203471183777,
"logps/chosen": -66.63814544677734,
"logps/ref_chosen": -46.685707092285156,
"logps/ref_rejected": -71.44731140136719,
"logps/rejected": -101.14683532714844,
"loss": 1.1052,
"margin_dpo/margin_mean": 9.747077941894531,
"margin_dpo/margin_std": 16.308094024658203,
"step": 474
},
{
"epoch": 0.7180650037792895,
"grad_norm": 27.097074508666992,
"learning_rate": 1.126227554822985e-07,
"logits/chosen": 0.36055099964141846,
"logits/rejected": 0.30357635021209717,
"logps/chosen": -77.57850646972656,
"logps/ref_chosen": -58.4873046875,
"logps/ref_rejected": -87.00187683105469,
"logps/rejected": -122.87249755859375,
"loss": 0.6529,
"margin_dpo/margin_mean": 16.779415130615234,
"margin_dpo/margin_std": 15.916208267211914,
"step": 475
},
{
"epoch": 0.7195767195767195,
"grad_norm": 43.0082893371582,
"learning_rate": 1.1151998403347243e-07,
"logits/chosen": 0.3008124828338623,
"logits/rejected": 0.288544237613678,
"logps/chosen": -97.41394805908203,
"logps/ref_chosen": -75.38162231445312,
"logps/ref_rejected": -76.99822235107422,
"logps/rejected": -109.19664764404297,
"loss": 1.0678,
"margin_dpo/margin_mean": 10.166099548339844,
"margin_dpo/margin_std": 16.556289672851562,
"step": 476
},
{
"epoch": 0.7210884353741497,
"grad_norm": 40.80556869506836,
"learning_rate": 1.1042108616837692e-07,
"logits/chosen": 0.41148385405540466,
"logits/rejected": 0.36269134283065796,
"logps/chosen": -82.53071594238281,
"logps/ref_chosen": -61.073387145996094,
"logps/ref_rejected": -81.34375,
"logps/rejected": -114.1594009399414,
"loss": 1.0709,
"margin_dpo/margin_mean": 11.35832691192627,
"margin_dpo/margin_std": 17.43365478515625,
"step": 477
},
{
"epoch": 0.7226001511715797,
"grad_norm": 40.633033752441406,
"learning_rate": 1.0932609262554746e-07,
"logits/chosen": 0.3245035409927368,
"logits/rejected": 0.3233986496925354,
"logps/chosen": -75.4658432006836,
"logps/ref_chosen": -57.16731643676758,
"logps/ref_rejected": -53.309181213378906,
"logps/rejected": -81.65347290039062,
"loss": 1.1221,
"margin_dpo/margin_mean": 10.045768737792969,
"margin_dpo/margin_std": 17.326122283935547,
"step": 478
},
{
"epoch": 0.7241118669690099,
"grad_norm": 30.868711471557617,
"learning_rate": 1.0823503403430734e-07,
"logits/chosen": 0.30822062492370605,
"logits/rejected": 0.25296294689178467,
"logps/chosen": -79.10305786132812,
"logps/ref_chosen": -58.91331481933594,
"logps/ref_rejected": -63.7403450012207,
"logps/rejected": -93.48685455322266,
"loss": 1.0759,
"margin_dpo/margin_mean": 9.556758880615234,
"margin_dpo/margin_std": 15.833057403564453,
"step": 479
},
{
"epoch": 0.7256235827664399,
"grad_norm": 36.491539001464844,
"learning_rate": 1.0714794091391072e-07,
"logits/chosen": 0.31532788276672363,
"logits/rejected": 0.29993510246276855,
"logps/chosen": -81.86836242675781,
"logps/ref_chosen": -62.80060577392578,
"logps/ref_rejected": -67.58859252929688,
"logps/rejected": -97.8862075805664,
"loss": 1.0544,
"margin_dpo/margin_mean": 11.229857444763184,
"margin_dpo/margin_std": 16.638090133666992,
"step": 480
},
{
"epoch": 0.72713529856387,
"grad_norm": 30.423425674438477,
"learning_rate": 1.0606484367268906e-07,
"logits/chosen": 0.30370086431503296,
"logits/rejected": 0.29490453004837036,
"logps/chosen": -83.44210815429688,
"logps/ref_chosen": -65.28649139404297,
"logps/ref_rejected": -70.78668212890625,
"logps/rejected": -101.75835418701172,
"loss": 0.9433,
"margin_dpo/margin_mean": 12.816054344177246,
"margin_dpo/margin_std": 17.707691192626953,
"step": 481
},
{
"epoch": 0.7286470143613001,
"grad_norm": 43.65763473510742,
"learning_rate": 1.0498577260720048e-07,
"logits/chosen": 0.3178885281085968,
"logits/rejected": 0.1772965043783188,
"logps/chosen": -81.17154693603516,
"logps/ref_chosen": -60.906185150146484,
"logps/ref_rejected": -103.44656372070312,
"logps/rejected": -134.303955078125,
"loss": 1.113,
"margin_dpo/margin_mean": 10.592021942138672,
"margin_dpo/margin_std": 18.282638549804688,
"step": 482
},
{
"epoch": 0.7301587301587301,
"grad_norm": 32.36988067626953,
"learning_rate": 1.0391075790138232e-07,
"logits/chosen": 0.44268304109573364,
"logits/rejected": 0.33734846115112305,
"logps/chosen": -71.24640655517578,
"logps/ref_chosen": -53.192012786865234,
"logps/ref_rejected": -81.83927154541016,
"logps/rejected": -112.96994018554688,
"loss": 0.905,
"margin_dpo/margin_mean": 13.076276779174805,
"margin_dpo/margin_std": 16.73548126220703,
"step": 483
},
{
"epoch": 0.7316704459561603,
"grad_norm": 34.262271881103516,
"learning_rate": 1.0283982962570681e-07,
"logits/chosen": 0.4280846118927002,
"logits/rejected": 0.3853263854980469,
"logps/chosen": -76.50611114501953,
"logps/ref_chosen": -57.76945877075195,
"logps/ref_rejected": -71.6829833984375,
"logps/rejected": -100.66574096679688,
"loss": 0.8924,
"margin_dpo/margin_mean": 10.246103286743164,
"margin_dpo/margin_std": 12.448812484741211,
"step": 484
},
{
"epoch": 0.7331821617535903,
"grad_norm": 30.331209182739258,
"learning_rate": 1.0177301773633992e-07,
"logits/chosen": 0.40210530161857605,
"logits/rejected": 0.3721107542514801,
"logps/chosen": -73.7468032836914,
"logps/ref_chosen": -56.63584899902344,
"logps/ref_rejected": -70.85614013671875,
"logps/rejected": -99.84536743164062,
"loss": 0.921,
"margin_dpo/margin_mean": 11.878273010253906,
"margin_dpo/margin_std": 15.501483917236328,
"step": 485
},
{
"epoch": 0.7346938775510204,
"grad_norm": 33.630470275878906,
"learning_rate": 1.007103520743035e-07,
"logits/chosen": 0.41097620129585266,
"logits/rejected": 0.2941015958786011,
"logps/chosen": -78.46426391601562,
"logps/ref_chosen": -56.347023010253906,
"logps/ref_rejected": -85.97221374511719,
"logps/rejected": -119.07996368408203,
"loss": 1.0505,
"margin_dpo/margin_mean": 10.990509986877441,
"margin_dpo/margin_std": 18.080223083496094,
"step": 486
},
{
"epoch": 0.7362055933484505,
"grad_norm": 33.40716552734375,
"learning_rate": 9.965186236464046e-08,
"logits/chosen": 0.45882779359817505,
"logits/rejected": 0.39584046602249146,
"logps/chosen": -80.12818908691406,
"logps/ref_chosen": -60.617218017578125,
"logps/ref_rejected": -82.5097427368164,
"logps/rejected": -113.12313079833984,
"loss": 0.9295,
"margin_dpo/margin_mean": 11.102409362792969,
"margin_dpo/margin_std": 15.39416217803955,
"step": 487
},
{
"epoch": 0.7377173091458806,
"grad_norm": 32.00619888305664,
"learning_rate": 9.859757821558337e-08,
"logits/chosen": 0.3710404634475708,
"logits/rejected": 0.2971905469894409,
"logps/chosen": -81.0091781616211,
"logps/ref_chosen": -63.10905456542969,
"logps/ref_rejected": -82.49348449707031,
"logps/rejected": -114.04411315917969,
"loss": 0.796,
"margin_dpo/margin_mean": 13.650504112243652,
"margin_dpo/margin_std": 15.383604049682617,
"step": 488
},
{
"epoch": 0.7392290249433107,
"grad_norm": 35.907958984375,
"learning_rate": 9.754752911772615e-08,
"logits/chosen": 0.3739784359931946,
"logits/rejected": 0.327509343624115,
"logps/chosen": -84.74853515625,
"logps/ref_chosen": -64.98896026611328,
"logps/ref_rejected": -84.39607238769531,
"logps/rejected": -111.33993530273438,
"loss": 1.1867,
"margin_dpo/margin_mean": 7.184290885925293,
"margin_dpo/margin_std": 14.307548522949219,
"step": 489
},
{
"epoch": 0.7407407407407407,
"grad_norm": 42.15243148803711,
"learning_rate": 9.650174444319956e-08,
"logits/chosen": 0.4511192739009857,
"logits/rejected": 0.4217193126678467,
"logps/chosen": -80.63301086425781,
"logps/ref_chosen": -61.90874481201172,
"logps/ref_rejected": -70.58566284179688,
"logps/rejected": -99.12336730957031,
"loss": 1.219,
"margin_dpo/margin_mean": 9.813451766967773,
"margin_dpo/margin_std": 18.48382568359375,
"step": 490
},
{
"epoch": 0.7422524565381708,
"grad_norm": 30.005891799926758,
"learning_rate": 9.546025344484868e-08,
"logits/chosen": 0.2991289794445038,
"logits/rejected": 0.24064157903194427,
"logps/chosen": -73.79498291015625,
"logps/ref_chosen": -55.47570037841797,
"logps/ref_rejected": -78.70318603515625,
"logps/rejected": -107.55183410644531,
"loss": 0.93,
"margin_dpo/margin_mean": 10.529365539550781,
"margin_dpo/margin_std": 13.245063781738281,
"step": 491
},
{
"epoch": 0.7437641723356009,
"grad_norm": 40.63232421875,
"learning_rate": 9.442308525541589e-08,
"logits/chosen": 0.32890427112579346,
"logits/rejected": 0.25535351037979126,
"logps/chosen": -90.1016616821289,
"logps/ref_chosen": -67.28638458251953,
"logps/ref_rejected": -82.78628540039062,
"logps/rejected": -115.23426055908203,
"loss": 1.1686,
"margin_dpo/margin_mean": 9.632694244384766,
"margin_dpo/margin_std": 16.95614242553711,
"step": 492
},
{
"epoch": 0.745275888133031,
"grad_norm": 27.453155517578125,
"learning_rate": 9.339026888672468e-08,
"logits/chosen": 0.35031116008758545,
"logits/rejected": 0.266485333442688,
"logps/chosen": -74.11001586914062,
"logps/ref_chosen": -55.92750549316406,
"logps/ref_rejected": -79.12149810791016,
"logps/rejected": -110.38975524902344,
"loss": 0.8313,
"margin_dpo/margin_mean": 13.085746765136719,
"margin_dpo/margin_std": 15.100048065185547,
"step": 493
},
{
"epoch": 0.7467876039304611,
"grad_norm": 41.737037658691406,
"learning_rate": 9.236183322886945e-08,
"logits/chosen": 0.2632497251033783,
"logits/rejected": 0.20675988495349884,
"logps/chosen": -86.74630737304688,
"logps/ref_chosen": -67.95411682128906,
"logps/ref_rejected": -90.50865936279297,
"logps/rejected": -118.86773681640625,
"loss": 1.1456,
"margin_dpo/margin_mean": 9.566875457763672,
"margin_dpo/margin_std": 17.215763092041016,
"step": 494
},
{
"epoch": 0.7482993197278912,
"grad_norm": 33.81610870361328,
"learning_rate": 9.133780704940594e-08,
"logits/chosen": 0.4413723349571228,
"logits/rejected": 0.37061169743537903,
"logps/chosen": -70.81031036376953,
"logps/ref_chosen": -52.625465393066406,
"logps/ref_rejected": -72.06781005859375,
"logps/rejected": -98.15983581542969,
"loss": 1.1075,
"margin_dpo/margin_mean": 7.907181262969971,
"margin_dpo/margin_std": 14.21833324432373,
"step": 495
},
{
"epoch": 0.7498110355253212,
"grad_norm": 34.30609130859375,
"learning_rate": 9.031821899254797e-08,
"logits/chosen": 0.39603427052497864,
"logits/rejected": 0.28451234102249146,
"logps/chosen": -77.51828002929688,
"logps/ref_chosen": -57.597328186035156,
"logps/ref_rejected": -94.36127471923828,
"logps/rejected": -124.71854400634766,
"loss": 1.0383,
"margin_dpo/margin_mean": 10.436321258544922,
"margin_dpo/margin_std": 17.503387451171875,
"step": 496
},
{
"epoch": 0.7513227513227513,
"grad_norm": 33.988346099853516,
"learning_rate": 8.930309757836516e-08,
"logits/chosen": 0.41119682788848877,
"logits/rejected": 0.37434881925582886,
"logps/chosen": -92.74378967285156,
"logps/ref_chosen": -72.78994750976562,
"logps/ref_rejected": -89.48483276367188,
"logps/rejected": -123.64321899414062,
"loss": 0.844,
"margin_dpo/margin_mean": 14.204545974731445,
"margin_dpo/margin_std": 17.2962703704834,
"step": 497
},
{
"epoch": 0.7528344671201814,
"grad_norm": 36.42383575439453,
"learning_rate": 8.829247120198563e-08,
"logits/chosen": 0.3917444944381714,
"logits/rejected": 0.35288551449775696,
"logps/chosen": -87.1715316772461,
"logps/ref_chosen": -68.36572265625,
"logps/ref_rejected": -71.28846740722656,
"logps/rejected": -102.5860824584961,
"loss": 0.8655,
"margin_dpo/margin_mean": 12.491800308227539,
"margin_dpo/margin_std": 15.737793922424316,
"step": 498
},
{
"epoch": 0.7543461829176115,
"grad_norm": 34.57160568237305,
"learning_rate": 8.728636813280163e-08,
"logits/chosen": 0.36719441413879395,
"logits/rejected": 0.29699230194091797,
"logps/chosen": -79.09321594238281,
"logps/ref_chosen": -61.90882873535156,
"logps/ref_rejected": -91.9411392211914,
"logps/rejected": -120.89878845214844,
"loss": 1.0196,
"margin_dpo/margin_mean": 11.773270606994629,
"margin_dpo/margin_std": 17.18640899658203,
"step": 499
},
{
"epoch": 0.7558578987150416,
"grad_norm": 35.90029525756836,
"learning_rate": 8.628481651367875e-08,
"logits/chosen": 0.30776458978652954,
"logits/rejected": 0.2964794933795929,
"logps/chosen": -89.29718017578125,
"logps/ref_chosen": -70.225830078125,
"logps/ref_rejected": -71.72203063964844,
"logps/rejected": -101.60995483398438,
"loss": 1.0037,
"margin_dpo/margin_mean": 10.816570281982422,
"margin_dpo/margin_std": 15.775973320007324,
"step": 500
},
{
"epoch": 0.7558578987150416,
"eval_logits/chosen": 0.3937297463417053,
"eval_logits/rejected": 0.3419411778450012,
"eval_logps/chosen": -92.17249298095703,
"eval_logps/ref_chosen": -74.85946655273438,
"eval_logps/ref_rejected": -79.54898834228516,
"eval_logps/rejected": -106.36981964111328,
"eval_loss": 0.527696967124939,
"eval_margin_dpo/margin_mean": 9.507804870605469,
"eval_margin_dpo/margin_std": 15.06718635559082,
"eval_runtime": 38.6653,
"eval_samples_per_second": 59.562,
"eval_steps_per_second": 1.862,
"step": 500
},
{
"epoch": 0.7573696145124716,
"grad_norm": 26.93088722229004,
"learning_rate": 8.528784436016878e-08,
"logits/chosen": 0.359049528837204,
"logits/rejected": 0.34849053621292114,
"logps/chosen": -83.36741638183594,
"logps/ref_chosen": -64.59880828857422,
"logps/ref_rejected": -70.59329223632812,
"logps/rejected": -99.85511779785156,
"loss": 0.8434,
"margin_dpo/margin_mean": 10.493215560913086,
"margin_dpo/margin_std": 11.587947845458984,
"step": 501
},
{
"epoch": 0.7588813303099018,
"grad_norm": 35.08143615722656,
"learning_rate": 8.4295479559726e-08,
"logits/chosen": 0.3733653128147125,
"logits/rejected": 0.3248666822910309,
"logps/chosen": -83.78016662597656,
"logps/ref_chosen": -65.46662902832031,
"logps/ref_rejected": -90.22233581542969,
"logps/rejected": -118.43475341796875,
"loss": 0.9262,
"margin_dpo/margin_mean": 9.898885726928711,
"margin_dpo/margin_std": 13.525036811828613,
"step": 502
},
{
"epoch": 0.7603930461073318,
"grad_norm": 33.57307434082031,
"learning_rate": 8.330774987092712e-08,
"logits/chosen": 0.37700212001800537,
"logits/rejected": 0.36113613843917847,
"logps/chosen": -68.490966796875,
"logps/ref_chosen": -51.83476257324219,
"logps/ref_rejected": -57.62522506713867,
"logps/rejected": -84.85618591308594,
"loss": 1.0233,
"margin_dpo/margin_mean": 10.574769020080566,
"margin_dpo/margin_std": 15.259387969970703,
"step": 503
},
{
"epoch": 0.7619047619047619,
"grad_norm": 32.33924865722656,
"learning_rate": 8.232468292269479e-08,
"logits/chosen": 0.33015936613082886,
"logits/rejected": 0.3007563352584839,
"logps/chosen": -85.83465576171875,
"logps/ref_chosen": -68.65119934082031,
"logps/ref_rejected": -77.91394805908203,
"logps/rejected": -109.01856994628906,
"loss": 0.71,
"margin_dpo/margin_mean": 13.92115592956543,
"margin_dpo/margin_std": 13.897747039794922,
"step": 504
},
{
"epoch": 0.763416477702192,
"grad_norm": 38.17281723022461,
"learning_rate": 8.134630621352483e-08,
"logits/chosen": 0.3694714903831482,
"logits/rejected": 0.3267279267311096,
"logps/chosen": -78.07418823242188,
"logps/ref_chosen": -59.99884796142578,
"logps/ref_rejected": -76.88047790527344,
"logps/rejected": -104.90641784667969,
"loss": 1.0749,
"margin_dpo/margin_mean": 9.950593948364258,
"margin_dpo/margin_std": 16.37490463256836,
"step": 505
},
{
"epoch": 0.764928193499622,
"grad_norm": 37.14749526977539,
"learning_rate": 8.037264711071698e-08,
"logits/chosen": 0.37326282262802124,
"logits/rejected": 0.34700077772140503,
"logps/chosen": -86.4708251953125,
"logps/ref_chosen": -70.07130432128906,
"logps/ref_rejected": -82.03775024414062,
"logps/rejected": -108.28327941894531,
"loss": 1.0496,
"margin_dpo/margin_mean": 9.846002578735352,
"margin_dpo/margin_std": 15.988225936889648,
"step": 506
},
{
"epoch": 0.7664399092970522,
"grad_norm": 32.942909240722656,
"learning_rate": 7.940373284960933e-08,
"logits/chosen": 0.3433038592338562,
"logits/rejected": 0.2900928556919098,
"logps/chosen": -90.55781555175781,
"logps/ref_chosen": -72.00703430175781,
"logps/ref_rejected": -93.94987487792969,
"logps/rejected": -123.90901947021484,
"loss": 0.9739,
"margin_dpo/margin_mean": 11.408361434936523,
"margin_dpo/margin_std": 16.2240047454834,
"step": 507
},
{
"epoch": 0.7679516250944822,
"grad_norm": 29.779918670654297,
"learning_rate": 7.843959053281663e-08,
"logits/chosen": 0.32127851247787476,
"logits/rejected": 0.19979628920555115,
"logps/chosen": -77.34425354003906,
"logps/ref_chosen": -60.21992492675781,
"logps/ref_rejected": -95.9200668334961,
"logps/rejected": -125.66604614257812,
"loss": 0.8772,
"margin_dpo/margin_mean": 12.621658325195312,
"margin_dpo/margin_std": 15.715794563293457,
"step": 508
},
{
"epoch": 0.7694633408919124,
"grad_norm": 39.186466217041016,
"learning_rate": 7.748024712947204e-08,
"logits/chosen": 0.30487918853759766,
"logits/rejected": 0.27709323167800903,
"logps/chosen": -84.89524841308594,
"logps/ref_chosen": -66.2701644897461,
"logps/ref_rejected": -71.73065185546875,
"logps/rejected": -100.41943359375,
"loss": 0.9939,
"margin_dpo/margin_mean": 10.063714981079102,
"margin_dpo/margin_std": 14.282079696655273,
"step": 509
},
{
"epoch": 0.7709750566893424,
"grad_norm": 31.86794090270996,
"learning_rate": 7.652572947447272e-08,
"logits/chosen": 0.48108965158462524,
"logits/rejected": 0.3837711811065674,
"logps/chosen": -72.2618179321289,
"logps/ref_chosen": -53.54487609863281,
"logps/ref_rejected": -91.36649322509766,
"logps/rejected": -121.51242065429688,
"loss": 0.9319,
"margin_dpo/margin_mean": 11.42898941040039,
"margin_dpo/margin_std": 15.287019729614258,
"step": 510
},
{
"epoch": 0.7724867724867724,
"grad_norm": 32.75205993652344,
"learning_rate": 7.557606426772961e-08,
"logits/chosen": 0.39279526472091675,
"logits/rejected": 0.3349749445915222,
"logps/chosen": -75.0731201171875,
"logps/ref_chosen": -55.844390869140625,
"logps/ref_rejected": -86.49819946289062,
"logps/rejected": -120.20086669921875,
"loss": 0.8105,
"margin_dpo/margin_mean": 14.473926544189453,
"margin_dpo/margin_std": 16.571292877197266,
"step": 511
},
{
"epoch": 0.7739984882842026,
"grad_norm": 35.257179260253906,
"learning_rate": 7.463127807341966e-08,
"logits/chosen": 0.25258108973503113,
"logits/rejected": 0.23254463076591492,
"logps/chosen": -80.03028106689453,
"logps/ref_chosen": -61.653038024902344,
"logps/ref_rejected": -72.83148193359375,
"logps/rejected": -102.50459289550781,
"loss": 1.0745,
"margin_dpo/margin_mean": 11.295858383178711,
"margin_dpo/margin_std": 20.00664520263672,
"step": 512
},
{
"epoch": 0.7755102040816326,
"grad_norm": 25.621583938598633,
"learning_rate": 7.369139731924401e-08,
"logits/chosen": 0.49317803978919983,
"logits/rejected": 0.43911677598953247,
"logps/chosen": -67.57891082763672,
"logps/ref_chosen": -50.852561950683594,
"logps/ref_rejected": -69.21754455566406,
"logps/rejected": -97.71588134765625,
"loss": 0.8412,
"margin_dpo/margin_mean": 11.771990776062012,
"margin_dpo/margin_std": 14.104642868041992,
"step": 513
},
{
"epoch": 0.7770219198790628,
"grad_norm": 34.901641845703125,
"learning_rate": 7.275644829568747e-08,
"logits/chosen": 0.4055827260017395,
"logits/rejected": 0.3611418604850769,
"logps/chosen": -88.5246810913086,
"logps/ref_chosen": -69.38493347167969,
"logps/ref_rejected": -83.32447814941406,
"logps/rejected": -113.60870361328125,
"loss": 0.9427,
"margin_dpo/margin_mean": 11.14448356628418,
"margin_dpo/margin_std": 14.856443405151367,
"step": 514
},
{
"epoch": 0.7785336356764928,
"grad_norm": 31.39341926574707,
"learning_rate": 7.182645715528435e-08,
"logits/chosen": 0.3923872411251068,
"logits/rejected": 0.30827796459198,
"logps/chosen": -74.15292358398438,
"logps/ref_chosen": -53.687034606933594,
"logps/ref_rejected": -83.59614562988281,
"logps/rejected": -116.03274536132812,
"loss": 0.8749,
"margin_dpo/margin_mean": 11.970718383789062,
"margin_dpo/margin_std": 14.525394439697266,
"step": 515
},
{
"epoch": 0.780045351473923,
"grad_norm": 33.65007781982422,
"learning_rate": 7.090144991188568e-08,
"logits/chosen": 0.35884907841682434,
"logits/rejected": 0.30560222268104553,
"logps/chosen": -73.98365783691406,
"logps/ref_chosen": -56.9017219543457,
"logps/ref_rejected": -67.83477783203125,
"logps/rejected": -94.897705078125,
"loss": 0.9889,
"margin_dpo/margin_mean": 9.980987548828125,
"margin_dpo/margin_std": 14.448881149291992,
"step": 516
},
{
"epoch": 0.781557067271353,
"grad_norm": 31.95818519592285,
"learning_rate": 6.998145243993284e-08,
"logits/chosen": 0.40695106983184814,
"logits/rejected": 0.3971271812915802,
"logps/chosen": -82.69999694824219,
"logps/ref_chosen": -61.775142669677734,
"logps/ref_rejected": -62.88270950317383,
"logps/rejected": -94.34140014648438,
"loss": 0.9838,
"margin_dpo/margin_mean": 10.533843994140625,
"margin_dpo/margin_std": 15.69892692565918,
"step": 517
},
{
"epoch": 0.783068783068783,
"grad_norm": 30.663984298706055,
"learning_rate": 6.906649047373245e-08,
"logits/chosen": 0.37528085708618164,
"logits/rejected": 0.32002222537994385,
"logps/chosen": -79.28578186035156,
"logps/ref_chosen": -62.025230407714844,
"logps/ref_rejected": -79.06085205078125,
"logps/rejected": -105.47918701171875,
"loss": 1.0703,
"margin_dpo/margin_mean": 9.15778923034668,
"margin_dpo/margin_std": 15.571455001831055,
"step": 518
},
{
"epoch": 0.7845804988662132,
"grad_norm": 44.80562973022461,
"learning_rate": 6.815658960673781e-08,
"logits/chosen": 0.4019656777381897,
"logits/rejected": 0.34249573945999146,
"logps/chosen": -82.74702453613281,
"logps/ref_chosen": -61.60636901855469,
"logps/ref_rejected": -74.50727844238281,
"logps/rejected": -102.14393615722656,
"loss": 1.4077,
"margin_dpo/margin_mean": 6.495992660522461,
"margin_dpo/margin_std": 17.53131103515625,
"step": 519
},
{
"epoch": 0.7860922146636432,
"grad_norm": 32.09324264526367,
"learning_rate": 6.725177529083209e-08,
"logits/chosen": 0.4463905394077301,
"logits/rejected": 0.38504666090011597,
"logps/chosen": -80.70368194580078,
"logps/ref_chosen": -62.87343215942383,
"logps/ref_rejected": -76.505615234375,
"logps/rejected": -104.06520080566406,
"loss": 0.927,
"margin_dpo/margin_mean": 9.729334831237793,
"margin_dpo/margin_std": 12.515774726867676,
"step": 520
},
{
"epoch": 0.7876039304610734,
"grad_norm": 34.64705276489258,
"learning_rate": 6.63520728356167e-08,
"logits/chosen": 0.2728666663169861,
"logits/rejected": 0.19291679561138153,
"logps/chosen": -82.63397216796875,
"logps/ref_chosen": -64.20668029785156,
"logps/ref_rejected": -92.28083038330078,
"logps/rejected": -121.74046325683594,
"loss": 0.8833,
"margin_dpo/margin_mean": 11.032337188720703,
"margin_dpo/margin_std": 13.831937789916992,
"step": 521
},
{
"epoch": 0.7891156462585034,
"grad_norm": 36.75823974609375,
"learning_rate": 6.545750740770336e-08,
"logits/chosen": 0.3487527370452881,
"logits/rejected": 0.33449989557266235,
"logps/chosen": -75.84567260742188,
"logps/ref_chosen": -58.36972427368164,
"logps/ref_rejected": -68.79248046875,
"logps/rejected": -96.63177490234375,
"loss": 1.1404,
"margin_dpo/margin_mean": 10.363338470458984,
"margin_dpo/margin_std": 17.992464065551758,
"step": 522
},
{
"epoch": 0.7906273620559335,
"grad_norm": 41.78493881225586,
"learning_rate": 6.456810403001012e-08,
"logits/chosen": 0.4005553722381592,
"logits/rejected": 0.2822886109352112,
"logps/chosen": -85.6712417602539,
"logps/ref_chosen": -65.71324157714844,
"logps/ref_rejected": -91.98896789550781,
"logps/rejected": -124.408447265625,
"loss": 0.9277,
"margin_dpo/margin_mean": 12.461475372314453,
"margin_dpo/margin_std": 15.705230712890625,
"step": 523
},
{
"epoch": 0.7921390778533636,
"grad_norm": 36.14333724975586,
"learning_rate": 6.368388758106134e-08,
"logits/chosen": 0.2999478578567505,
"logits/rejected": 0.2743346095085144,
"logps/chosen": -92.46823120117188,
"logps/ref_chosen": -76.35124969482422,
"logps/ref_rejected": -89.96072387695312,
"logps/rejected": -114.48500061035156,
"loss": 1.0375,
"margin_dpo/margin_mean": 8.407294273376465,
"margin_dpo/margin_std": 13.35383129119873,
"step": 524
},
{
"epoch": 0.7936507936507936,
"grad_norm": 35.24806213378906,
"learning_rate": 6.280488279429185e-08,
"logits/chosen": 0.20016852021217346,
"logits/rejected": 0.1862761676311493,
"logps/chosen": -93.02354431152344,
"logps/ref_chosen": -75.49578857421875,
"logps/ref_rejected": -84.04852294921875,
"logps/rejected": -109.72943115234375,
"loss": 1.0159,
"margin_dpo/margin_mean": 8.153154373168945,
"margin_dpo/margin_std": 12.372773170471191,
"step": 525
},
{
"epoch": 0.7951625094482238,
"grad_norm": 34.34667205810547,
"learning_rate": 6.193111425735515e-08,
"logits/chosen": 0.3684360086917877,
"logits/rejected": 0.30142414569854736,
"logps/chosen": -80.24913024902344,
"logps/ref_chosen": -61.29241943359375,
"logps/ref_rejected": -82.47763061523438,
"logps/rejected": -109.65870666503906,
"loss": 1.0505,
"margin_dpo/margin_mean": 8.224379539489746,
"margin_dpo/margin_std": 12.520376205444336,
"step": 526
},
{
"epoch": 0.7966742252456538,
"grad_norm": 41.396583557128906,
"learning_rate": 6.106260641143546e-08,
"logits/chosen": 0.45549625158309937,
"logits/rejected": 0.38120344281196594,
"logps/chosen": -81.03947448730469,
"logps/ref_chosen": -61.47262954711914,
"logps/ref_rejected": -90.52831268310547,
"logps/rejected": -117.31925964355469,
"loss": 1.172,
"margin_dpo/margin_mean": 7.224104881286621,
"margin_dpo/margin_std": 14.417116165161133,
"step": 527
},
{
"epoch": 0.7981859410430839,
"grad_norm": 36.54910659790039,
"learning_rate": 6.019938355056422e-08,
"logits/chosen": 0.26845669746398926,
"logits/rejected": 0.18720799684524536,
"logps/chosen": -77.49993896484375,
"logps/ref_chosen": -58.792015075683594,
"logps/ref_rejected": -71.82516479492188,
"logps/rejected": -97.82785034179688,
"loss": 1.2063,
"margin_dpo/margin_mean": 7.294772148132324,
"margin_dpo/margin_std": 14.19528579711914,
"step": 528
},
{
"epoch": 0.799697656840514,
"grad_norm": 25.931058883666992,
"learning_rate": 5.934146982094049e-08,
"logits/chosen": 0.2855815887451172,
"logits/rejected": 0.22274532914161682,
"logps/chosen": -72.43391418457031,
"logps/ref_chosen": -55.070960998535156,
"logps/ref_rejected": -75.44007110595703,
"logps/rejected": -108.48625946044922,
"loss": 0.7267,
"margin_dpo/margin_mean": 15.683242797851562,
"margin_dpo/margin_std": 15.202524185180664,
"step": 529
},
{
"epoch": 0.8012093726379441,
"grad_norm": 28.412240982055664,
"learning_rate": 5.848888922025552e-08,
"logits/chosen": 0.3692162334918976,
"logits/rejected": 0.32501786947250366,
"logps/chosen": -74.69463348388672,
"logps/ref_chosen": -56.743812561035156,
"logps/ref_rejected": -76.6692123413086,
"logps/rejected": -104.58545684814453,
"loss": 0.9058,
"margin_dpo/margin_mean": 9.965425491333008,
"margin_dpo/margin_std": 12.44455337524414,
"step": 530
},
{
"epoch": 0.8027210884353742,
"grad_norm": 35.59648132324219,
"learning_rate": 5.7641665597021435e-08,
"logits/chosen": 0.3428173065185547,
"logits/rejected": 0.2750357389450073,
"logps/chosen": -69.8387680053711,
"logps/ref_chosen": -51.116458892822266,
"logps/ref_rejected": -79.52884674072266,
"logps/rejected": -106.50172424316406,
"loss": 1.084,
"margin_dpo/margin_mean": 8.250574111938477,
"margin_dpo/margin_std": 13.591650009155273,
"step": 531
},
{
"epoch": 0.8042328042328042,
"grad_norm": 30.03083610534668,
"learning_rate": 5.679982264990424e-08,
"logits/chosen": 0.28468602895736694,
"logits/rejected": 0.22941389679908752,
"logps/chosen": -78.22216796875,
"logps/ref_chosen": -58.279945373535156,
"logps/ref_rejected": -78.05426788330078,
"logps/rejected": -108.97996520996094,
"loss": 0.9567,
"margin_dpo/margin_mean": 10.983474731445312,
"margin_dpo/margin_std": 15.372318267822266,
"step": 532
},
{
"epoch": 0.8057445200302343,
"grad_norm": 30.32957649230957,
"learning_rate": 5.596338392706076e-08,
"logits/chosen": 0.4587961733341217,
"logits/rejected": 0.39391863346099854,
"logps/chosen": -71.87660217285156,
"logps/ref_chosen": -56.41801452636719,
"logps/ref_rejected": -73.89324951171875,
"logps/rejected": -99.58336639404297,
"loss": 0.9828,
"margin_dpo/margin_mean": 10.23153305053711,
"margin_dpo/margin_std": 14.516399383544922,
"step": 533
},
{
"epoch": 0.8072562358276644,
"grad_norm": 32.21763229370117,
"learning_rate": 5.513237282548033e-08,
"logits/chosen": 0.35069578886032104,
"logits/rejected": 0.3038621246814728,
"logps/chosen": -77.76612091064453,
"logps/ref_chosen": -60.748687744140625,
"logps/ref_rejected": -73.8623046875,
"logps/rejected": -100.79328155517578,
"loss": 1.0124,
"margin_dpo/margin_mean": 9.913549423217773,
"margin_dpo/margin_std": 14.57766342163086,
"step": 534
},
{
"epoch": 0.8087679516250945,
"grad_norm": 32.92207336425781,
"learning_rate": 5.430681259032957e-08,
"logits/chosen": 0.25479215383529663,
"logits/rejected": 0.18896648287773132,
"logps/chosen": -81.41650390625,
"logps/ref_chosen": -61.637413024902344,
"logps/ref_rejected": -80.93138122558594,
"logps/rejected": -110.5506591796875,
"loss": 1.0647,
"margin_dpo/margin_mean": 9.84018325805664,
"margin_dpo/margin_std": 16.032939910888672,
"step": 535
},
{
"epoch": 0.8102796674225246,
"grad_norm": 27.200687408447266,
"learning_rate": 5.3486726314303175e-08,
"logits/chosen": 0.39205577969551086,
"logits/rejected": 0.303525447845459,
"logps/chosen": -69.04838562011719,
"logps/ref_chosen": -51.888973236083984,
"logps/ref_rejected": -73.34864044189453,
"logps/rejected": -102.1460189819336,
"loss": 0.7897,
"margin_dpo/margin_mean": 11.637961387634277,
"margin_dpo/margin_std": 12.476455688476562,
"step": 536
},
{
"epoch": 0.8117913832199547,
"grad_norm": 30.757015228271484,
"learning_rate": 5.267213693697695e-08,
"logits/chosen": 0.43932461738586426,
"logits/rejected": 0.3489525020122528,
"logps/chosen": -73.9029769897461,
"logps/ref_chosen": -54.248619079589844,
"logps/ref_rejected": -94.94343566894531,
"logps/rejected": -124.87551879882812,
"loss": 1.0542,
"margin_dpo/margin_mean": 10.277728080749512,
"margin_dpo/margin_std": 15.779983520507812,
"step": 537
},
{
"epoch": 0.8133030990173847,
"grad_norm": 33.307640075683594,
"learning_rate": 5.1863067244167144e-08,
"logits/chosen": 0.3416905403137207,
"logits/rejected": 0.3030615448951721,
"logps/chosen": -89.13497924804688,
"logps/ref_chosen": -70.09354400634766,
"logps/ref_rejected": -79.49833679199219,
"logps/rejected": -111.39437866210938,
"loss": 0.8039,
"margin_dpo/margin_mean": 12.854602813720703,
"margin_dpo/margin_std": 14.327655792236328,
"step": 538
},
{
"epoch": 0.8148148148148148,
"grad_norm": 32.713741302490234,
"learning_rate": 5.105953986729195e-08,
"logits/chosen": 0.336628258228302,
"logits/rejected": 0.2574174106121063,
"logps/chosen": -80.40707397460938,
"logps/ref_chosen": -61.93169403076172,
"logps/ref_rejected": -84.08946228027344,
"logps/rejected": -113.2166748046875,
"loss": 0.8848,
"margin_dpo/margin_mean": 10.651832580566406,
"margin_dpo/margin_std": 13.286481857299805,
"step": 539
},
{
"epoch": 0.8163265306122449,
"grad_norm": 31.55617332458496,
"learning_rate": 5.026157728273966e-08,
"logits/chosen": 0.38173389434814453,
"logits/rejected": 0.28644853830337524,
"logps/chosen": -80.31228637695312,
"logps/ref_chosen": -62.70425033569336,
"logps/ref_rejected": -95.63597106933594,
"logps/rejected": -126.89846801757812,
"loss": 0.7723,
"margin_dpo/margin_mean": 13.654460906982422,
"margin_dpo/margin_std": 14.235689163208008,
"step": 540
},
{
"epoch": 0.817838246409675,
"grad_norm": 32.464759826660156,
"learning_rate": 4.9469201811239035e-08,
"logits/chosen": 0.3611023426055908,
"logits/rejected": 0.36431825160980225,
"logps/chosen": -79.27033996582031,
"logps/ref_chosen": -62.48084259033203,
"logps/ref_rejected": -57.55541229248047,
"logps/rejected": -86.85520935058594,
"loss": 0.8684,
"margin_dpo/margin_mean": 12.510297775268555,
"margin_dpo/margin_std": 14.803916931152344,
"step": 541
},
{
"epoch": 0.8193499622071051,
"grad_norm": 27.37916374206543,
"learning_rate": 4.868243561723534e-08,
"logits/chosen": 0.41390740871429443,
"logits/rejected": 0.35542088747024536,
"logps/chosen": -64.57349395751953,
"logps/ref_chosen": -49.454891204833984,
"logps/ref_rejected": -65.33275604248047,
"logps/rejected": -93.30146789550781,
"loss": 0.8766,
"margin_dpo/margin_mean": 12.850105285644531,
"margin_dpo/margin_std": 15.956599235534668,
"step": 542
},
{
"epoch": 0.8208616780045351,
"grad_norm": 25.401742935180664,
"learning_rate": 4.790130070827028e-08,
"logits/chosen": 0.3546418845653534,
"logits/rejected": 0.26434922218322754,
"logps/chosen": -68.28006744384766,
"logps/ref_chosen": -51.10085678100586,
"logps/ref_rejected": -76.06130981445312,
"logps/rejected": -104.24705505371094,
"loss": 0.9126,
"margin_dpo/margin_mean": 11.006534576416016,
"margin_dpo/margin_std": 14.493851661682129,
"step": 543
},
{
"epoch": 0.8223733938019653,
"grad_norm": 26.264808654785156,
"learning_rate": 4.7125818934366454e-08,
"logits/chosen": 0.3526231348514557,
"logits/rejected": 0.2759806513786316,
"logps/chosen": -77.70940399169922,
"logps/ref_chosen": -60.2772331237793,
"logps/ref_rejected": -88.40553283691406,
"logps/rejected": -120.80406188964844,
"loss": 0.8164,
"margin_dpo/margin_mean": 14.96635627746582,
"margin_dpo/margin_std": 17.501577377319336,
"step": 544
},
{
"epoch": 0.8238851095993953,
"grad_norm": 34.545066833496094,
"learning_rate": 4.635601198741607e-08,
"logits/chosen": 0.31289103627204895,
"logits/rejected": 0.2516869604587555,
"logps/chosen": -80.18997955322266,
"logps/ref_chosen": -61.61524963378906,
"logps/ref_rejected": -78.71266174316406,
"logps/rejected": -107.26792907714844,
"loss": 1.0111,
"margin_dpo/margin_mean": 9.980533599853516,
"margin_dpo/margin_std": 15.094192504882812,
"step": 545
},
{
"epoch": 0.8253968253968254,
"grad_norm": 36.85138702392578,
"learning_rate": 4.559190140057428e-08,
"logits/chosen": 0.41092002391815186,
"logits/rejected": 0.3929000794887543,
"logps/chosen": -77.4332504272461,
"logps/ref_chosen": -59.313262939453125,
"logps/ref_rejected": -64.73631286621094,
"logps/rejected": -92.99637603759766,
"loss": 0.9744,
"margin_dpo/margin_mean": 10.140082359313965,
"margin_dpo/margin_std": 13.980596542358398,
"step": 546
},
{
"epoch": 0.8269085411942555,
"grad_norm": 28.803686141967773,
"learning_rate": 4.483350854765672e-08,
"logits/chosen": 0.3148411214351654,
"logits/rejected": 0.2440166473388672,
"logps/chosen": -70.45671081542969,
"logps/ref_chosen": -54.97674560546875,
"logps/ref_rejected": -75.35922241210938,
"logps/rejected": -102.77980041503906,
"loss": 0.8617,
"margin_dpo/margin_mean": 11.940618515014648,
"margin_dpo/margin_std": 14.236207008361816,
"step": 547
},
{
"epoch": 0.8284202569916855,
"grad_norm": 32.73234176635742,
"learning_rate": 4.4080854642541826e-08,
"logits/chosen": 0.2752930223941803,
"logits/rejected": 0.20368722081184387,
"logps/chosen": -82.19081115722656,
"logps/ref_chosen": -63.21067810058594,
"logps/ref_rejected": -81.23347473144531,
"logps/rejected": -109.26484680175781,
"loss": 1.0192,
"margin_dpo/margin_mean": 9.05124282836914,
"margin_dpo/margin_std": 14.42248249053955,
"step": 548
},
{
"epoch": 0.8299319727891157,
"grad_norm": 38.426536560058594,
"learning_rate": 4.333396073857723e-08,
"logits/chosen": 0.4287411570549011,
"logits/rejected": 0.36311283707618713,
"logps/chosen": -81.94154357910156,
"logps/ref_chosen": -64.27351379394531,
"logps/ref_rejected": -92.31663513183594,
"logps/rejected": -119.1673583984375,
"loss": 1.0566,
"margin_dpo/margin_mean": 9.182695388793945,
"margin_dpo/margin_std": 14.17637825012207,
"step": 549
},
{
"epoch": 0.8314436885865457,
"grad_norm": 30.786033630371094,
"learning_rate": 4.259284772799099e-08,
"logits/chosen": 0.39032119512557983,
"logits/rejected": 0.35575926303863525,
"logps/chosen": -75.25477600097656,
"logps/ref_chosen": -56.230438232421875,
"logps/ref_rejected": -62.59788513183594,
"logps/rejected": -89.64376831054688,
"loss": 1.0733,
"margin_dpo/margin_mean": 8.02153205871582,
"margin_dpo/margin_std": 13.34286117553711,
"step": 550
},
{
"epoch": 0.8329554043839759,
"grad_norm": 30.042922973632812,
"learning_rate": 4.1857536341307176e-08,
"logits/chosen": 0.41295182704925537,
"logits/rejected": 0.37483125925064087,
"logps/chosen": -85.90719604492188,
"logps/ref_chosen": -67.74720764160156,
"logps/ref_rejected": -87.04285430908203,
"logps/rejected": -114.77220916748047,
"loss": 0.9587,
"margin_dpo/margin_mean": 9.569366455078125,
"margin_dpo/margin_std": 13.855855941772461,
"step": 551
},
{
"epoch": 0.8344671201814059,
"grad_norm": 32.260215759277344,
"learning_rate": 4.112804714676593e-08,
"logits/chosen": 0.34446731209754944,
"logits/rejected": 0.2876893877983093,
"logps/chosen": -80.09131622314453,
"logps/ref_chosen": -62.92626190185547,
"logps/ref_rejected": -82.983642578125,
"logps/rejected": -111.53181457519531,
"loss": 0.869,
"margin_dpo/margin_mean": 11.383108139038086,
"margin_dpo/margin_std": 13.217905044555664,
"step": 552
},
{
"epoch": 0.8359788359788359,
"grad_norm": 35.45354080200195,
"learning_rate": 4.0404400549748144e-08,
"logits/chosen": 0.33471113443374634,
"logits/rejected": 0.22613003849983215,
"logps/chosen": -76.17123413085938,
"logps/ref_chosen": -56.038490295410156,
"logps/ref_rejected": -84.48454284667969,
"logps/rejected": -114.52032470703125,
"loss": 1.1435,
"margin_dpo/margin_mean": 9.903047561645508,
"margin_dpo/margin_std": 18.003122329711914,
"step": 553
},
{
"epoch": 0.8374905517762661,
"grad_norm": 36.615055084228516,
"learning_rate": 3.968661679220467e-08,
"logits/chosen": 0.3139854371547699,
"logits/rejected": 0.2850970923900604,
"logps/chosen": -83.22395324707031,
"logps/ref_chosen": -64.53059387207031,
"logps/ref_rejected": -71.21560668945312,
"logps/rejected": -100.5650405883789,
"loss": 0.9951,
"margin_dpo/margin_mean": 10.656076431274414,
"margin_dpo/margin_std": 15.991934776306152,
"step": 554
},
{
"epoch": 0.8390022675736961,
"grad_norm": 36.042789459228516,
"learning_rate": 3.89747159520904e-08,
"logits/chosen": 0.3471308946609497,
"logits/rejected": 0.3114258050918579,
"logps/chosen": -87.32560729980469,
"logps/ref_chosen": -66.65191650390625,
"logps/ref_rejected": -68.6667251586914,
"logps/rejected": -99.08800506591797,
"loss": 1.0254,
"margin_dpo/margin_mean": 9.74759292602539,
"margin_dpo/margin_std": 13.942052841186523,
"step": 555
},
{
"epoch": 0.8405139833711263,
"grad_norm": 32.9485969543457,
"learning_rate": 3.826871794280192e-08,
"logits/chosen": 0.38114964962005615,
"logits/rejected": 0.3249025344848633,
"logps/chosen": -73.74171447753906,
"logps/ref_chosen": -52.832366943359375,
"logps/ref_rejected": -64.49044036865234,
"logps/rejected": -94.696533203125,
"loss": 1.1283,
"margin_dpo/margin_mean": 9.296748161315918,
"margin_dpo/margin_std": 16.70484161376953,
"step": 556
},
{
"epoch": 0.8420256991685563,
"grad_norm": 31.57485580444336,
"learning_rate": 3.756864251262143e-08,
"logits/chosen": 0.48908981680870056,
"logits/rejected": 0.4027714729309082,
"logps/chosen": -74.70928955078125,
"logps/ref_chosen": -55.035980224609375,
"logps/ref_rejected": -75.80644989013672,
"logps/rejected": -107.88850402832031,
"loss": 0.8341,
"margin_dpo/margin_mean": 12.40875244140625,
"margin_dpo/margin_std": 14.594932556152344,
"step": 557
},
{
"epoch": 0.8435374149659864,
"grad_norm": 32.328819274902344,
"learning_rate": 3.687450924416341e-08,
"logits/chosen": 0.42616766691207886,
"logits/rejected": 0.36569124460220337,
"logps/chosen": -80.0509033203125,
"logps/ref_chosen": -63.226348876953125,
"logps/ref_rejected": -91.46881866455078,
"logps/rejected": -120.33204650878906,
"loss": 0.8683,
"margin_dpo/margin_mean": 12.038671493530273,
"margin_dpo/margin_std": 15.140554428100586,
"step": 558
},
{
"epoch": 0.8450491307634165,
"grad_norm": 32.156707763671875,
"learning_rate": 3.6186337553827743e-08,
"logits/chosen": 0.3285496234893799,
"logits/rejected": 0.2561686336994171,
"logps/chosen": -80.550048828125,
"logps/ref_chosen": -61.521644592285156,
"logps/ref_rejected": -82.83859252929688,
"logps/rejected": -113.366943359375,
"loss": 0.9861,
"margin_dpo/margin_mean": 11.49993896484375,
"margin_dpo/margin_std": 16.700138092041016,
"step": 559
},
{
"epoch": 0.8465608465608465,
"grad_norm": 32.59392166137695,
"learning_rate": 3.550414669125573e-08,
"logits/chosen": 0.36145317554473877,
"logits/rejected": 0.318742573261261,
"logps/chosen": -79.90272521972656,
"logps/ref_chosen": -60.64122009277344,
"logps/ref_rejected": -78.75474548339844,
"logps/rejected": -108.12406921386719,
"loss": 1.0135,
"margin_dpo/margin_mean": 10.107817649841309,
"margin_dpo/margin_std": 14.901373863220215,
"step": 560
},
{
"epoch": 0.8480725623582767,
"grad_norm": 28.553300857543945,
"learning_rate": 3.482795573879241e-08,
"logits/chosen": 0.35189008712768555,
"logits/rejected": 0.3156622648239136,
"logps/chosen": -78.87603759765625,
"logps/ref_chosen": -62.49860382080078,
"logps/ref_rejected": -78.72064208984375,
"logps/rejected": -106.23121643066406,
"loss": 0.9285,
"margin_dpo/margin_mean": 11.133148193359375,
"margin_dpo/margin_std": 15.419248580932617,
"step": 561
},
{
"epoch": 0.8495842781557067,
"grad_norm": 29.942720413208008,
"learning_rate": 3.415778361095226e-08,
"logits/chosen": 0.3795633912086487,
"logits/rejected": 0.3341631591320038,
"logps/chosen": -93.68280029296875,
"logps/ref_chosen": -74.78173828125,
"logps/ref_rejected": -92.63499450683594,
"logps/rejected": -124.07745361328125,
"loss": 0.8497,
"margin_dpo/margin_mean": 12.541391372680664,
"margin_dpo/margin_std": 14.615842819213867,
"step": 562
},
{
"epoch": 0.8510959939531368,
"grad_norm": 34.505672454833984,
"learning_rate": 3.349364905389032e-08,
"logits/chosen": 0.4309301972389221,
"logits/rejected": 0.373563289642334,
"logps/chosen": -67.70144653320312,
"logps/ref_chosen": -50.19850158691406,
"logps/ref_rejected": -66.76687622070312,
"logps/rejected": -95.26081848144531,
"loss": 1.0219,
"margin_dpo/margin_mean": 10.990997314453125,
"margin_dpo/margin_std": 16.4418888092041,
"step": 563
},
{
"epoch": 0.8526077097505669,
"grad_norm": 28.641733169555664,
"learning_rate": 3.283557064487785e-08,
"logits/chosen": 0.319988489151001,
"logits/rejected": 0.284395694732666,
"logps/chosen": -72.115478515625,
"logps/ref_chosen": -55.7408447265625,
"logps/ref_rejected": -74.8232421875,
"logps/rejected": -104.1437759399414,
"loss": 0.8346,
"margin_dpo/margin_mean": 12.945907592773438,
"margin_dpo/margin_std": 14.922258377075195,
"step": 564
},
{
"epoch": 0.854119425547997,
"grad_norm": 35.68989944458008,
"learning_rate": 3.218356679178252e-08,
"logits/chosen": 0.40092119574546814,
"logits/rejected": 0.34424033761024475,
"logps/chosen": -79.51023864746094,
"logps/ref_chosen": -58.33738327026367,
"logps/ref_rejected": -78.31776428222656,
"logps/rejected": -109.12625122070312,
"loss": 1.0348,
"margin_dpo/margin_mean": 9.63563346862793,
"margin_dpo/margin_std": 14.547908782958984,
"step": 565
},
{
"epoch": 0.8556311413454271,
"grad_norm": 40.661415100097656,
"learning_rate": 3.1537655732553764e-08,
"logits/chosen": 0.38034552335739136,
"logits/rejected": 0.35432299971580505,
"logps/chosen": -89.7762451171875,
"logps/ref_chosen": -71.22373962402344,
"logps/ref_rejected": -71.11601257324219,
"logps/rejected": -99.64232635498047,
"loss": 1.1097,
"margin_dpo/margin_mean": 9.973814010620117,
"margin_dpo/margin_std": 16.615154266357422,
"step": 566
},
{
"epoch": 0.8571428571428571,
"grad_norm": 27.573196411132812,
"learning_rate": 3.089785553471233e-08,
"logits/chosen": 0.3880395293235779,
"logits/rejected": 0.2916451096534729,
"logps/chosen": -71.2606201171875,
"logps/ref_chosen": -52.669273376464844,
"logps/ref_rejected": -74.34785461425781,
"logps/rejected": -104.5632553100586,
"loss": 0.8334,
"margin_dpo/margin_mean": 11.62405014038086,
"margin_dpo/margin_std": 13.137116432189941,
"step": 567
},
{
"epoch": 0.8586545729402872,
"grad_norm": 27.940860748291016,
"learning_rate": 3.026418409484513e-08,
"logits/chosen": 0.4033098816871643,
"logits/rejected": 0.31375181674957275,
"logps/chosen": -69.65806579589844,
"logps/ref_chosen": -52.178001403808594,
"logps/ref_rejected": -85.8277587890625,
"logps/rejected": -116.09062194824219,
"loss": 0.7742,
"margin_dpo/margin_mean": 12.78278923034668,
"margin_dpo/margin_std": 13.336824417114258,
"step": 568
},
{
"epoch": 0.8601662887377173,
"grad_norm": 31.390438079833984,
"learning_rate": 2.963665913810451e-08,
"logits/chosen": 0.27315863966941833,
"logits/rejected": 0.24073369801044464,
"logps/chosen": -81.84532165527344,
"logps/ref_chosen": -62.649253845214844,
"logps/ref_rejected": -75.4298324584961,
"logps/rejected": -104.8544921875,
"loss": 0.9563,
"margin_dpo/margin_mean": 10.228591918945312,
"margin_dpo/margin_std": 13.502737998962402,
"step": 569
},
{
"epoch": 0.8616780045351474,
"grad_norm": 26.328510284423828,
"learning_rate": 2.9015298217712453e-08,
"logits/chosen": 0.3102429509162903,
"logits/rejected": 0.22179409861564636,
"logps/chosen": -66.96505737304688,
"logps/ref_chosen": -50.04179763793945,
"logps/ref_rejected": -78.27146911621094,
"logps/rejected": -109.94290161132812,
"loss": 0.7166,
"margin_dpo/margin_mean": 14.748177528381348,
"margin_dpo/margin_std": 14.456729888916016,
"step": 570
},
{
"epoch": 0.8631897203325775,
"grad_norm": 33.18550491333008,
"learning_rate": 2.840011871446962e-08,
"logits/chosen": 0.34274426102638245,
"logits/rejected": 0.3042169213294983,
"logps/chosen": -72.50775909423828,
"logps/ref_chosen": -53.65681457519531,
"logps/ref_rejected": -66.13298034667969,
"logps/rejected": -93.20866394042969,
"loss": 1.0793,
"margin_dpo/margin_mean": 8.224736213684082,
"margin_dpo/margin_std": 13.317925453186035,
"step": 571
},
{
"epoch": 0.8647014361300076,
"grad_norm": 38.28977966308594,
"learning_rate": 2.7791137836269158e-08,
"logits/chosen": 0.38000768423080444,
"logits/rejected": 0.4129163324832916,
"logps/chosen": -93.44584655761719,
"logps/ref_chosen": -74.81793212890625,
"logps/ref_rejected": -65.88681030273438,
"logps/rejected": -94.55876922607422,
"loss": 0.9528,
"margin_dpo/margin_mean": 10.044036865234375,
"margin_dpo/margin_std": 13.885844230651855,
"step": 572
},
{
"epoch": 0.8662131519274376,
"grad_norm": 39.81300735473633,
"learning_rate": 2.718837261761528e-08,
"logits/chosen": 0.35131800174713135,
"logits/rejected": 0.30365169048309326,
"logps/chosen": -89.20417785644531,
"logps/ref_chosen": -68.72564697265625,
"logps/ref_rejected": -88.16201782226562,
"logps/rejected": -119.7973403930664,
"loss": 1.0868,
"margin_dpo/margin_mean": 11.156793594360352,
"margin_dpo/margin_std": 18.632854461669922,
"step": 573
},
{
"epoch": 0.8677248677248677,
"grad_norm": 27.587465286254883,
"learning_rate": 2.659183991914696e-08,
"logits/chosen": 0.4093438684940338,
"logits/rejected": 0.3355673551559448,
"logps/chosen": -75.17575073242188,
"logps/ref_chosen": -56.31340026855469,
"logps/ref_rejected": -83.91553497314453,
"logps/rejected": -115.10252380371094,
"loss": 0.763,
"margin_dpo/margin_mean": 12.324638366699219,
"margin_dpo/margin_std": 12.423215866088867,
"step": 574
},
{
"epoch": 0.8692365835222978,
"grad_norm": 37.5849609375,
"learning_rate": 2.600155642716606e-08,
"logits/chosen": 0.4293455481529236,
"logits/rejected": 0.34858548641204834,
"logps/chosen": -82.68851470947266,
"logps/ref_chosen": -64.5841293334961,
"logps/ref_rejected": -93.47034454345703,
"logps/rejected": -120.43289947509766,
"loss": 1.1319,
"margin_dpo/margin_mean": 8.858168601989746,
"margin_dpo/margin_std": 15.557498931884766,
"step": 575
},
{
"epoch": 0.8707482993197279,
"grad_norm": 33.85169219970703,
"learning_rate": 2.5417538653170754e-08,
"logits/chosen": 0.42376574873924255,
"logits/rejected": 0.3154095411300659,
"logps/chosen": -69.3154296875,
"logps/ref_chosen": -53.28052520751953,
"logps/ref_rejected": -84.20004272460938,
"logps/rejected": -111.85845184326172,
"loss": 0.8868,
"margin_dpo/margin_mean": 11.62350082397461,
"margin_dpo/margin_std": 14.447968482971191,
"step": 576
},
{
"epoch": 0.872260015117158,
"grad_norm": 34.81472396850586,
"learning_rate": 2.4839802933393607e-08,
"logits/chosen": 0.3261626362800598,
"logits/rejected": 0.303744912147522,
"logps/chosen": -80.17160034179688,
"logps/ref_chosen": -62.32469177246094,
"logps/ref_rejected": -67.300537109375,
"logps/rejected": -93.61477661132812,
"loss": 1.1176,
"margin_dpo/margin_mean": 8.467338562011719,
"margin_dpo/margin_std": 15.134763717651367,
"step": 577
},
{
"epoch": 0.873771730914588,
"grad_norm": 32.825374603271484,
"learning_rate": 2.4268365428344733e-08,
"logits/chosen": 0.42811548709869385,
"logits/rejected": 0.39781272411346436,
"logps/chosen": -74.832275390625,
"logps/ref_chosen": -56.65557861328125,
"logps/ref_rejected": -68.21835327148438,
"logps/rejected": -94.33741760253906,
"loss": 1.1023,
"margin_dpo/margin_mean": 7.942363739013672,
"margin_dpo/margin_std": 14.076078414916992,
"step": 578
},
{
"epoch": 0.8752834467120182,
"grad_norm": 28.52338981628418,
"learning_rate": 2.3703242122359357e-08,
"logits/chosen": 0.30040955543518066,
"logits/rejected": 0.26452547311782837,
"logps/chosen": -75.03377532958984,
"logps/ref_chosen": -56.809661865234375,
"logps/ref_rejected": -68.09613037109375,
"logps/rejected": -100.4896240234375,
"loss": 0.7615,
"margin_dpo/margin_mean": 14.169382095336914,
"margin_dpo/margin_std": 15.500885009765625,
"step": 579
},
{
"epoch": 0.8767951625094482,
"grad_norm": 33.52607345581055,
"learning_rate": 2.3144448823151392e-08,
"logits/chosen": 0.33850836753845215,
"logits/rejected": 0.28215405344963074,
"logps/chosen": -76.02555847167969,
"logps/ref_chosen": -57.70011520385742,
"logps/ref_rejected": -77.90664672851562,
"logps/rejected": -106.16627502441406,
"loss": 1.037,
"margin_dpo/margin_mean": 9.934186935424805,
"margin_dpo/margin_std": 15.274032592773438,
"step": 580
},
{
"epoch": 0.8783068783068783,
"grad_norm": 37.067745208740234,
"learning_rate": 2.259200116137039e-08,
"logits/chosen": 0.38778138160705566,
"logits/rejected": 0.32042139768600464,
"logps/chosen": -79.5643539428711,
"logps/ref_chosen": -59.332359313964844,
"logps/ref_rejected": -83.64482116699219,
"logps/rejected": -112.95232391357422,
"loss": 1.0798,
"margin_dpo/margin_mean": 9.075504302978516,
"margin_dpo/margin_std": 15.379884719848633,
"step": 581
},
{
"epoch": 0.8798185941043084,
"grad_norm": 32.96426010131836,
"learning_rate": 2.204591459016525e-08,
"logits/chosen": 0.3775950074195862,
"logits/rejected": 0.3984692394733429,
"logps/chosen": -82.11659240722656,
"logps/ref_chosen": -64.16285705566406,
"logps/ref_rejected": -58.632896423339844,
"logps/rejected": -86.89315795898438,
"loss": 0.9115,
"margin_dpo/margin_mean": 10.306524276733398,
"margin_dpo/margin_std": 13.311269760131836,
"step": 582
},
{
"epoch": 0.8813303099017384,
"grad_norm": 35.847442626953125,
"learning_rate": 2.1506204384751064e-08,
"logits/chosen": 0.46830427646636963,
"logits/rejected": 0.35817262530326843,
"logps/chosen": -69.75498962402344,
"logps/ref_chosen": -51.87239456176758,
"logps/ref_rejected": -83.86331176757812,
"logps/rejected": -111.9892807006836,
"loss": 1.044,
"margin_dpo/margin_mean": 10.24338150024414,
"margin_dpo/margin_std": 16.19095230102539,
"step": 583
},
{
"epoch": 0.8828420256991686,
"grad_norm": 33.298362731933594,
"learning_rate": 2.09728856419826e-08,
"logits/chosen": 0.48178231716156006,
"logits/rejected": 0.3758727014064789,
"logps/chosen": -63.06849670410156,
"logps/ref_chosen": -46.571388244628906,
"logps/ref_rejected": -80.67969512939453,
"logps/rejected": -107.80296325683594,
"loss": 1.053,
"margin_dpo/margin_mean": 10.626161575317383,
"margin_dpo/margin_std": 16.727203369140625,
"step": 584
},
{
"epoch": 0.8843537414965986,
"grad_norm": 31.498868942260742,
"learning_rate": 2.044597327993153e-08,
"logits/chosen": 0.33858704566955566,
"logits/rejected": 0.29580289125442505,
"logps/chosen": -76.06266784667969,
"logps/ref_chosen": -58.124534606933594,
"logps/ref_rejected": -79.00538635253906,
"logps/rejected": -106.02567291259766,
"loss": 1.0402,
"margin_dpo/margin_mean": 9.082149505615234,
"margin_dpo/margin_std": 14.097719192504883,
"step": 585
},
{
"epoch": 0.8858654572940288,
"grad_norm": 29.214014053344727,
"learning_rate": 1.9925482037469187e-08,
"logits/chosen": 0.4083937406539917,
"logits/rejected": 0.35212117433547974,
"logps/chosen": -72.71726989746094,
"logps/ref_chosen": -54.10163879394531,
"logps/ref_rejected": -63.72113037109375,
"logps/rejected": -94.09498596191406,
"loss": 0.8183,
"margin_dpo/margin_mean": 11.758225440979004,
"margin_dpo/margin_std": 13.04593276977539,
"step": 586
},
{
"epoch": 0.8873771730914588,
"grad_norm": 40.19536209106445,
"learning_rate": 1.9411426473854687e-08,
"logits/chosen": 0.39255088567733765,
"logits/rejected": 0.3729506731033325,
"logps/chosen": -80.76512145996094,
"logps/ref_chosen": -63.41719436645508,
"logps/ref_rejected": -63.47003936767578,
"logps/rejected": -91.57522583007812,
"loss": 1.0638,
"margin_dpo/margin_mean": 10.757262229919434,
"margin_dpo/margin_std": 17.08779525756836,
"step": 587
},
{
"epoch": 0.8888888888888888,
"grad_norm": 33.136600494384766,
"learning_rate": 1.890382096832699e-08,
"logits/chosen": 0.42146050930023193,
"logits/rejected": 0.3690452575683594,
"logps/chosen": -80.40387725830078,
"logps/ref_chosen": -62.20103454589844,
"logps/ref_rejected": -82.10250091552734,
"logps/rejected": -110.79915618896484,
"loss": 0.929,
"margin_dpo/margin_mean": 10.493810653686523,
"margin_dpo/margin_std": 14.043756484985352,
"step": 588
},
{
"epoch": 0.890400604686319,
"grad_norm": 30.288169860839844,
"learning_rate": 1.840267971970344e-08,
"logits/chosen": 0.3641907572746277,
"logits/rejected": 0.32878395915031433,
"logps/chosen": -72.49046325683594,
"logps/ref_chosen": -56.71361541748047,
"logps/ref_rejected": -76.7366943359375,
"logps/rejected": -106.24540710449219,
"loss": 0.7499,
"margin_dpo/margin_mean": 13.731870651245117,
"margin_dpo/margin_std": 14.987432479858398,
"step": 589
},
{
"epoch": 0.891912320483749,
"grad_norm": 31.437152862548828,
"learning_rate": 1.7908016745981856e-08,
"logits/chosen": 0.332854300737381,
"logits/rejected": 0.29905757308006287,
"logps/chosen": -85.63790893554688,
"logps/ref_chosen": -66.5138168334961,
"logps/ref_rejected": -85.70820617675781,
"logps/rejected": -118.0446548461914,
"loss": 0.8782,
"margin_dpo/margin_mean": 13.212362289428711,
"margin_dpo/margin_std": 16.211627960205078,
"step": 590
},
{
"epoch": 0.8934240362811792,
"grad_norm": 31.040788650512695,
"learning_rate": 1.7419845883949098e-08,
"logits/chosen": 0.43007323145866394,
"logits/rejected": 0.3634907603263855,
"logps/chosen": -76.70008850097656,
"logps/ref_chosen": -60.697181701660156,
"logps/ref_rejected": -86.12278747558594,
"logps/rejected": -114.46322631835938,
"loss": 0.9683,
"margin_dpo/margin_mean": 12.337522506713867,
"margin_dpo/margin_std": 16.39020347595215,
"step": 591
},
{
"epoch": 0.8949357520786092,
"grad_norm": 32.1717643737793,
"learning_rate": 1.6938180788793556e-08,
"logits/chosen": 0.4040185213088989,
"logits/rejected": 0.28919947147369385,
"logps/chosen": -68.08457946777344,
"logps/ref_chosen": -51.237327575683594,
"logps/ref_rejected": -81.60243225097656,
"logps/rejected": -109.53164672851562,
"loss": 0.905,
"margin_dpo/margin_mean": 11.08197021484375,
"margin_dpo/margin_std": 14.641545295715332,
"step": 592
},
{
"epoch": 0.8964474678760394,
"grad_norm": 34.61304473876953,
"learning_rate": 1.6463034933723336e-08,
"logits/chosen": 0.3542747497558594,
"logits/rejected": 0.2594214081764221,
"logps/chosen": -57.7242546081543,
"logps/ref_chosen": -42.08000183105469,
"logps/ref_rejected": -68.47499084472656,
"logps/rejected": -94.96978759765625,
"loss": 1.0714,
"margin_dpo/margin_mean": 10.850542068481445,
"margin_dpo/margin_std": 17.000240325927734,
"step": 593
},
{
"epoch": 0.8979591836734694,
"grad_norm": 29.87799835205078,
"learning_rate": 1.5994421609589385e-08,
"logits/chosen": 0.29593855142593384,
"logits/rejected": 0.2723734378814697,
"logps/chosen": -82.42706298828125,
"logps/ref_chosen": -63.65867614746094,
"logps/ref_rejected": -70.35597229003906,
"logps/rejected": -97.96664428710938,
"loss": 1.0163,
"margin_dpo/margin_mean": 8.84228515625,
"margin_dpo/margin_std": 13.406206130981445,
"step": 594
},
{
"epoch": 0.8994708994708994,
"grad_norm": 30.320661544799805,
"learning_rate": 1.553235392451377e-08,
"logits/chosen": 0.41393527388572693,
"logits/rejected": 0.31939074397087097,
"logps/chosen": -74.23045349121094,
"logps/ref_chosen": -56.21875762939453,
"logps/ref_rejected": -83.95773315429688,
"logps/rejected": -116.33284759521484,
"loss": 0.8554,
"margin_dpo/margin_mean": 14.363415718078613,
"margin_dpo/margin_std": 17.355724334716797,
"step": 595
},
{
"epoch": 0.9009826152683296,
"grad_norm": 36.3000373840332,
"learning_rate": 1.507684480352292e-08,
"logits/chosen": 0.2564888596534729,
"logits/rejected": 0.2615135610103607,
"logps/chosen": -87.64381408691406,
"logps/ref_chosen": -68.48088073730469,
"logps/ref_rejected": -61.732967376708984,
"logps/rejected": -88.15812683105469,
"loss": 1.1944,
"margin_dpo/margin_mean": 7.262219429016113,
"margin_dpo/margin_std": 15.237005233764648,
"step": 596
},
{
"epoch": 0.9024943310657596,
"grad_norm": 26.79124641418457,
"learning_rate": 1.4627906988186111e-08,
"logits/chosen": 0.32503068447113037,
"logits/rejected": 0.2960435152053833,
"logps/chosen": -64.060791015625,
"logps/ref_chosen": -48.85750961303711,
"logps/ref_rejected": -55.068084716796875,
"logps/rejected": -80.88280487060547,
"loss": 0.9442,
"margin_dpo/margin_mean": 10.611440658569336,
"margin_dpo/margin_std": 14.508207321166992,
"step": 597
},
{
"epoch": 0.9040060468631897,
"grad_norm": 41.459163665771484,
"learning_rate": 1.4185553036259095e-08,
"logits/chosen": 0.3746280074119568,
"logits/rejected": 0.29736125469207764,
"logps/chosen": -78.41563415527344,
"logps/ref_chosen": -58.88715362548828,
"logps/ref_rejected": -81.43145751953125,
"logps/rejected": -108.66373443603516,
"loss": 1.1658,
"margin_dpo/margin_mean": 7.703801155090332,
"margin_dpo/margin_std": 14.998854637145996,
"step": 598
},
{
"epoch": 0.9055177626606198,
"grad_norm": 34.81147384643555,
"learning_rate": 1.3749795321332885e-08,
"logits/chosen": 0.453105092048645,
"logits/rejected": 0.4056002199649811,
"logps/chosen": -78.00206756591797,
"logps/ref_chosen": -57.60719680786133,
"logps/ref_rejected": -71.80469512939453,
"logps/rejected": -101.34292602539062,
"loss": 1.0565,
"margin_dpo/margin_mean": 9.143360137939453,
"margin_dpo/margin_std": 14.960365295410156,
"step": 599
},
{
"epoch": 0.9070294784580499,
"grad_norm": 31.092208862304688,
"learning_rate": 1.3320646032487393e-08,
"logits/chosen": 0.4238000214099884,
"logits/rejected": 0.3725815415382385,
"logps/chosen": -77.0726547241211,
"logps/ref_chosen": -58.44231414794922,
"logps/ref_rejected": -83.64639282226562,
"logps/rejected": -111.70862579345703,
"loss": 1.0459,
"margin_dpo/margin_mean": 9.431896209716797,
"margin_dpo/margin_std": 14.470096588134766,
"step": 600
},
{
"epoch": 0.9070294784580499,
"eval_logits/chosen": 0.37978631258010864,
"eval_logits/rejected": 0.32850033044815063,
"eval_logps/chosen": -92.03856658935547,
"eval_logps/ref_chosen": -74.85946655273438,
"eval_logps/ref_rejected": -79.54898834228516,
"eval_logps/rejected": -106.09295654296875,
"eval_loss": 0.5259022116661072,
"eval_margin_dpo/margin_mean": 9.36485481262207,
"eval_margin_dpo/margin_std": 14.809694290161133,
"eval_runtime": 38.6664,
"eval_samples_per_second": 59.561,
"eval_steps_per_second": 1.862,
"step": 600
},
{
"epoch": 0.90854119425548,
"grad_norm": 28.272085189819336,
"learning_rate": 1.2898117173950868e-08,
"logits/chosen": 0.3726602792739868,
"logits/rejected": 0.29538896679878235,
"logps/chosen": -70.87857055664062,
"logps/ref_chosen": -55.59432601928711,
"logps/ref_rejected": -83.68630981445312,
"logps/rejected": -111.41099548339844,
"loss": 0.8946,
"margin_dpo/margin_mean": 12.440434455871582,
"margin_dpo/margin_std": 15.828836441040039,
"step": 601
},
{
"epoch": 0.91005291005291,
"grad_norm": 29.497037887573242,
"learning_rate": 1.2482220564763667e-08,
"logits/chosen": 0.36928558349609375,
"logits/rejected": 0.3300653100013733,
"logps/chosen": -70.74989318847656,
"logps/ref_chosen": -56.349185943603516,
"logps/ref_rejected": -71.9959716796875,
"logps/rejected": -97.75289916992188,
"loss": 0.857,
"margin_dpo/margin_mean": 11.356219291687012,
"margin_dpo/margin_std": 13.522655487060547,
"step": 602
},
{
"epoch": 0.9115646258503401,
"grad_norm": 27.89227294921875,
"learning_rate": 1.2072967838448051e-08,
"logits/chosen": 0.318248450756073,
"logits/rejected": 0.25470617413520813,
"logps/chosen": -68.64102172851562,
"logps/ref_chosen": -53.168392181396484,
"logps/ref_rejected": -73.8604736328125,
"logps/rejected": -100.43838500976562,
"loss": 0.8921,
"margin_dpo/margin_mean": 11.10527515411377,
"margin_dpo/margin_std": 14.360272407531738,
"step": 603
},
{
"epoch": 0.9130763416477702,
"grad_norm": 34.568702697753906,
"learning_rate": 1.1670370442682459e-08,
"logits/chosen": 0.3337470591068268,
"logits/rejected": 0.32948338985443115,
"logps/chosen": -88.53524780273438,
"logps/ref_chosen": -72.64942169189453,
"logps/ref_rejected": -69.87926483154297,
"logps/rejected": -95.9818115234375,
"loss": 1.0618,
"margin_dpo/margin_mean": 10.216726303100586,
"margin_dpo/margin_std": 16.702648162841797,
"step": 604
},
{
"epoch": 0.9145880574452003,
"grad_norm": 32.900787353515625,
"learning_rate": 1.1274439638981532e-08,
"logits/chosen": 0.43235743045806885,
"logits/rejected": 0.3695913553237915,
"logps/chosen": -80.73637390136719,
"logps/ref_chosen": -61.61284637451172,
"logps/ref_rejected": -79.34398651123047,
"logps/rejected": -108.1134033203125,
"loss": 1.0151,
"margin_dpo/margin_mean": 9.645885467529297,
"margin_dpo/margin_std": 14.3195219039917,
"step": 605
},
{
"epoch": 0.9160997732426304,
"grad_norm": 30.54977035522461,
"learning_rate": 1.0885186502381016e-08,
"logits/chosen": 0.36005428433418274,
"logits/rejected": 0.2921282649040222,
"logps/chosen": -71.12816619873047,
"logps/ref_chosen": -54.464237213134766,
"logps/ref_rejected": -79.6270751953125,
"logps/rejected": -106.30125427246094,
"loss": 0.9359,
"margin_dpo/margin_mean": 10.010250091552734,
"margin_dpo/margin_std": 13.337453842163086,
"step": 606
},
{
"epoch": 0.9176114890400605,
"grad_norm": 32.681453704833984,
"learning_rate": 1.0502621921127774e-08,
"logits/chosen": 0.33145594596862793,
"logits/rejected": 0.29535144567489624,
"logps/chosen": -80.78804016113281,
"logps/ref_chosen": -62.86086654663086,
"logps/ref_rejected": -72.55020141601562,
"logps/rejected": -102.04086303710938,
"loss": 0.9373,
"margin_dpo/margin_mean": 11.56348991394043,
"margin_dpo/margin_std": 15.309553146362305,
"step": 607
},
{
"epoch": 0.9191232048374905,
"grad_norm": 34.54768753051758,
"learning_rate": 1.0126756596375685e-08,
"logits/chosen": 0.3337676525115967,
"logits/rejected": 0.26411697268486023,
"logps/chosen": -82.00431823730469,
"logps/ref_chosen": -63.18071746826172,
"logps/ref_rejected": -99.15888977050781,
"logps/rejected": -126.51753234863281,
"loss": 1.025,
"margin_dpo/margin_mean": 8.535051345825195,
"margin_dpo/margin_std": 13.468018531799316,
"step": 608
},
{
"epoch": 0.9206349206349206,
"grad_norm": 25.379526138305664,
"learning_rate": 9.757601041885694e-09,
"logits/chosen": 0.4193066656589508,
"logits/rejected": 0.3789142966270447,
"logps/chosen": -64.20651245117188,
"logps/ref_chosen": -48.62322235107422,
"logps/ref_rejected": -68.28271484375,
"logps/rejected": -96.82388305664062,
"loss": 0.7597,
"margin_dpo/margin_mean": 12.957870483398438,
"margin_dpo/margin_std": 13.232034683227539,
"step": 609
},
{
"epoch": 0.9221466364323507,
"grad_norm": 39.13202667236328,
"learning_rate": 9.395165583732379e-09,
"logits/chosen": 0.3527122139930725,
"logits/rejected": 0.3498975336551666,
"logps/chosen": -90.61177062988281,
"logps/ref_chosen": -72.66513061523438,
"logps/ref_rejected": -87.15311431884766,
"logps/rejected": -116.95249938964844,
"loss": 0.9629,
"margin_dpo/margin_mean": 11.85274600982666,
"margin_dpo/margin_std": 17.011281967163086,
"step": 610
},
{
"epoch": 0.9236583522297808,
"grad_norm": 36.39158630371094,
"learning_rate": 9.03946036001449e-09,
"logits/chosen": 0.429378867149353,
"logits/rejected": 0.3778746426105499,
"logps/chosen": -66.14514923095703,
"logps/ref_chosen": -48.30857849121094,
"logps/ref_rejected": -70.6141128540039,
"logps/rejected": -96.36630249023438,
"loss": 1.0813,
"margin_dpo/margin_mean": 7.9156084060668945,
"margin_dpo/margin_std": 13.126340866088867,
"step": 611
},
{
"epoch": 0.9251700680272109,
"grad_norm": 31.740697860717773,
"learning_rate": 8.690495320571839e-09,
"logits/chosen": 0.2883661389350891,
"logits/rejected": 0.218246728181839,
"logps/chosen": -79.38035583496094,
"logps/ref_chosen": -61.23155975341797,
"logps/ref_rejected": -94.37979888916016,
"logps/rejected": -124.63994598388672,
"loss": 0.9565,
"margin_dpo/margin_mean": 12.111353874206543,
"margin_dpo/margin_std": 17.107288360595703,
"step": 612
},
{
"epoch": 0.926681783824641,
"grad_norm": 32.549964904785156,
"learning_rate": 8.348280226706722e-09,
"logits/chosen": 0.3040260076522827,
"logits/rejected": 0.2911207675933838,
"logps/chosen": -70.19365692138672,
"logps/ref_chosen": -53.98310852050781,
"logps/ref_rejected": -58.32208251953125,
"logps/rejected": -88.9547348022461,
"loss": 0.8032,
"margin_dpo/margin_mean": 14.422100067138672,
"margin_dpo/margin_std": 16.371356964111328,
"step": 613
},
{
"epoch": 0.9281934996220711,
"grad_norm": 30.036380767822266,
"learning_rate": 8.012824650910937e-09,
"logits/chosen": 0.39648669958114624,
"logits/rejected": 0.3797394633293152,
"logps/chosen": -77.901123046875,
"logps/ref_chosen": -60.24303436279297,
"logps/ref_rejected": -72.26258850097656,
"logps/rejected": -100.70169067382812,
"loss": 0.8261,
"margin_dpo/margin_mean": 10.781007766723633,
"margin_dpo/margin_std": 11.667643547058105,
"step": 614
},
{
"epoch": 0.9297052154195011,
"grad_norm": 36.60609436035156,
"learning_rate": 7.684137976598088e-09,
"logits/chosen": 0.3580027222633362,
"logits/rejected": 0.30660200119018555,
"logps/chosen": -90.10015106201172,
"logps/ref_chosen": -72.09467315673828,
"logps/ref_rejected": -104.02980041503906,
"logps/rejected": -134.2967987060547,
"loss": 0.937,
"margin_dpo/margin_mean": 12.261512756347656,
"margin_dpo/margin_std": 16.436660766601562,
"step": 615
},
{
"epoch": 0.9312169312169312,
"grad_norm": 31.520265579223633,
"learning_rate": 7.36222939784098e-09,
"logits/chosen": 0.38119786977767944,
"logits/rejected": 0.2980498969554901,
"logps/chosen": -77.32060241699219,
"logps/ref_chosen": -58.53071975708008,
"logps/ref_rejected": -75.48025512695312,
"logps/rejected": -102.96885681152344,
"loss": 1.0323,
"margin_dpo/margin_mean": 8.698728561401367,
"margin_dpo/margin_std": 14.142110824584961,
"step": 616
},
{
"epoch": 0.9327286470143613,
"grad_norm": 31.635847091674805,
"learning_rate": 7.047107919114586e-09,
"logits/chosen": 0.3622846007347107,
"logits/rejected": 0.3139178454875946,
"logps/chosen": -77.13575744628906,
"logps/ref_chosen": -57.608673095703125,
"logps/ref_rejected": -81.22109985351562,
"logps/rejected": -111.34927368164062,
"loss": 0.9236,
"margin_dpo/margin_mean": 10.601092338562012,
"margin_dpo/margin_std": 13.501839637756348,
"step": 617
},
{
"epoch": 0.9342403628117913,
"grad_norm": 32.08859634399414,
"learning_rate": 6.738782355044048e-09,
"logits/chosen": 0.33811718225479126,
"logits/rejected": 0.23761284351348877,
"logps/chosen": -72.75361633300781,
"logps/ref_chosen": -56.69594192504883,
"logps/ref_rejected": -85.92362976074219,
"logps/rejected": -112.8507080078125,
"loss": 0.891,
"margin_dpo/margin_mean": 10.869397163391113,
"margin_dpo/margin_std": 13.591676712036133,
"step": 618
},
{
"epoch": 0.9357520786092215,
"grad_norm": 30.435407638549805,
"learning_rate": 6.437261330158206e-09,
"logits/chosen": 0.4325089454650879,
"logits/rejected": 0.3603595495223999,
"logps/chosen": -70.07926940917969,
"logps/ref_chosen": -54.05841827392578,
"logps/ref_rejected": -83.55493927001953,
"logps/rejected": -110.23355865478516,
"loss": 0.9168,
"margin_dpo/margin_mean": 10.657764434814453,
"margin_dpo/margin_std": 14.015510559082031,
"step": 619
},
{
"epoch": 0.9372637944066515,
"grad_norm": 35.472373962402344,
"learning_rate": 6.142553278648238e-09,
"logits/chosen": 0.36848458647727966,
"logits/rejected": 0.3582582175731659,
"logps/chosen": -79.79203796386719,
"logps/ref_chosen": -63.36971664428711,
"logps/ref_rejected": -65.68268585205078,
"logps/rejected": -91.74382019042969,
"loss": 0.9899,
"margin_dpo/margin_mean": 9.638816833496094,
"margin_dpo/margin_std": 13.876047134399414,
"step": 620
},
{
"epoch": 0.9387755102040817,
"grad_norm": 35.464088439941406,
"learning_rate": 5.854666444131934e-09,
"logits/chosen": 0.4091821312904358,
"logits/rejected": 0.30643418431282043,
"logps/chosen": -69.613037109375,
"logps/ref_chosen": -52.321224212646484,
"logps/ref_rejected": -88.09001159667969,
"logps/rejected": -115.10537719726562,
"loss": 1.0238,
"margin_dpo/margin_mean": 9.723562240600586,
"margin_dpo/margin_std": 14.74032974243164,
"step": 621
},
{
"epoch": 0.9402872260015117,
"grad_norm": 28.998062133789062,
"learning_rate": 5.573608879422875e-09,
"logits/chosen": 0.3346262276172638,
"logits/rejected": 0.29250970482826233,
"logps/chosen": -77.71116638183594,
"logps/ref_chosen": -59.86545944213867,
"logps/ref_rejected": -81.86668395996094,
"logps/rejected": -109.695068359375,
"loss": 0.8933,
"margin_dpo/margin_mean": 9.982682228088379,
"margin_dpo/margin_std": 12.632926940917969,
"step": 622
},
{
"epoch": 0.9417989417989417,
"grad_norm": 27.194913864135742,
"learning_rate": 5.299388446305342e-09,
"logits/chosen": 0.338644802570343,
"logits/rejected": 0.27187514305114746,
"logps/chosen": -87.81857299804688,
"logps/ref_chosen": -67.36846160888672,
"logps/ref_rejected": -82.02734375,
"logps/rejected": -113.73113250732422,
"loss": 0.8523,
"margin_dpo/margin_mean": 11.253677368164062,
"margin_dpo/margin_std": 13.05603313446045,
"step": 623
},
{
"epoch": 0.9433106575963719,
"grad_norm": 29.42872428894043,
"learning_rate": 5.03201281531429e-09,
"logits/chosen": 0.35232800245285034,
"logits/rejected": 0.2513423562049866,
"logps/chosen": -67.53258514404297,
"logps/ref_chosen": -51.02655029296875,
"logps/ref_rejected": -76.49203491210938,
"logps/rejected": -103.98919677734375,
"loss": 0.9442,
"margin_dpo/margin_mean": 10.991124153137207,
"margin_dpo/margin_std": 14.784875869750977,
"step": 624
},
{
"epoch": 0.9448223733938019,
"grad_norm": 32.62732696533203,
"learning_rate": 4.7714894655209174e-09,
"logits/chosen": 0.4503011405467987,
"logits/rejected": 0.3634708523750305,
"logps/chosen": -72.8299560546875,
"logps/ref_chosen": -54.207618713378906,
"logps/ref_rejected": -84.93669891357422,
"logps/rejected": -112.27094268798828,
"loss": 1.0682,
"margin_dpo/margin_mean": 8.711897850036621,
"margin_dpo/margin_std": 14.378011703491211,
"step": 625
},
{
"epoch": 0.9463340891912321,
"grad_norm": 30.09181022644043,
"learning_rate": 4.517825684323323e-09,
"logits/chosen": 0.46222013235092163,
"logits/rejected": 0.3331487774848938,
"logps/chosen": -61.51173400878906,
"logps/ref_chosen": -45.06201934814453,
"logps/ref_rejected": -89.66368103027344,
"logps/rejected": -118.61964416503906,
"loss": 0.9869,
"margin_dpo/margin_mean": 12.506250381469727,
"margin_dpo/margin_std": 17.837303161621094,
"step": 626
},
{
"epoch": 0.9478458049886621,
"grad_norm": 31.055070877075195,
"learning_rate": 4.271028567242818e-09,
"logits/chosen": 0.33586978912353516,
"logits/rejected": 0.21951913833618164,
"logps/chosen": -76.3229751586914,
"logps/ref_chosen": -58.791053771972656,
"logps/ref_rejected": -94.90802001953125,
"logps/rejected": -126.97459411621094,
"loss": 0.7957,
"margin_dpo/margin_mean": 14.53464412689209,
"margin_dpo/margin_std": 16.625656127929688,
"step": 627
},
{
"epoch": 0.9493575207860923,
"grad_norm": 32.846656799316406,
"learning_rate": 4.0311050177251895e-09,
"logits/chosen": 0.3936043083667755,
"logits/rejected": 0.353672593832016,
"logps/chosen": -68.23133850097656,
"logps/ref_chosen": -52.8035774230957,
"logps/ref_rejected": -76.49468994140625,
"logps/rejected": -105.67950439453125,
"loss": 0.9118,
"margin_dpo/margin_mean": 13.75704288482666,
"margin_dpo/margin_std": 16.944318771362305,
"step": 628
},
{
"epoch": 0.9508692365835223,
"grad_norm": 28.255290985107422,
"learning_rate": 3.798061746947995e-09,
"logits/chosen": 0.3765157461166382,
"logits/rejected": 0.36460188031196594,
"logps/chosen": -87.0561294555664,
"logps/ref_chosen": -70.71749877929688,
"logps/ref_rejected": -78.9627456665039,
"logps/rejected": -106.53677368164062,
"loss": 0.7943,
"margin_dpo/margin_mean": 11.235391616821289,
"margin_dpo/margin_std": 11.854427337646484,
"step": 629
},
{
"epoch": 0.9523809523809523,
"grad_norm": 27.017562866210938,
"learning_rate": 3.5719052736323806e-09,
"logits/chosen": 0.3252914547920227,
"logits/rejected": 0.2737062871456146,
"logps/chosen": -73.12852478027344,
"logps/ref_chosen": -56.201412200927734,
"logps/ref_rejected": -74.69807434082031,
"logps/rejected": -103.19023132324219,
"loss": 0.863,
"margin_dpo/margin_mean": 11.56503963470459,
"margin_dpo/margin_std": 13.917753219604492,
"step": 630
},
{
"epoch": 0.9538926681783825,
"grad_norm": 33.7799072265625,
"learning_rate": 3.352641923861144e-09,
"logits/chosen": 0.46569541096687317,
"logits/rejected": 0.3548794686794281,
"logps/chosen": -74.5596923828125,
"logps/ref_chosen": -58.820594787597656,
"logps/ref_rejected": -96.51437377929688,
"logps/rejected": -125.98038482666016,
"loss": 0.8959,
"margin_dpo/margin_mean": 13.726908683776855,
"margin_dpo/margin_std": 17.125089645385742,
"step": 631
},
{
"epoch": 0.9554043839758125,
"grad_norm": 29.515872955322266,
"learning_rate": 3.140277830901428e-09,
"logits/chosen": 0.4190269708633423,
"logits/rejected": 0.38895538449287415,
"logps/chosen": -75.45999908447266,
"logps/ref_chosen": -58.786048889160156,
"logps/ref_rejected": -67.21923828125,
"logps/rejected": -97.81733703613281,
"loss": 0.7872,
"margin_dpo/margin_mean": 13.924144744873047,
"margin_dpo/margin_std": 14.660362243652344,
"step": 632
},
{
"epoch": 0.9569160997732427,
"grad_norm": 29.584177017211914,
"learning_rate": 2.9348189350335007e-09,
"logits/chosen": 0.3622671365737915,
"logits/rejected": 0.29672741889953613,
"logps/chosen": -67.86599731445312,
"logps/ref_chosen": -52.13019561767578,
"logps/ref_rejected": -67.23016357421875,
"logps/rejected": -93.01034545898438,
"loss": 1.0208,
"margin_dpo/margin_mean": 10.044373512268066,
"margin_dpo/margin_std": 15.52154541015625,
"step": 633
},
{
"epoch": 0.9584278155706727,
"grad_norm": 42.10969161987305,
"learning_rate": 2.736270983384276e-09,
"logits/chosen": 0.4223307967185974,
"logits/rejected": 0.4292535185813904,
"logps/chosen": -78.71293640136719,
"logps/ref_chosen": -60.97979736328125,
"logps/ref_rejected": -58.50825119018555,
"logps/rejected": -82.54983520507812,
"loss": 1.2272,
"margin_dpo/margin_mean": 6.308449745178223,
"margin_dpo/margin_std": 14.052802085876465,
"step": 634
},
{
"epoch": 0.9599395313681028,
"grad_norm": 39.1640739440918,
"learning_rate": 2.5446395297668287e-09,
"logits/chosen": 0.28009316325187683,
"logits/rejected": 0.21607697010040283,
"logps/chosen": -87.28462219238281,
"logps/ref_chosen": -65.9730224609375,
"logps/ref_rejected": -85.61316680908203,
"logps/rejected": -115.56608581542969,
"loss": 1.1864,
"margin_dpo/margin_mean": 8.641317367553711,
"margin_dpo/margin_std": 16.910058975219727,
"step": 635
},
{
"epoch": 0.9614512471655329,
"grad_norm": 28.020477294921875,
"learning_rate": 2.359929934524829e-09,
"logits/chosen": 0.3389211595058441,
"logits/rejected": 0.24626314640045166,
"logps/chosen": -65.96224975585938,
"logps/ref_chosen": -49.140167236328125,
"logps/ref_rejected": -81.26970672607422,
"logps/rejected": -110.5828857421875,
"loss": 0.7771,
"margin_dpo/margin_mean": 12.491098403930664,
"margin_dpo/margin_std": 13.732412338256836,
"step": 636
},
{
"epoch": 0.9629629629629629,
"grad_norm": 37.647308349609375,
"learning_rate": 2.1821473643827137e-09,
"logits/chosen": 0.32874488830566406,
"logits/rejected": 0.25553327798843384,
"logps/chosen": -95.09454345703125,
"logps/ref_chosen": -73.69658660888672,
"logps/ref_rejected": -83.01487731933594,
"logps/rejected": -113.97785949707031,
"loss": 1.0925,
"margin_dpo/margin_mean": 9.565020561218262,
"margin_dpo/margin_std": 17.08456802368164,
"step": 637
},
{
"epoch": 0.9644746787603931,
"grad_norm": 31.366626739501953,
"learning_rate": 2.0112967923011646e-09,
"logits/chosen": 0.3579593896865845,
"logits/rejected": 0.3046620488166809,
"logps/chosen": -82.60899353027344,
"logps/ref_chosen": -62.78158187866211,
"logps/ref_rejected": -85.40478515625,
"logps/rejected": -118.306396484375,
"loss": 0.8519,
"margin_dpo/margin_mean": 13.074191093444824,
"margin_dpo/margin_std": 15.700433731079102,
"step": 638
},
{
"epoch": 0.9659863945578231,
"grad_norm": 31.602121353149414,
"learning_rate": 1.847382997337943e-09,
"logits/chosen": 0.34738531708717346,
"logits/rejected": 0.23237836360931396,
"logps/chosen": -71.09160614013672,
"logps/ref_chosen": -53.76658248901367,
"logps/ref_rejected": -72.30009460449219,
"logps/rejected": -101.43208312988281,
"loss": 0.8877,
"margin_dpo/margin_mean": 11.806966781616211,
"margin_dpo/margin_std": 15.63563346862793,
"step": 639
},
{
"epoch": 0.9674981103552532,
"grad_norm": 33.430606842041016,
"learning_rate": 1.690410564514244e-09,
"logits/chosen": 0.4351768493652344,
"logits/rejected": 0.3735978901386261,
"logps/chosen": -69.39120483398438,
"logps/ref_chosen": -51.41777801513672,
"logps/ref_rejected": -77.27879333496094,
"logps/rejected": -104.54359436035156,
"loss": 1.0784,
"margin_dpo/margin_mean": 9.291372299194336,
"margin_dpo/margin_std": 15.052839279174805,
"step": 640
},
{
"epoch": 0.9690098261526833,
"grad_norm": 39.402584075927734,
"learning_rate": 1.5403838846864692e-09,
"logits/chosen": 0.3504864573478699,
"logits/rejected": 0.31922876834869385,
"logps/chosen": -89.49940490722656,
"logps/ref_chosen": -71.0546646118164,
"logps/ref_rejected": -82.2440185546875,
"logps/rejected": -110.66531372070312,
"loss": 0.91,
"margin_dpo/margin_mean": 9.976564407348633,
"margin_dpo/margin_std": 13.33207893371582,
"step": 641
},
{
"epoch": 0.9705215419501134,
"grad_norm": 39.39484786987305,
"learning_rate": 1.3973071544233218e-09,
"logits/chosen": 0.31102094054222107,
"logits/rejected": 0.31289154291152954,
"logps/chosen": -88.44036865234375,
"logps/ref_chosen": -68.92927551269531,
"logps/ref_rejected": -70.85682678222656,
"logps/rejected": -99.15159606933594,
"loss": 1.1142,
"margin_dpo/margin_mean": 8.783670425415039,
"margin_dpo/margin_std": 15.272052764892578,
"step": 642
},
{
"epoch": 0.9720332577475435,
"grad_norm": 99.86732482910156,
"learning_rate": 1.261184375888541e-09,
"logits/chosen": 0.29999518394470215,
"logits/rejected": 0.20917370915412903,
"logps/chosen": -84.09303283691406,
"logps/ref_chosen": -65.30903625488281,
"logps/ref_rejected": -83.61613464355469,
"logps/rejected": -112.48616790771484,
"loss": 0.9726,
"margin_dpo/margin_mean": 10.086037635803223,
"margin_dpo/margin_std": 14.042640686035156,
"step": 643
},
{
"epoch": 0.9735449735449735,
"grad_norm": 35.0439338684082,
"learning_rate": 1.1320193567288527e-09,
"logits/chosen": 0.4415176510810852,
"logits/rejected": 0.40569156408309937,
"logps/chosen": -68.61014556884766,
"logps/ref_chosen": -51.002601623535156,
"logps/ref_rejected": -64.46372985839844,
"logps/rejected": -91.19786834716797,
"loss": 1.1389,
"margin_dpo/margin_mean": 9.126594543457031,
"margin_dpo/margin_std": 16.37747573852539,
"step": 644
},
{
"epoch": 0.9750566893424036,
"grad_norm": 32.50900650024414,
"learning_rate": 1.0098157099674987e-09,
"logits/chosen": 0.2861027121543884,
"logits/rejected": 0.26147884130477905,
"logps/chosen": -79.64277648925781,
"logps/ref_chosen": -60.963409423828125,
"logps/ref_rejected": -69.73353576660156,
"logps/rejected": -98.8067626953125,
"loss": 0.9447,
"margin_dpo/margin_mean": 10.39387035369873,
"margin_dpo/margin_std": 14.448659896850586,
"step": 645
},
{
"epoch": 0.9765684051398337,
"grad_norm": 28.102170944213867,
"learning_rate": 8.945768539031783e-10,
"logits/chosen": 0.4046555757522583,
"logits/rejected": 0.3490592837333679,
"logps/chosen": -81.68882751464844,
"logps/ref_chosen": -62.290069580078125,
"logps/ref_rejected": -85.54812622070312,
"logps/rejected": -116.33580017089844,
"loss": 0.9153,
"margin_dpo/margin_mean": 11.388922691345215,
"margin_dpo/margin_std": 14.88244342803955,
"step": 646
},
{
"epoch": 0.9780801209372638,
"grad_norm": 35.961692810058594,
"learning_rate": 7.863060120144316e-10,
"logits/chosen": 0.39894628524780273,
"logits/rejected": 0.30425435304641724,
"logps/chosen": -86.18168640136719,
"logps/ref_chosen": -67.515869140625,
"logps/ref_rejected": -101.50870513916016,
"logps/rejected": -133.66326904296875,
"loss": 0.8038,
"margin_dpo/margin_mean": 13.488750457763672,
"margin_dpo/margin_std": 15.609901428222656,
"step": 647
},
{
"epoch": 0.9795918367346939,
"grad_norm": 29.62771224975586,
"learning_rate": 6.850062128694045e-10,
"logits/chosen": 0.33446913957595825,
"logits/rejected": 0.2666047215461731,
"logps/chosen": -84.09801483154297,
"logps/ref_chosen": -64.59593963623047,
"logps/ref_rejected": -83.384033203125,
"logps/rejected": -114.47706604003906,
"loss": 0.9224,
"margin_dpo/margin_mean": 11.590965270996094,
"margin_dpo/margin_std": 15.026180267333984,
"step": 648
},
{
"epoch": 0.981103552532124,
"grad_norm": 36.72792434692383,
"learning_rate": 5.906802900412788e-10,
"logits/chosen": 0.38652855157852173,
"logits/rejected": 0.32458633184432983,
"logps/chosen": -67.03204345703125,
"logps/ref_chosen": -49.30964660644531,
"logps/ref_rejected": -73.73710632324219,
"logps/rejected": -102.45809936523438,
"loss": 1.0014,
"margin_dpo/margin_mean": 10.998601913452148,
"margin_dpo/margin_std": 15.62658405303955,
"step": 649
},
{
"epoch": 0.982615268329554,
"grad_norm": 39.920772552490234,
"learning_rate": 5.033308820289184e-10,
"logits/chosen": 0.4805130958557129,
"logits/rejected": 0.41330695152282715,
"logps/chosen": -72.3466567993164,
"logps/ref_chosen": -55.063262939453125,
"logps/ref_rejected": -77.39610290527344,
"logps/rejected": -105.56784057617188,
"loss": 1.0476,
"margin_dpo/margin_mean": 10.888345718383789,
"margin_dpo/margin_std": 16.72048568725586,
"step": 650
},
{
"epoch": 0.9841269841269841,
"grad_norm": 33.2890625,
"learning_rate": 4.2296043218295606e-10,
"logits/chosen": 0.45900759100914,
"logits/rejected": 0.38366663455963135,
"logps/chosen": -70.79644775390625,
"logps/ref_chosen": -54.065162658691406,
"logps/ref_rejected": -77.79080200195312,
"logps/rejected": -104.88078308105469,
"loss": 0.9555,
"margin_dpo/margin_mean": 10.358694076538086,
"margin_dpo/margin_std": 14.075986862182617,
"step": 651
},
{
"epoch": 0.9856386999244142,
"grad_norm": 34.43223571777344,
"learning_rate": 3.4957118863768176e-10,
"logits/chosen": 0.3836783170700073,
"logits/rejected": 0.3254081606864929,
"logps/chosen": -83.17698669433594,
"logps/ref_chosen": -63.64030456542969,
"logps/ref_rejected": -78.86882019042969,
"logps/rejected": -108.52143096923828,
"loss": 0.9731,
"margin_dpo/margin_mean": 10.115922927856445,
"margin_dpo/margin_std": 14.506038665771484,
"step": 652
},
{
"epoch": 0.9871504157218443,
"grad_norm": 34.71416473388672,
"learning_rate": 2.831652042480093e-10,
"logits/chosen": 0.35164594650268555,
"logits/rejected": 0.3016093373298645,
"logps/chosen": -79.4456558227539,
"logps/ref_chosen": -61.668373107910156,
"logps/ref_rejected": -73.83012390136719,
"logps/rejected": -102.2838363647461,
"loss": 1.0075,
"margin_dpo/margin_mean": 10.676427841186523,
"margin_dpo/margin_std": 16.090900421142578,
"step": 653
},
{
"epoch": 0.9886621315192744,
"grad_norm": 34.91166305541992,
"learning_rate": 2.2374433653205016e-10,
"logits/chosen": 0.35892999172210693,
"logits/rejected": 0.2633056640625,
"logps/chosen": -75.29817962646484,
"logps/ref_chosen": -57.568267822265625,
"logps/ref_rejected": -87.74789428710938,
"logps/rejected": -116.19950103759766,
"loss": 0.9221,
"margin_dpo/margin_mean": 10.721696853637695,
"margin_dpo/margin_std": 14.811450958251953,
"step": 654
},
{
"epoch": 0.9901738473167044,
"grad_norm": 27.576698303222656,
"learning_rate": 1.7131024761923852e-10,
"logits/chosen": 0.3685477375984192,
"logits/rejected": 0.270671010017395,
"logps/chosen": -67.61747741699219,
"logps/ref_chosen": -52.14714813232422,
"logps/ref_rejected": -80.85014343261719,
"logps/rejected": -107.67730712890625,
"loss": 0.8058,
"margin_dpo/margin_mean": 11.356832504272461,
"margin_dpo/margin_std": 11.787322998046875,
"step": 655
},
{
"epoch": 0.9916855631141346,
"grad_norm": 28.61968994140625,
"learning_rate": 1.2586440420372934e-10,
"logits/chosen": 0.3276837468147278,
"logits/rejected": 0.27165547013282776,
"logps/chosen": -91.76471710205078,
"logps/ref_chosen": -73.25672912597656,
"logps/ref_rejected": -85.35127258300781,
"logps/rejected": -116.37913513183594,
"loss": 0.8199,
"margin_dpo/margin_mean": 12.519876480102539,
"margin_dpo/margin_std": 14.494292259216309,
"step": 656
},
{
"epoch": 0.9931972789115646,
"grad_norm": 30.431232452392578,
"learning_rate": 8.740807750345913e-11,
"logits/chosen": 0.49412286281585693,
"logits/rejected": 0.402765154838562,
"logps/chosen": -67.19498443603516,
"logps/ref_chosen": -49.72339630126953,
"logps/ref_rejected": -75.15686798095703,
"logps/rejected": -106.35232543945312,
"loss": 0.8435,
"margin_dpo/margin_mean": 13.723871231079102,
"margin_dpo/margin_std": 16.22945213317871,
"step": 657
},
{
"epoch": 0.9947089947089947,
"grad_norm": 35.37551498413086,
"learning_rate": 5.594234322453539e-11,
"logits/chosen": 0.4039853811264038,
"logits/rejected": 0.36136534810066223,
"logps/chosen": -82.059814453125,
"logps/ref_chosen": -63.04634094238281,
"logps/ref_rejected": -83.44963073730469,
"logps/rejected": -112.26458740234375,
"loss": 1.1246,
"margin_dpo/margin_mean": 9.801492691040039,
"margin_dpo/margin_std": 16.81917381286621,
"step": 658
},
{
"epoch": 0.9962207105064248,
"grad_norm": 31.186094284057617,
"learning_rate": 3.146808153123293e-11,
"logits/chosen": 0.4496970474720001,
"logits/rejected": 0.38089311122894287,
"logps/chosen": -74.04220581054688,
"logps/ref_chosen": -55.0802001953125,
"logps/ref_rejected": -71.91049194335938,
"logps/rejected": -99.90020751953125,
"loss": 1.0572,
"margin_dpo/margin_mean": 9.027709007263184,
"margin_dpo/margin_std": 14.64944076538086,
"step": 659
},
{
"epoch": 0.9977324263038548,
"grad_norm": 30.34589385986328,
"learning_rate": 1.3985977021235829e-11,
"logits/chosen": 0.47874438762664795,
"logits/rejected": 0.40453845262527466,
"logps/chosen": -72.7104721069336,
"logps/ref_chosen": -54.52591323852539,
"logps/ref_rejected": -81.23603820800781,
"logps/rejected": -111.93832397460938,
"loss": 0.815,
"margin_dpo/margin_mean": 12.517721176147461,
"margin_dpo/margin_std": 14.22152042388916,
"step": 660
},
{
"epoch": 0.999244142101285,
"grad_norm": 32.01219177246094,
"learning_rate": 3.4965187065971735e-12,
"logits/chosen": 0.3397953510284424,
"logits/rejected": 0.25590038299560547,
"logps/chosen": -81.170166015625,
"logps/ref_chosen": -60.372642517089844,
"logps/ref_rejected": -77.42874908447266,
"logps/rejected": -108.52937316894531,
"loss": 1.0351,
"margin_dpo/margin_mean": 10.303092956542969,
"margin_dpo/margin_std": 15.487680435180664,
"step": 661
},
{
"epoch": 0.999244142101285,
"step": 661,
"total_flos": 0.0,
"train_loss": 1.049779979526185,
"train_runtime": 1908.5591,
"train_samples_per_second": 22.182,
"train_steps_per_second": 0.346
}
],
"logging_steps": 1,
"max_steps": 661,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}