Files
qwen3-8b-base-beta-dpo-hh-h…/trainer_state.json
ModelHub XC 527c912d5f 初始化项目,由ModelHub XC社区提供模型
Model: jackf857/qwen3-8b-base-beta-dpo-hh-harmless-4xh200-batch-64-20260424-025105
Source: Original Platform
2026-05-16 07:03:57 +08:00

9388 lines
324 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999244142101285,
"eval_steps": 100,
"global_step": 661,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"beta_dpo/beta_used": 0.10002562403678894,
"beta_dpo/beta_used_raw": 0.10002562403678894,
"beta_dpo/gap_mean": -0.002544061280786991,
"beta_dpo/gap_std": 0.05413506180047989,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.0015117157974300832,
"grad_norm": 19.21511459350586,
"learning_rate": 0.0,
"logits/chosen": 1.6779730319976807,
"logits/rejected": 1.8961677551269531,
"loss": 1.3862,
"step": 1
},
{
"beta_dpo/beta_used": 0.09614178538322449,
"beta_dpo/beta_used_raw": 0.09614178538322449,
"beta_dpo/gap_mean": 0.001475283526815474,
"beta_dpo/gap_std": 0.1301599144935608,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.0030234315948601664,
"grad_norm": 17.83782958984375,
"learning_rate": 7.462686567164179e-09,
"logits/chosen": 1.873326063156128,
"logits/rejected": 1.763237714767456,
"loss": 1.3922,
"step": 2
},
{
"beta_dpo/beta_used": 0.0969439223408699,
"beta_dpo/beta_used_raw": 0.0969439223408699,
"beta_dpo/gap_mean": -0.004292218014597893,
"beta_dpo/gap_std": 0.18407246470451355,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.0045351473922902496,
"grad_norm": 21.088054656982422,
"learning_rate": 1.4925373134328357e-08,
"logits/chosen": 1.84206223487854,
"logits/rejected": 1.5545785427093506,
"loss": 1.3928,
"step": 3
},
{
"beta_dpo/beta_used": 0.09930766373872757,
"beta_dpo/beta_used_raw": 0.09930766373872757,
"beta_dpo/gap_mean": -0.017835495993494987,
"beta_dpo/gap_std": 0.22892938554286957,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.006046863189720333,
"grad_norm": 21.443336486816406,
"learning_rate": 2.2388059701492534e-08,
"logits/chosen": 1.959693193435669,
"logits/rejected": 1.9233078956604004,
"loss": 1.3881,
"step": 4
},
{
"beta_dpo/beta_used": 0.09406433999538422,
"beta_dpo/beta_used_raw": 0.09406433999538422,
"beta_dpo/gap_mean": -0.018799975514411926,
"beta_dpo/gap_std": 0.27064457535743713,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.007558578987150416,
"grad_norm": 18.985071182250977,
"learning_rate": 2.9850746268656714e-08,
"logits/chosen": 1.7132606506347656,
"logits/rejected": 1.4830102920532227,
"loss": 1.3973,
"step": 5
},
{
"beta_dpo/beta_used": 0.10003212094306946,
"beta_dpo/beta_used_raw": 0.10003212094306946,
"beta_dpo/gap_mean": -0.01150619424879551,
"beta_dpo/gap_std": 0.3005719780921936,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.009070294784580499,
"grad_norm": 19.63991928100586,
"learning_rate": 3.731343283582089e-08,
"logits/chosen": 1.6464662551879883,
"logits/rejected": 1.3061785697937012,
"loss": 1.3873,
"step": 6
},
{
"beta_dpo/beta_used": 0.09404729306697845,
"beta_dpo/beta_used_raw": 0.09404729306697845,
"beta_dpo/gap_mean": -0.025072161108255386,
"beta_dpo/gap_std": 0.32929858565330505,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.010582010582010581,
"grad_norm": 17.747419357299805,
"learning_rate": 4.477611940298507e-08,
"logits/chosen": 1.2335506677627563,
"logits/rejected": 1.066222071647644,
"loss": 1.3977,
"step": 7
},
{
"beta_dpo/beta_used": 0.10024942457675934,
"beta_dpo/beta_used_raw": 0.10024942457675934,
"beta_dpo/gap_mean": -0.03881003335118294,
"beta_dpo/gap_std": 0.3406470715999603,
"beta_dpo/mask_keep_frac": 0.5,
"epoch": 0.012093726379440665,
"grad_norm": 21.047828674316406,
"learning_rate": 5.223880597014925e-08,
"logits/chosen": 1.7191338539123535,
"logits/rejected": 1.676999807357788,
"loss": 1.3906,
"step": 8
},
{
"beta_dpo/beta_used": 0.10512945801019669,
"beta_dpo/beta_used_raw": 0.10512945801019669,
"beta_dpo/gap_mean": -0.029969248920679092,
"beta_dpo/gap_std": 0.34616127610206604,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.013605442176870748,
"grad_norm": 20.141834259033203,
"learning_rate": 5.970149253731343e-08,
"logits/chosen": 1.5813239812850952,
"logits/rejected": 1.5274288654327393,
"loss": 1.3799,
"step": 9
},
{
"beta_dpo/beta_used": 0.10164432227611542,
"beta_dpo/beta_used_raw": 0.10164432227611542,
"beta_dpo/gap_mean": -0.024617386981844902,
"beta_dpo/gap_std": 0.36627668142318726,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.015117157974300832,
"grad_norm": 21.468704223632812,
"learning_rate": 6.71641791044776e-08,
"logits/chosen": 1.8985390663146973,
"logits/rejected": 1.7897529602050781,
"loss": 1.386,
"step": 10
},
{
"beta_dpo/beta_used": 0.10073283314704895,
"beta_dpo/beta_used_raw": 0.10073283314704895,
"beta_dpo/gap_mean": -0.01293960027396679,
"beta_dpo/gap_std": 0.3726397156715393,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.016628873771730914,
"grad_norm": 19.517457962036133,
"learning_rate": 7.462686567164178e-08,
"logits/chosen": 1.5561755895614624,
"logits/rejected": 1.4315879344940186,
"loss": 1.3857,
"step": 11
},
{
"beta_dpo/beta_used": 0.09879305958747864,
"beta_dpo/beta_used_raw": 0.09879305958747864,
"beta_dpo/gap_mean": -0.01451108418405056,
"beta_dpo/gap_std": 0.3724828362464905,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.018140589569160998,
"grad_norm": 19.350879669189453,
"learning_rate": 8.208955223880596e-08,
"logits/chosen": 1.5376639366149902,
"logits/rejected": 1.5898655652999878,
"loss": 1.3903,
"step": 12
},
{
"beta_dpo/beta_used": 0.09995156526565552,
"beta_dpo/beta_used_raw": 0.09995156526565552,
"beta_dpo/gap_mean": -0.00666454154998064,
"beta_dpo/gap_std": 0.37418586015701294,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.019652305366591082,
"grad_norm": 20.34635353088379,
"learning_rate": 8.955223880597014e-08,
"logits/chosen": 1.6992497444152832,
"logits/rejected": 1.421729564666748,
"loss": 1.3869,
"step": 13
},
{
"beta_dpo/beta_used": 0.10110987722873688,
"beta_dpo/beta_used_raw": 0.10110987722873688,
"beta_dpo/gap_mean": -0.0020657971035689116,
"beta_dpo/gap_std": 0.3755612373352051,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.021164021164021163,
"grad_norm": 23.23613166809082,
"learning_rate": 9.701492537313432e-08,
"logits/chosen": 2.0708484649658203,
"logits/rejected": 1.755119800567627,
"loss": 1.3851,
"step": 14
},
{
"beta_dpo/beta_used": 0.09904544055461884,
"beta_dpo/beta_used_raw": 0.09904544055461884,
"beta_dpo/gap_mean": 0.00015588663518428802,
"beta_dpo/gap_std": 0.36678993701934814,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.022675736961451247,
"grad_norm": 22.2674617767334,
"learning_rate": 1.044776119402985e-07,
"logits/chosen": 1.3940855264663696,
"logits/rejected": 1.119559407234192,
"loss": 1.3882,
"step": 15
},
{
"beta_dpo/beta_used": 0.10093901306390762,
"beta_dpo/beta_used_raw": 0.10093901306390762,
"beta_dpo/gap_mean": -0.0052482327446341515,
"beta_dpo/gap_std": 0.3735610246658325,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.02418745275888133,
"grad_norm": 18.811349868774414,
"learning_rate": 1.1194029850746268e-07,
"logits/chosen": 1.9675464630126953,
"logits/rejected": 1.8949251174926758,
"loss": 1.3853,
"step": 16
},
{
"beta_dpo/beta_used": 0.10186785459518433,
"beta_dpo/beta_used_raw": 0.10186785459518433,
"beta_dpo/gap_mean": -0.0005937099922448397,
"beta_dpo/gap_std": 0.3771466016769409,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.025699168556311415,
"grad_norm": 20.184123992919922,
"learning_rate": 1.1940298507462686e-07,
"logits/chosen": 1.7564290761947632,
"logits/rejected": 1.6898235082626343,
"loss": 1.3831,
"step": 17
},
{
"beta_dpo/beta_used": 0.10032984614372253,
"beta_dpo/beta_used_raw": 0.10032984614372253,
"beta_dpo/gap_mean": 0.011224126443266869,
"beta_dpo/gap_std": 0.3758787512779236,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.027210884353741496,
"grad_norm": 18.658119201660156,
"learning_rate": 1.2686567164179106e-07,
"logits/chosen": 1.5427706241607666,
"logits/rejected": 1.4021761417388916,
"loss": 1.3844,
"step": 18
},
{
"beta_dpo/beta_used": 0.0984867587685585,
"beta_dpo/beta_used_raw": 0.0984867587685585,
"beta_dpo/gap_mean": 0.0026761912740767,
"beta_dpo/gap_std": 0.39050090312957764,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.02872260015117158,
"grad_norm": 22.035978317260742,
"learning_rate": 1.343283582089552e-07,
"logits/chosen": 1.7621450424194336,
"logits/rejected": 1.6765937805175781,
"loss": 1.3894,
"step": 19
},
{
"beta_dpo/beta_used": 0.10234874486923218,
"beta_dpo/beta_used_raw": 0.10234874486923218,
"beta_dpo/gap_mean": 0.002019322942942381,
"beta_dpo/gap_std": 0.391927033662796,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.030234315948601664,
"grad_norm": 21.037214279174805,
"learning_rate": 1.4179104477611938e-07,
"logits/chosen": 1.822493076324463,
"logits/rejected": 1.6277220249176025,
"loss": 1.3828,
"step": 20
},
{
"beta_dpo/beta_used": 0.10313962399959564,
"beta_dpo/beta_used_raw": 0.10313962399959564,
"beta_dpo/gap_mean": 0.007663208059966564,
"beta_dpo/gap_std": 0.3802725672721863,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.031746031746031744,
"grad_norm": 19.205894470214844,
"learning_rate": 1.4925373134328355e-07,
"logits/chosen": 1.2832739353179932,
"logits/rejected": 1.4847989082336426,
"loss": 1.3804,
"step": 21
},
{
"beta_dpo/beta_used": 0.0988527238368988,
"beta_dpo/beta_used_raw": 0.0988527238368988,
"beta_dpo/gap_mean": 0.0034460527822375298,
"beta_dpo/gap_std": 0.3804360628128052,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.03325774754346183,
"grad_norm": 19.234243392944336,
"learning_rate": 1.5671641791044775e-07,
"logits/chosen": 1.3626708984375,
"logits/rejected": 1.13639235496521,
"loss": 1.3873,
"step": 22
},
{
"beta_dpo/beta_used": 0.09999721497297287,
"beta_dpo/beta_used_raw": 0.09999721497297287,
"beta_dpo/gap_mean": 0.003892315551638603,
"beta_dpo/gap_std": 0.3914201259613037,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.03476946334089191,
"grad_norm": 19.51723861694336,
"learning_rate": 1.6417910447761193e-07,
"logits/chosen": 2.0461347103118896,
"logits/rejected": 1.7789829969406128,
"loss": 1.3863,
"step": 23
},
{
"beta_dpo/beta_used": 0.09975261986255646,
"beta_dpo/beta_used_raw": 0.09975261986255646,
"beta_dpo/gap_mean": -0.00156848831102252,
"beta_dpo/gap_std": 0.3875770568847656,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.036281179138321996,
"grad_norm": 18.69975471496582,
"learning_rate": 1.716417910447761e-07,
"logits/chosen": 1.5550925731658936,
"logits/rejected": 1.3700810670852661,
"loss": 1.3871,
"step": 24
},
{
"beta_dpo/beta_used": 0.09970206022262573,
"beta_dpo/beta_used_raw": 0.09970206022262573,
"beta_dpo/gap_mean": -0.006716427858918905,
"beta_dpo/gap_std": 0.4007958173751831,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.03779289493575208,
"grad_norm": 20.19523811340332,
"learning_rate": 1.7910447761194027e-07,
"logits/chosen": 1.5238394737243652,
"logits/rejected": 1.441294550895691,
"loss": 1.3877,
"step": 25
},
{
"beta_dpo/beta_used": 0.10117530822753906,
"beta_dpo/beta_used_raw": 0.10117530822753906,
"beta_dpo/gap_mean": 0.0002460250398144126,
"beta_dpo/gap_std": 0.3976287841796875,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.039304610733182165,
"grad_norm": 20.19486427307129,
"learning_rate": 1.8656716417910447e-07,
"logits/chosen": 2.229462146759033,
"logits/rejected": 2.186990261077881,
"loss": 1.3845,
"step": 26
},
{
"beta_dpo/beta_used": 0.099585622549057,
"beta_dpo/beta_used_raw": 0.099585622549057,
"beta_dpo/gap_mean": 0.013156171888113022,
"beta_dpo/gap_std": 0.402152419090271,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.04081632653061224,
"grad_norm": 21.181793212890625,
"learning_rate": 1.9402985074626865e-07,
"logits/chosen": 1.9965670108795166,
"logits/rejected": 1.999671220779419,
"loss": 1.3855,
"step": 27
},
{
"beta_dpo/beta_used": 0.09757953137159348,
"beta_dpo/beta_used_raw": 0.09757953137159348,
"beta_dpo/gap_mean": 0.01814894564449787,
"beta_dpo/gap_std": 0.39564138650894165,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.042328042328042326,
"grad_norm": 19.1815128326416,
"learning_rate": 2.0149253731343282e-07,
"logits/chosen": 1.282888412475586,
"logits/rejected": 1.315780758857727,
"loss": 1.3892,
"step": 28
},
{
"beta_dpo/beta_used": 0.10066419839859009,
"beta_dpo/beta_used_raw": 0.10066419839859009,
"beta_dpo/gap_mean": 0.02637687511742115,
"beta_dpo/gap_std": 0.39498424530029297,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.04383975812547241,
"grad_norm": 23.41122055053711,
"learning_rate": 2.08955223880597e-07,
"logits/chosen": 2.0083210468292236,
"logits/rejected": 2.0637381076812744,
"loss": 1.3826,
"step": 29
},
{
"beta_dpo/beta_used": 0.10326778143644333,
"beta_dpo/beta_used_raw": 0.10326778143644333,
"beta_dpo/gap_mean": 0.028650924563407898,
"beta_dpo/gap_std": 0.3952373266220093,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.045351473922902494,
"grad_norm": 21.245594024658203,
"learning_rate": 2.1641791044776117e-07,
"logits/chosen": 2.035595178604126,
"logits/rejected": 1.9250398874282837,
"loss": 1.3775,
"step": 30
},
{
"beta_dpo/beta_used": 0.09900397062301636,
"beta_dpo/beta_used_raw": 0.09900397062301636,
"beta_dpo/gap_mean": 0.022034619003534317,
"beta_dpo/gap_std": 0.3975624442100525,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.04686318972033258,
"grad_norm": 22.802661895751953,
"learning_rate": 2.2388059701492537e-07,
"logits/chosen": 1.9883079528808594,
"logits/rejected": 1.705573320388794,
"loss": 1.3851,
"step": 31
},
{
"beta_dpo/beta_used": 0.10333971679210663,
"beta_dpo/beta_used_raw": 0.10333971679210663,
"beta_dpo/gap_mean": 0.03041520155966282,
"beta_dpo/gap_std": 0.3910978436470032,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.04837490551776266,
"grad_norm": 22.380115509033203,
"learning_rate": 2.3134328358208954e-07,
"logits/chosen": 1.5761935710906982,
"logits/rejected": 1.5760250091552734,
"loss": 1.3766,
"step": 32
},
{
"beta_dpo/beta_used": 0.10148920118808746,
"beta_dpo/beta_used_raw": 0.10148920118808746,
"beta_dpo/gap_mean": 0.034046024084091187,
"beta_dpo/gap_std": 0.38926005363464355,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.049886621315192746,
"grad_norm": 21.281326293945312,
"learning_rate": 2.388059701492537e-07,
"logits/chosen": 1.4101418256759644,
"logits/rejected": 1.3217897415161133,
"loss": 1.3804,
"step": 33
},
{
"beta_dpo/beta_used": 0.09844163060188293,
"beta_dpo/beta_used_raw": 0.09844163060188293,
"beta_dpo/gap_mean": 0.042101725935935974,
"beta_dpo/gap_std": 0.3880201280117035,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.05139833711262283,
"grad_norm": 19.162466049194336,
"learning_rate": 2.4626865671641786e-07,
"logits/chosen": 2.0277884006500244,
"logits/rejected": 1.8365530967712402,
"loss": 1.385,
"step": 34
},
{
"beta_dpo/beta_used": 0.10297001898288727,
"beta_dpo/beta_used_raw": 0.10297001898288727,
"beta_dpo/gap_mean": 0.04004104435443878,
"beta_dpo/gap_std": 0.3860953450202942,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.05291005291005291,
"grad_norm": 19.943506240844727,
"learning_rate": 2.537313432835821e-07,
"logits/chosen": 1.2702012062072754,
"logits/rejected": 1.3567094802856445,
"loss": 1.3769,
"step": 35
},
{
"beta_dpo/beta_used": 0.09879133850336075,
"beta_dpo/beta_used_raw": 0.09879133850336075,
"beta_dpo/gap_mean": 0.03860355541110039,
"beta_dpo/gap_std": 0.3801459074020386,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.05442176870748299,
"grad_norm": 19.540878295898438,
"learning_rate": 2.611940298507462e-07,
"logits/chosen": 2.013148546218872,
"logits/rejected": 1.7665867805480957,
"loss": 1.3854,
"step": 36
},
{
"beta_dpo/beta_used": 0.09817594289779663,
"beta_dpo/beta_used_raw": 0.09817594289779663,
"beta_dpo/gap_mean": 0.03356537967920303,
"beta_dpo/gap_std": 0.37876373529434204,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.055933484504913075,
"grad_norm": 26.17854118347168,
"learning_rate": 2.686567164179104e-07,
"logits/chosen": 1.748681902885437,
"logits/rejected": 1.5148720741271973,
"loss": 1.3859,
"step": 37
},
{
"beta_dpo/beta_used": 0.09723386913537979,
"beta_dpo/beta_used_raw": 0.09723386913537979,
"beta_dpo/gap_mean": 0.017998045310378075,
"beta_dpo/gap_std": 0.376539945602417,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.05744520030234316,
"grad_norm": 20.4818115234375,
"learning_rate": 2.761194029850746e-07,
"logits/chosen": 1.6136703491210938,
"logits/rejected": 1.5167253017425537,
"loss": 1.3896,
"step": 38
},
{
"beta_dpo/beta_used": 0.09825208783149719,
"beta_dpo/beta_used_raw": 0.09825208783149719,
"beta_dpo/gap_mean": 0.019816506654024124,
"beta_dpo/gap_std": 0.37512654066085815,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.05895691609977324,
"grad_norm": 21.145137786865234,
"learning_rate": 2.8358208955223876e-07,
"logits/chosen": 2.1623120307922363,
"logits/rejected": 2.083242654800415,
"loss": 1.3872,
"step": 39
},
{
"beta_dpo/beta_used": 0.09770508855581284,
"beta_dpo/beta_used_raw": 0.09770508855581284,
"beta_dpo/gap_mean": 0.02871175855398178,
"beta_dpo/gap_std": 0.38634994626045227,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.06046863189720333,
"grad_norm": 18.661518096923828,
"learning_rate": 2.9104477611940296e-07,
"logits/chosen": 1.9411481618881226,
"logits/rejected": 1.8581569194793701,
"loss": 1.387,
"step": 40
},
{
"beta_dpo/beta_used": 0.10622584819793701,
"beta_dpo/beta_used_raw": 0.10622584819793701,
"beta_dpo/gap_mean": 0.031016860157251358,
"beta_dpo/gap_std": 0.39376121759414673,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.06198034769463341,
"grad_norm": 22.026676177978516,
"learning_rate": 2.985074626865671e-07,
"logits/chosen": 1.3353779315948486,
"logits/rejected": 1.3540756702423096,
"loss": 1.3724,
"step": 41
},
{
"beta_dpo/beta_used": 0.0990532785654068,
"beta_dpo/beta_used_raw": 0.0990532785654068,
"beta_dpo/gap_mean": 0.03635905683040619,
"beta_dpo/gap_std": 0.3946530222892761,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.06349206349206349,
"grad_norm": 20.40145492553711,
"learning_rate": 3.059701492537313e-07,
"logits/chosen": 1.4092631340026855,
"logits/rejected": 1.3597307205200195,
"loss": 1.3837,
"step": 42
},
{
"beta_dpo/beta_used": 0.09917229413986206,
"beta_dpo/beta_used_raw": 0.09917229413986206,
"beta_dpo/gap_mean": 0.020734082907438278,
"beta_dpo/gap_std": 0.3832094669342041,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.06500377928949358,
"grad_norm": 22.262836456298828,
"learning_rate": 3.134328358208955e-07,
"logits/chosen": 1.4906641244888306,
"logits/rejected": 1.297049641609192,
"loss": 1.3857,
"step": 43
},
{
"beta_dpo/beta_used": 0.10303438454866409,
"beta_dpo/beta_used_raw": 0.10303438454866409,
"beta_dpo/gap_mean": 0.015459949150681496,
"beta_dpo/gap_std": 0.3839811682701111,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.06651549508692366,
"grad_norm": 22.17262840270996,
"learning_rate": 3.2089552238805965e-07,
"logits/chosen": 1.881546139717102,
"logits/rejected": 1.903512954711914,
"loss": 1.3785,
"step": 44
},
{
"beta_dpo/beta_used": 0.0984266847372055,
"beta_dpo/beta_used_raw": 0.0984266847372055,
"beta_dpo/gap_mean": 0.01941034197807312,
"beta_dpo/gap_std": 0.3831687569618225,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.06802721088435375,
"grad_norm": 20.917078018188477,
"learning_rate": 3.2835820895522385e-07,
"logits/chosen": 1.663498044013977,
"logits/rejected": 1.6589391231536865,
"loss": 1.3874,
"step": 45
},
{
"beta_dpo/beta_used": 0.10082878172397614,
"beta_dpo/beta_used_raw": 0.10082878172397614,
"beta_dpo/gap_mean": 0.020450761541724205,
"beta_dpo/gap_std": 0.39133739471435547,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.06953892668178382,
"grad_norm": 20.342512130737305,
"learning_rate": 3.3582089552238805e-07,
"logits/chosen": 1.640000581741333,
"logits/rejected": 1.665790319442749,
"loss": 1.3827,
"step": 46
},
{
"beta_dpo/beta_used": 0.09943661093711853,
"beta_dpo/beta_used_raw": 0.09943661093711853,
"beta_dpo/gap_mean": 0.024434737861156464,
"beta_dpo/gap_std": 0.38775908946990967,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.0710506424792139,
"grad_norm": 18.455398559570312,
"learning_rate": 3.432835820895522e-07,
"logits/chosen": 1.6844422817230225,
"logits/rejected": 1.6502798795700073,
"loss": 1.385,
"step": 47
},
{
"beta_dpo/beta_used": 0.0995083749294281,
"beta_dpo/beta_used_raw": 0.0995083749294281,
"beta_dpo/gap_mean": 0.0223417766392231,
"beta_dpo/gap_std": 0.3805840313434601,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.07256235827664399,
"grad_norm": 18.35866355895996,
"learning_rate": 3.507462686567164e-07,
"logits/chosen": 1.8899521827697754,
"logits/rejected": 1.8364651203155518,
"loss": 1.386,
"step": 48
},
{
"beta_dpo/beta_used": 0.10304830223321915,
"beta_dpo/beta_used_raw": 0.10304830223321915,
"beta_dpo/gap_mean": 0.02127697691321373,
"beta_dpo/gap_std": 0.37601011991500854,
"beta_dpo/mask_keep_frac": 1.0,
"epoch": 0.07407407407407407,
"grad_norm": 20.155515670776367,
"learning_rate": 3.5820895522388055e-07,
"logits/chosen": 1.607337236404419,
"logits/rejected": 1.5268868207931519,
"loss": 1.3787,
"step": 49
},
{
"beta_dpo/beta_used": 0.0964367538690567,
"beta_dpo/beta_used_raw": 0.0964367538690567,
"beta_dpo/gap_mean": 0.017594996839761734,
"beta_dpo/gap_std": 0.36542147397994995,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.07558578987150416,
"grad_norm": 19.53146743774414,
"learning_rate": 3.6567164179104475e-07,
"logits/chosen": 1.6052238941192627,
"logits/rejected": 1.4814239740371704,
"loss": 1.3898,
"step": 50
},
{
"beta_dpo/beta_used": 0.10200951993465424,
"beta_dpo/beta_used_raw": 0.10200951993465424,
"beta_dpo/gap_mean": 0.02372138947248459,
"beta_dpo/gap_std": 0.3648919463157654,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.07709750566893424,
"grad_norm": 22.443063735961914,
"learning_rate": 3.7313432835820895e-07,
"logits/chosen": 1.822296380996704,
"logits/rejected": 1.6959524154663086,
"loss": 1.3805,
"step": 51
},
{
"beta_dpo/beta_used": 0.09915009140968323,
"beta_dpo/beta_used_raw": 0.09915009140968323,
"beta_dpo/gap_mean": 0.02502043917775154,
"beta_dpo/gap_std": 0.37956005334854126,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.07860922146636433,
"grad_norm": 22.795974731445312,
"learning_rate": 3.805970149253731e-07,
"logits/chosen": 2.0509259700775146,
"logits/rejected": 1.8106316328048706,
"loss": 1.3872,
"step": 52
},
{
"beta_dpo/beta_used": 0.10304185748100281,
"beta_dpo/beta_used_raw": 0.10304185748100281,
"beta_dpo/gap_mean": 0.028562255203723907,
"beta_dpo/gap_std": 0.3935072422027588,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.0801209372637944,
"grad_norm": 20.028133392333984,
"learning_rate": 3.880597014925373e-07,
"logits/chosen": 1.4427279233932495,
"logits/rejected": 1.4917798042297363,
"loss": 1.378,
"step": 53
},
{
"beta_dpo/beta_used": 0.10378183424472809,
"beta_dpo/beta_used_raw": 0.10378183424472809,
"beta_dpo/gap_mean": 0.04398445785045624,
"beta_dpo/gap_std": 0.41044336557388306,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.08163265306122448,
"grad_norm": 18.53426170349121,
"learning_rate": 3.9552238805970144e-07,
"logits/chosen": 1.62733793258667,
"logits/rejected": 1.6121970415115356,
"loss": 1.3751,
"step": 54
},
{
"beta_dpo/beta_used": 0.10331679880619049,
"beta_dpo/beta_used_raw": 0.10331679880619049,
"beta_dpo/gap_mean": 0.05415666103363037,
"beta_dpo/gap_std": 0.41562163829803467,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.08314436885865457,
"grad_norm": 22.8297176361084,
"learning_rate": 4.0298507462686564e-07,
"logits/chosen": 1.5697447061538696,
"logits/rejected": 1.358530044555664,
"loss": 1.3757,
"step": 55
},
{
"beta_dpo/beta_used": 0.10186167806386948,
"beta_dpo/beta_used_raw": 0.10186167806386948,
"beta_dpo/gap_mean": 0.06482543796300888,
"beta_dpo/gap_std": 0.4240786135196686,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.08465608465608465,
"grad_norm": 18.29169273376465,
"learning_rate": 4.1044776119402984e-07,
"logits/chosen": 1.5528168678283691,
"logits/rejected": 1.682697057723999,
"loss": 1.3764,
"step": 56
},
{
"beta_dpo/beta_used": 0.10814331471920013,
"beta_dpo/beta_used_raw": 0.10814331471920013,
"beta_dpo/gap_mean": 0.0803925096988678,
"beta_dpo/gap_std": 0.4210129976272583,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.08616780045351474,
"grad_norm": 23.717344284057617,
"learning_rate": 4.17910447761194e-07,
"logits/chosen": 2.119786024093628,
"logits/rejected": 2.0530922412872314,
"loss": 1.3638,
"step": 57
},
{
"beta_dpo/beta_used": 0.09881128370761871,
"beta_dpo/beta_used_raw": 0.09881128370761871,
"beta_dpo/gap_mean": 0.08094684034585953,
"beta_dpo/gap_std": 0.4258253574371338,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.08767951625094482,
"grad_norm": 21.60850715637207,
"learning_rate": 4.253731343283582e-07,
"logits/chosen": 1.5111796855926514,
"logits/rejected": 1.3472614288330078,
"loss": 1.3792,
"step": 58
},
{
"beta_dpo/beta_used": 0.10262042284011841,
"beta_dpo/beta_used_raw": 0.10262042284011841,
"beta_dpo/gap_mean": 0.08273988962173462,
"beta_dpo/gap_std": 0.4285188913345337,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.08919123204837491,
"grad_norm": 18.995695114135742,
"learning_rate": 4.3283582089552234e-07,
"logits/chosen": 1.840743064880371,
"logits/rejected": 1.5437428951263428,
"loss": 1.3735,
"step": 59
},
{
"beta_dpo/beta_used": 0.09564212709665298,
"beta_dpo/beta_used_raw": 0.09564212709665298,
"beta_dpo/gap_mean": 0.08741338551044464,
"beta_dpo/gap_std": 0.4274219870567322,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.09070294784580499,
"grad_norm": 18.22614288330078,
"learning_rate": 4.4029850746268654e-07,
"logits/chosen": 2.1714425086975098,
"logits/rejected": 2.212477684020996,
"loss": 1.3846,
"step": 60
},
{
"beta_dpo/beta_used": 0.0980357974767685,
"beta_dpo/beta_used_raw": 0.0980357974767685,
"beta_dpo/gap_mean": 0.08621242642402649,
"beta_dpo/gap_std": 0.4376525282859802,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.09221466364323508,
"grad_norm": 17.79900550842285,
"learning_rate": 4.4776119402985074e-07,
"logits/chosen": 1.433061122894287,
"logits/rejected": 1.248925805091858,
"loss": 1.3788,
"step": 61
},
{
"beta_dpo/beta_used": 0.10317344218492508,
"beta_dpo/beta_used_raw": 0.10317344218492508,
"beta_dpo/gap_mean": 0.09457056224346161,
"beta_dpo/gap_std": 0.44412726163864136,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.09372637944066516,
"grad_norm": 22.70246696472168,
"learning_rate": 4.552238805970149e-07,
"logits/chosen": 1.8834363222122192,
"logits/rejected": 1.6489927768707275,
"loss": 1.3714,
"step": 62
},
{
"beta_dpo/beta_used": 0.09803298115730286,
"beta_dpo/beta_used_raw": 0.09803298115730286,
"beta_dpo/gap_mean": 0.10839153081178665,
"beta_dpo/gap_std": 0.45854881405830383,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.09523809523809523,
"grad_norm": 20.222421646118164,
"learning_rate": 4.626865671641791e-07,
"logits/chosen": 2.5329227447509766,
"logits/rejected": 2.572336196899414,
"loss": 1.38,
"step": 63
},
{
"beta_dpo/beta_used": 0.09539124369621277,
"beta_dpo/beta_used_raw": 0.09539124369621277,
"beta_dpo/gap_mean": 0.09624745696783066,
"beta_dpo/gap_std": 0.4746573567390442,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.09674981103552532,
"grad_norm": 17.34638214111328,
"learning_rate": 4.701492537313433e-07,
"logits/chosen": 1.5154216289520264,
"logits/rejected": 1.2605938911437988,
"loss": 1.3842,
"step": 64
},
{
"beta_dpo/beta_used": 0.10210136324167252,
"beta_dpo/beta_used_raw": 0.10210136324167252,
"beta_dpo/gap_mean": 0.10235883295536041,
"beta_dpo/gap_std": 0.47725844383239746,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.0982615268329554,
"grad_norm": 21.575925827026367,
"learning_rate": 4.776119402985074e-07,
"logits/chosen": 1.746931552886963,
"logits/rejected": 1.831960678100586,
"loss": 1.3734,
"step": 65
},
{
"beta_dpo/beta_used": 0.10511539876461029,
"beta_dpo/beta_used_raw": 0.10511539876461029,
"beta_dpo/gap_mean": 0.11833730340003967,
"beta_dpo/gap_std": 0.4751163125038147,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.09977324263038549,
"grad_norm": 22.723419189453125,
"learning_rate": 4.850746268656717e-07,
"logits/chosen": 1.9626502990722656,
"logits/rejected": 1.846794605255127,
"loss": 1.3663,
"step": 66
},
{
"beta_dpo/beta_used": 0.10089154541492462,
"beta_dpo/beta_used_raw": 0.10089154541492462,
"beta_dpo/gap_mean": 0.12341433763504028,
"beta_dpo/gap_std": 0.48649847507476807,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.10128495842781557,
"grad_norm": 20.514158248901367,
"learning_rate": 4.925373134328357e-07,
"logits/chosen": 1.7593741416931152,
"logits/rejected": 1.6084721088409424,
"loss": 1.3732,
"step": 67
},
{
"beta_dpo/beta_used": 0.10202755033969879,
"beta_dpo/beta_used_raw": 0.10202755033969879,
"beta_dpo/gap_mean": 0.13001835346221924,
"beta_dpo/gap_std": 0.4906574487686157,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.10279667422524566,
"grad_norm": 23.70347785949707,
"learning_rate": 5e-07,
"logits/chosen": 1.9041826725006104,
"logits/rejected": 1.6085681915283203,
"loss": 1.3687,
"step": 68
},
{
"beta_dpo/beta_used": 0.10150092095136642,
"beta_dpo/beta_used_raw": 0.10150092095136642,
"beta_dpo/gap_mean": 0.12389479577541351,
"beta_dpo/gap_std": 0.4982506036758423,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.10430839002267574,
"grad_norm": 25.167369842529297,
"learning_rate": 4.999965034812934e-07,
"logits/chosen": 1.972752332687378,
"logits/rejected": 1.8324453830718994,
"loss": 1.3729,
"step": 69
},
{
"beta_dpo/beta_used": 0.09665323793888092,
"beta_dpo/beta_used_raw": 0.09665323793888092,
"beta_dpo/gap_mean": 0.14055848121643066,
"beta_dpo/gap_std": 0.5118545293807983,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.10582010582010581,
"grad_norm": 19.85059356689453,
"learning_rate": 4.999860140229787e-07,
"logits/chosen": 1.6715452671051025,
"logits/rejected": 1.7321879863739014,
"loss": 1.3801,
"step": 70
},
{
"beta_dpo/beta_used": 0.09737221896648407,
"beta_dpo/beta_used_raw": 0.09737221896648407,
"beta_dpo/gap_mean": 0.1417197287082672,
"beta_dpo/gap_std": 0.521407425403595,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.1073318216175359,
"grad_norm": 19.420684814453125,
"learning_rate": 4.999685319184688e-07,
"logits/chosen": 1.5145726203918457,
"logits/rejected": 1.5156700611114502,
"loss": 1.3773,
"step": 71
},
{
"beta_dpo/beta_used": 0.10205356776714325,
"beta_dpo/beta_used_raw": 0.10205356776714325,
"beta_dpo/gap_mean": 0.14077220857143402,
"beta_dpo/gap_std": 0.5270059704780579,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.10884353741496598,
"grad_norm": 20.89740562438965,
"learning_rate": 4.999440576567755e-07,
"logits/chosen": 1.4117028713226318,
"logits/rejected": 1.1985228061676025,
"loss": 1.3695,
"step": 72
},
{
"beta_dpo/beta_used": 0.09661644697189331,
"beta_dpo/beta_used_raw": 0.09661644697189331,
"beta_dpo/gap_mean": 0.11880473792552948,
"beta_dpo/gap_std": 0.5475245714187622,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.11035525321239607,
"grad_norm": 19.121309280395508,
"learning_rate": 4.999125919224965e-07,
"logits/chosen": 1.3632001876831055,
"logits/rejected": 1.357191801071167,
"loss": 1.3799,
"step": 73
},
{
"beta_dpo/beta_used": 0.10562695562839508,
"beta_dpo/beta_used_raw": 0.10562695562839508,
"beta_dpo/gap_mean": 0.13782186806201935,
"beta_dpo/gap_std": 0.5654876232147217,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.11186696900982615,
"grad_norm": 21.445409774780273,
"learning_rate": 4.998741355957963e-07,
"logits/chosen": 1.8615484237670898,
"logits/rejected": 1.6468513011932373,
"loss": 1.3625,
"step": 74
},
{
"beta_dpo/beta_used": 0.09824702143669128,
"beta_dpo/beta_used_raw": 0.09824702143669128,
"beta_dpo/gap_mean": 0.15777266025543213,
"beta_dpo/gap_std": 0.5702564716339111,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.11337868480725624,
"grad_norm": 19.100439071655273,
"learning_rate": 4.998286897523808e-07,
"logits/chosen": 1.8663270473480225,
"logits/rejected": 1.7803092002868652,
"loss": 1.373,
"step": 75
},
{
"beta_dpo/beta_used": 0.09762119501829147,
"beta_dpo/beta_used_raw": 0.09762119501829147,
"beta_dpo/gap_mean": 0.1676311492919922,
"beta_dpo/gap_std": 0.5960586071014404,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.11489040060468632,
"grad_norm": 19.10355567932129,
"learning_rate": 4.997762556634679e-07,
"logits/chosen": 1.2259998321533203,
"logits/rejected": 1.106650948524475,
"loss": 1.3749,
"step": 76
},
{
"beta_dpo/beta_used": 0.10073893517255783,
"beta_dpo/beta_used_raw": 0.10073893517255783,
"beta_dpo/gap_mean": 0.2034570872783661,
"beta_dpo/gap_std": 0.6084505319595337,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.1164021164021164,
"grad_norm": 21.278688430786133,
"learning_rate": 4.99716834795752e-07,
"logits/chosen": 1.0265986919403076,
"logits/rejected": 1.192859411239624,
"loss": 1.3651,
"step": 77
},
{
"beta_dpo/beta_used": 0.1015826165676117,
"beta_dpo/beta_used_raw": 0.1015826165676117,
"beta_dpo/gap_mean": 0.20583921670913696,
"beta_dpo/gap_std": 0.6371290683746338,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.11791383219954649,
"grad_norm": 20.532432556152344,
"learning_rate": 4.996504288113623e-07,
"logits/chosen": 1.601978063583374,
"logits/rejected": 1.5848236083984375,
"loss": 1.3664,
"step": 78
},
{
"beta_dpo/beta_used": 0.10259930044412613,
"beta_dpo/beta_used_raw": 0.10259930044412613,
"beta_dpo/gap_mean": 0.2349783480167389,
"beta_dpo/gap_std": 0.6695432066917419,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.11942554799697656,
"grad_norm": 20.652477264404297,
"learning_rate": 4.995770395678171e-07,
"logits/chosen": 1.9225590229034424,
"logits/rejected": 1.9619791507720947,
"loss": 1.3567,
"step": 79
},
{
"beta_dpo/beta_used": 0.0984174907207489,
"beta_dpo/beta_used_raw": 0.0984174907207489,
"beta_dpo/gap_mean": 0.2428167164325714,
"beta_dpo/gap_std": 0.7031147480010986,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.12093726379440665,
"grad_norm": 19.037858963012695,
"learning_rate": 4.994966691179711e-07,
"logits/chosen": 1.53074312210083,
"logits/rejected": 1.2819523811340332,
"loss": 1.3637,
"step": 80
},
{
"beta_dpo/beta_used": 0.10066931694746017,
"beta_dpo/beta_used_raw": 0.10066931694746017,
"beta_dpo/gap_mean": 0.2573161721229553,
"beta_dpo/gap_std": 0.7247613668441772,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.12244897959183673,
"grad_norm": 20.873491287231445,
"learning_rate": 4.994093197099587e-07,
"logits/chosen": 1.4366528987884521,
"logits/rejected": 1.3545148372650146,
"loss": 1.36,
"step": 81
},
{
"beta_dpo/beta_used": 0.10298150777816772,
"beta_dpo/beta_used_raw": 0.10298150777816772,
"beta_dpo/gap_mean": 0.28897643089294434,
"beta_dpo/gap_std": 0.7567130327224731,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.12396069538926682,
"grad_norm": 19.724485397338867,
"learning_rate": 4.993149937871306e-07,
"logits/chosen": 1.6228649616241455,
"logits/rejected": 1.4233934879302979,
"loss": 1.3517,
"step": 82
},
{
"beta_dpo/beta_used": 0.10570499300956726,
"beta_dpo/beta_used_raw": 0.10570499300956726,
"beta_dpo/gap_mean": 0.31459736824035645,
"beta_dpo/gap_std": 0.752688467502594,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.1254724111866969,
"grad_norm": 21.929424285888672,
"learning_rate": 4.992136939879856e-07,
"logits/chosen": 1.4179167747497559,
"logits/rejected": 1.0729384422302246,
"loss": 1.3439,
"step": 83
},
{
"beta_dpo/beta_used": 0.10633272677659988,
"beta_dpo/beta_used_raw": 0.10633272677659988,
"beta_dpo/gap_mean": 0.33907008171081543,
"beta_dpo/gap_std": 0.7554141283035278,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.12698412698412698,
"grad_norm": 22.371440887451172,
"learning_rate": 4.991054231460969e-07,
"logits/chosen": 1.7531371116638184,
"logits/rejected": 1.5867257118225098,
"loss": 1.3399,
"step": 84
},
{
"beta_dpo/beta_used": 0.09879690408706665,
"beta_dpo/beta_used_raw": 0.09879690408706665,
"beta_dpo/gap_mean": 0.3559741973876953,
"beta_dpo/gap_std": 0.7542663812637329,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.12849584278155707,
"grad_norm": 18.66929817199707,
"learning_rate": 4.989901842900325e-07,
"logits/chosen": 2.157787322998047,
"logits/rejected": 2.0606753826141357,
"loss": 1.3527,
"step": 85
},
{
"beta_dpo/beta_used": 0.09159150719642639,
"beta_dpo/beta_used_raw": 0.09159150719642639,
"beta_dpo/gap_mean": 0.3374265432357788,
"beta_dpo/gap_std": 0.7478652000427246,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.13000755857898716,
"grad_norm": 18.308496475219727,
"learning_rate": 4.988679806432711e-07,
"logits/chosen": 1.6739656925201416,
"logits/rejected": 1.657767653465271,
"loss": 1.3675,
"step": 86
},
{
"beta_dpo/beta_used": 0.10155273973941803,
"beta_dpo/beta_used_raw": 0.10155273973941803,
"beta_dpo/gap_mean": 0.35617873072624207,
"beta_dpo/gap_std": 0.7686408162117004,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.13151927437641722,
"grad_norm": 19.75534439086914,
"learning_rate": 4.987388156241114e-07,
"logits/chosen": 1.3862335681915283,
"logits/rejected": 1.216930627822876,
"loss": 1.3498,
"step": 87
},
{
"beta_dpo/beta_used": 0.09993347525596619,
"beta_dpo/beta_used_raw": 0.09993347525596619,
"beta_dpo/gap_mean": 0.36177581548690796,
"beta_dpo/gap_std": 0.7989368438720703,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.1330309901738473,
"grad_norm": 20.010814666748047,
"learning_rate": 4.986026928455767e-07,
"logits/chosen": 1.2944331169128418,
"logits/rejected": 1.2594363689422607,
"loss": 1.348,
"step": 88
},
{
"beta_dpo/beta_used": 0.09803235530853271,
"beta_dpo/beta_used_raw": 0.09803235530853271,
"beta_dpo/gap_mean": 0.3744267523288727,
"beta_dpo/gap_std": 0.8301786184310913,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.1345427059712774,
"grad_norm": 17.966041564941406,
"learning_rate": 4.984596161153135e-07,
"logits/chosen": 1.8058236837387085,
"logits/rejected": 1.5427722930908203,
"loss": 1.3533,
"step": 89
},
{
"beta_dpo/beta_used": 0.1015101820230484,
"beta_dpo/beta_used_raw": 0.1015101820230484,
"beta_dpo/gap_mean": 0.4083542227745056,
"beta_dpo/gap_std": 0.875269889831543,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.1360544217687075,
"grad_norm": 33.3956298828125,
"learning_rate": 4.983095894354857e-07,
"logits/chosen": 1.816709280014038,
"logits/rejected": 1.4326956272125244,
"loss": 1.3432,
"step": 90
},
{
"beta_dpo/beta_used": 0.10312025249004364,
"beta_dpo/beta_used_raw": 0.10312025249004364,
"beta_dpo/gap_mean": 0.40964722633361816,
"beta_dpo/gap_std": 0.9051263332366943,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.13756613756613756,
"grad_norm": 19.644180297851562,
"learning_rate": 4.98152617002662e-07,
"logits/chosen": 2.385554790496826,
"logits/rejected": 2.0319085121154785,
"loss": 1.3437,
"step": 91
},
{
"beta_dpo/beta_used": 0.09450967609882355,
"beta_dpo/beta_used_raw": 0.09450967609882355,
"beta_dpo/gap_mean": 0.4264791011810303,
"beta_dpo/gap_std": 0.934371829032898,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.13907785336356765,
"grad_norm": 18.947912216186523,
"learning_rate": 4.979887032076988e-07,
"logits/chosen": 1.7886258363723755,
"logits/rejected": 1.6689845323562622,
"loss": 1.3548,
"step": 92
},
{
"beta_dpo/beta_used": 0.09470728039741516,
"beta_dpo/beta_used_raw": 0.09470728039741516,
"beta_dpo/gap_mean": 0.4246646761894226,
"beta_dpo/gap_std": 0.9929322004318237,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.14058956916099774,
"grad_norm": 16.796300888061523,
"learning_rate": 4.978178526356172e-07,
"logits/chosen": 1.7977063655853271,
"logits/rejected": 1.609261393547058,
"loss": 1.3555,
"step": 93
},
{
"beta_dpo/beta_used": 0.11019230633974075,
"beta_dpo/beta_used_raw": 0.11019230633974075,
"beta_dpo/gap_mean": 0.47246092557907104,
"beta_dpo/gap_std": 1.048844575881958,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.1421012849584278,
"grad_norm": 37.086185455322266,
"learning_rate": 4.976400700654751e-07,
"logits/chosen": 1.6770544052124023,
"logits/rejected": 1.748682975769043,
"loss": 1.3159,
"step": 94
},
{
"beta_dpo/beta_used": 0.1036120057106018,
"beta_dpo/beta_used_raw": 0.1036120057106018,
"beta_dpo/gap_mean": 0.4928344488143921,
"beta_dpo/gap_std": 1.090996503829956,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.1436130007558579,
"grad_norm": 21.989009857177734,
"learning_rate": 4.974553604702332e-07,
"logits/chosen": 1.2581148147583008,
"logits/rejected": 1.1436889171600342,
"loss": 1.337,
"step": 95
},
{
"beta_dpo/beta_used": 0.08425632119178772,
"beta_dpo/beta_used_raw": 0.08425632119178772,
"beta_dpo/gap_mean": 0.5038758516311646,
"beta_dpo/gap_std": 1.110231637954712,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.14512471655328799,
"grad_norm": 16.523128509521484,
"learning_rate": 4.972637290166157e-07,
"logits/chosen": 1.5648579597473145,
"logits/rejected": 1.2802821397781372,
"loss": 1.3627,
"step": 96
},
{
"beta_dpo/beta_used": 0.08535897731781006,
"beta_dpo/beta_used_raw": 0.08535897731781006,
"beta_dpo/gap_mean": 0.48465272784233093,
"beta_dpo/gap_std": 1.1230860948562622,
"beta_dpo/mask_keep_frac": 0.5625,
"epoch": 0.14663643235071808,
"grad_norm": 18.421024322509766,
"learning_rate": 4.970651810649666e-07,
"logits/chosen": 1.367387294769287,
"logits/rejected": 1.616769552230835,
"loss": 1.3704,
"step": 97
},
{
"beta_dpo/beta_used": 0.10194718837738037,
"beta_dpo/beta_used_raw": 0.10194718837738037,
"beta_dpo/gap_mean": 0.44564807415008545,
"beta_dpo/gap_std": 1.1249895095825195,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.14814814814814814,
"grad_norm": 20.25482749938965,
"learning_rate": 4.968597221690985e-07,
"logits/chosen": 1.3039189577102661,
"logits/rejected": 1.3426978588104248,
"loss": 1.3431,
"step": 98
},
{
"beta_dpo/beta_used": 0.09411941468715668,
"beta_dpo/beta_used_raw": 0.09411941468715668,
"beta_dpo/gap_mean": 0.4496995806694031,
"beta_dpo/gap_std": 1.2001111507415771,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.14965986394557823,
"grad_norm": 20.5500545501709,
"learning_rate": 4.966473580761389e-07,
"logits/chosen": 1.912778615951538,
"logits/rejected": 1.7203798294067383,
"loss": 1.3597,
"step": 99
},
{
"beta_dpo/beta_used": 0.10309496521949768,
"beta_dpo/beta_used_raw": 0.10309496521949768,
"beta_dpo/gap_mean": 0.4821561872959137,
"beta_dpo/gap_std": 1.2820096015930176,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.15117157974300832,
"grad_norm": 22.72227668762207,
"learning_rate": 4.964280947263676e-07,
"logits/chosen": 1.8114492893218994,
"logits/rejected": 1.7894963026046753,
"loss": 1.3382,
"step": 100
},
{
"epoch": 0.15117157974300832,
"eval_beta_dpo/beta_used": 0.10933709144592285,
"eval_beta_dpo/beta_used_raw": 0.10933709144592285,
"eval_beta_dpo/gap_mean": 0.5103484392166138,
"eval_beta_dpo/gap_std": 1.3374193906784058,
"eval_beta_dpo/mask_keep_frac": 1.0,
"eval_logits/chosen": 1.8052293062210083,
"eval_logits/rejected": 1.6945847272872925,
"eval_loss": 0.6595867276191711,
"eval_runtime": 42.667,
"eval_samples_per_second": 53.976,
"eval_steps_per_second": 1.687,
"step": 100
},
{
"beta_dpo/beta_used": 0.10079428553581238,
"beta_dpo/beta_used_raw": 0.10079428553581238,
"beta_dpo/gap_mean": 0.5434271097183228,
"beta_dpo/gap_std": 1.337038278579712,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.15268329554043839,
"grad_norm": 18.15898323059082,
"learning_rate": 4.96201938253052e-07,
"logits/chosen": 0.826664924621582,
"logits/rejected": 0.6784051060676575,
"loss": 1.3319,
"step": 101
},
{
"beta_dpo/beta_used": 0.08604797720909119,
"beta_dpo/beta_used_raw": 0.08604797720909119,
"beta_dpo/gap_mean": 0.5208926200866699,
"beta_dpo/gap_std": 1.3681602478027344,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.15419501133786848,
"grad_norm": 16.30609703063965,
"learning_rate": 4.959688949822748e-07,
"logits/chosen": 1.3592952489852905,
"logits/rejected": 1.462346076965332,
"loss": 1.3624,
"step": 102
},
{
"beta_dpo/beta_used": 0.10142231732606888,
"beta_dpo/beta_used_raw": 0.10142231732606888,
"beta_dpo/gap_mean": 0.5772026181221008,
"beta_dpo/gap_std": 1.416075348854065,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.15570672713529857,
"grad_norm": 19.743104934692383,
"learning_rate": 4.957289714327572e-07,
"logits/chosen": 1.7426373958587646,
"logits/rejected": 1.7846993207931519,
"loss": 1.3296,
"step": 103
},
{
"beta_dpo/beta_used": 0.09698724746704102,
"beta_dpo/beta_used_raw": 0.09698724746704102,
"beta_dpo/gap_mean": 0.6326186656951904,
"beta_dpo/gap_std": 1.4738898277282715,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.15721844293272866,
"grad_norm": 20.580957412719727,
"learning_rate": 4.954821743156767e-07,
"logits/chosen": 1.9435052871704102,
"logits/rejected": 1.6555917263031006,
"loss": 1.33,
"step": 104
},
{
"beta_dpo/beta_used": 0.10263784229755402,
"beta_dpo/beta_used_raw": 0.10263784229755402,
"beta_dpo/gap_mean": 0.6641653776168823,
"beta_dpo/gap_std": 1.6008001565933228,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.15873015873015872,
"grad_norm": 21.733863830566406,
"learning_rate": 4.952285105344791e-07,
"logits/chosen": 1.7513964176177979,
"logits/rejected": 1.516118049621582,
"loss": 1.3237,
"step": 105
},
{
"beta_dpo/beta_used": 0.10779309272766113,
"beta_dpo/beta_used_raw": 0.10779309272766113,
"beta_dpo/gap_mean": 0.674252450466156,
"beta_dpo/gap_std": 1.6650457382202148,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.1602418745275888,
"grad_norm": 21.42998504638672,
"learning_rate": 4.949679871846857e-07,
"logits/chosen": 1.7994898557662964,
"logits/rejected": 1.6022930145263672,
"loss": 1.3007,
"step": 106
},
{
"beta_dpo/beta_used": 0.08230964839458466,
"beta_dpo/beta_used_raw": 0.08230964839458466,
"beta_dpo/gap_mean": 0.6531383395195007,
"beta_dpo/gap_std": 1.6853680610656738,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.1617535903250189,
"grad_norm": 17.759193420410156,
"learning_rate": 4.947006115536947e-07,
"logits/chosen": 1.3034350872039795,
"logits/rejected": 1.4832148551940918,
"loss": 1.3548,
"step": 107
},
{
"beta_dpo/beta_used": 0.10429038107395172,
"beta_dpo/beta_used_raw": 0.10429038107395172,
"beta_dpo/gap_mean": 0.6379518508911133,
"beta_dpo/gap_std": 1.6854841709136963,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.16326530612244897,
"grad_norm": 20.740493774414062,
"learning_rate": 4.944263911205772e-07,
"logits/chosen": 1.1033403873443604,
"logits/rejected": 0.858239471912384,
"loss": 1.313,
"step": 108
},
{
"beta_dpo/beta_used": 0.08762006461620331,
"beta_dpo/beta_used_raw": 0.08762006461620331,
"beta_dpo/gap_mean": 0.6862611770629883,
"beta_dpo/gap_std": 1.7554314136505127,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.16477702191987906,
"grad_norm": 17.178913116455078,
"learning_rate": 4.941453335558681e-07,
"logits/chosen": 1.7482877969741821,
"logits/rejected": 1.5088105201721191,
"loss": 1.3494,
"step": 109
},
{
"beta_dpo/beta_used": 0.0838538110256195,
"beta_dpo/beta_used_raw": 0.0838538110256195,
"beta_dpo/gap_mean": 0.6247843503952026,
"beta_dpo/gap_std": 1.8059306144714355,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.16628873771730915,
"grad_norm": 17.97353744506836,
"learning_rate": 4.938574467213517e-07,
"logits/chosen": 1.412921667098999,
"logits/rejected": 1.4799084663391113,
"loss": 1.3564,
"step": 110
},
{
"beta_dpo/beta_used": 0.10175025463104248,
"beta_dpo/beta_used_raw": 0.10175025463104248,
"beta_dpo/gap_mean": 0.6145044565200806,
"beta_dpo/gap_std": 1.817657470703125,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.16780045351473924,
"grad_norm": 19.397005081176758,
"learning_rate": 4.935627386698418e-07,
"logits/chosen": 1.7223619222640991,
"logits/rejected": 1.7298330068588257,
"loss": 1.3224,
"step": 111
},
{
"beta_dpo/beta_used": 0.10769188404083252,
"beta_dpo/beta_used_raw": 0.10769188404083252,
"beta_dpo/gap_mean": 0.6791462898254395,
"beta_dpo/gap_std": 1.8483753204345703,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.1693121693121693,
"grad_norm": 22.855987548828125,
"learning_rate": 4.932612176449559e-07,
"logits/chosen": 1.450548529624939,
"logits/rejected": 1.254005789756775,
"loss": 1.3142,
"step": 112
},
{
"beta_dpo/beta_used": 0.10324016213417053,
"beta_dpo/beta_used_raw": 0.10324016213417053,
"beta_dpo/gap_mean": 0.6565523743629456,
"beta_dpo/gap_std": 1.891095757484436,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.1708238851095994,
"grad_norm": 18.695514678955078,
"learning_rate": 4.929528920808854e-07,
"logits/chosen": 1.0113716125488281,
"logits/rejected": 1.1878894567489624,
"loss": 1.3211,
"step": 113
},
{
"beta_dpo/beta_used": 0.09588593244552612,
"beta_dpo/beta_used_raw": 0.09588593244552612,
"beta_dpo/gap_mean": 0.7151613235473633,
"beta_dpo/gap_std": 1.9551301002502441,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.17233560090702948,
"grad_norm": 19.19485855102539,
"learning_rate": 4.92637770602159e-07,
"logits/chosen": 2.16898250579834,
"logits/rejected": 2.159493923187256,
"loss": 1.328,
"step": 114
},
{
"beta_dpo/beta_used": 0.09995594620704651,
"beta_dpo/beta_used_raw": 0.09995594620704651,
"beta_dpo/gap_mean": 0.7490643858909607,
"beta_dpo/gap_std": 1.9629037380218506,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.17384731670445955,
"grad_norm": 19.53900718688965,
"learning_rate": 4.923158620234019e-07,
"logits/chosen": 1.8840827941894531,
"logits/rejected": 1.6221849918365479,
"loss": 1.3267,
"step": 115
},
{
"beta_dpo/beta_used": 0.10833384841680527,
"beta_dpo/beta_used_raw": 0.10833384841680527,
"beta_dpo/gap_mean": 0.7924877405166626,
"beta_dpo/gap_std": 1.9609473943710327,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.17535903250188964,
"grad_norm": 22.43212890625,
"learning_rate": 4.91987175349089e-07,
"logits/chosen": 1.73817777633667,
"logits/rejected": 1.499211072921753,
"loss": 1.2895,
"step": 116
},
{
"beta_dpo/beta_used": 0.09760797768831253,
"beta_dpo/beta_used_raw": 0.09760797768831253,
"beta_dpo/gap_mean": 0.9041982889175415,
"beta_dpo/gap_std": 1.9778673648834229,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.17687074829931973,
"grad_norm": 18.086820602416992,
"learning_rate": 4.916517197732933e-07,
"logits/chosen": 1.8457000255584717,
"logits/rejected": 1.6579217910766602,
"loss": 1.3005,
"step": 117
},
{
"beta_dpo/beta_used": 0.08526713401079178,
"beta_dpo/beta_used_raw": 0.08526713401079178,
"beta_dpo/gap_mean": 0.8945071697235107,
"beta_dpo/gap_std": 2.001413345336914,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.17838246409674982,
"grad_norm": 17.47612190246582,
"learning_rate": 4.913095046794281e-07,
"logits/chosen": 0.9872667789459229,
"logits/rejected": 0.9681127071380615,
"loss": 1.3328,
"step": 118
},
{
"beta_dpo/beta_used": 0.09429244697093964,
"beta_dpo/beta_used_raw": 0.09429244697093964,
"beta_dpo/gap_mean": 0.8825496435165405,
"beta_dpo/gap_std": 2.0767202377319336,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.17989417989417988,
"grad_norm": 17.861783981323242,
"learning_rate": 4.909605396399855e-07,
"logits/chosen": 1.8808125257492065,
"logits/rejected": 2.0650906562805176,
"loss": 1.3181,
"step": 119
},
{
"beta_dpo/beta_used": 0.1025141030550003,
"beta_dpo/beta_used_raw": 0.1025141030550003,
"beta_dpo/gap_mean": 0.9791843891143799,
"beta_dpo/gap_std": 2.105668067932129,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.18140589569160998,
"grad_norm": 21.278732299804688,
"learning_rate": 4.906048344162676e-07,
"logits/chosen": 1.837113380432129,
"logits/rejected": 1.619814395904541,
"loss": 1.292,
"step": 120
},
{
"beta_dpo/beta_used": 0.08260353654623032,
"beta_dpo/beta_used_raw": 0.08260353654623032,
"beta_dpo/gap_mean": 1.0040740966796875,
"beta_dpo/gap_std": 2.16635799407959,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.18291761148904007,
"grad_norm": 18.3269100189209,
"learning_rate": 4.902423989581143e-07,
"logits/chosen": 1.975892186164856,
"logits/rejected": 1.6966898441314697,
"loss": 1.335,
"step": 121
},
{
"beta_dpo/beta_used": 0.08021458238363266,
"beta_dpo/beta_used_raw": 0.08021458238363266,
"beta_dpo/gap_mean": 0.9787734746932983,
"beta_dpo/gap_std": 2.237492561340332,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.18442932728647016,
"grad_norm": 17.738651275634766,
"learning_rate": 4.898732434036243e-07,
"logits/chosen": 1.569380283355713,
"logits/rejected": 1.292022466659546,
"loss": 1.336,
"step": 122
},
{
"beta_dpo/beta_used": 0.10076497495174408,
"beta_dpo/beta_used_raw": 0.10076497495174408,
"beta_dpo/gap_mean": 0.9931870698928833,
"beta_dpo/gap_std": 2.2701330184936523,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.18594104308390022,
"grad_norm": 18.512353897094727,
"learning_rate": 4.894973780788722e-07,
"logits/chosen": 1.728874921798706,
"logits/rejected": 1.3382471799850464,
"loss": 1.2965,
"step": 123
},
{
"beta_dpo/beta_used": 0.10337992012500763,
"beta_dpo/beta_used_raw": 0.10337992012500763,
"beta_dpo/gap_mean": 1.021672010421753,
"beta_dpo/gap_std": 2.3348677158355713,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.1874527588813303,
"grad_norm": 20.14508628845215,
"learning_rate": 4.89114813497619e-07,
"logits/chosen": 1.8601114749908447,
"logits/rejected": 1.4402656555175781,
"loss": 1.2728,
"step": 124
},
{
"beta_dpo/beta_used": 0.11034771800041199,
"beta_dpo/beta_used_raw": 0.11034771800041199,
"beta_dpo/gap_mean": 1.1416581869125366,
"beta_dpo/gap_std": 2.3627383708953857,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.1889644746787604,
"grad_norm": 26.62055778503418,
"learning_rate": 4.887255603610184e-07,
"logits/chosen": 2.262009382247925,
"logits/rejected": 1.918278694152832,
"loss": 1.2408,
"step": 125
},
{
"beta_dpo/beta_used": 0.07098745554685593,
"beta_dpo/beta_used_raw": 0.07098745554685593,
"beta_dpo/gap_mean": 1.104027271270752,
"beta_dpo/gap_std": 2.409133195877075,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.19047619047619047,
"grad_norm": 15.002470970153809,
"learning_rate": 4.883296295573176e-07,
"logits/chosen": 1.2601641416549683,
"logits/rejected": 1.2013548612594604,
"loss": 1.3446,
"step": 126
},
{
"beta_dpo/beta_used": 0.11030158400535583,
"beta_dpo/beta_used_raw": 0.11030158400535583,
"beta_dpo/gap_mean": 1.2010130882263184,
"beta_dpo/gap_std": 2.4333302974700928,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.19198790627362056,
"grad_norm": 19.414600372314453,
"learning_rate": 4.87927032161552e-07,
"logits/chosen": 2.3014814853668213,
"logits/rejected": 2.174217939376831,
"loss": 1.2273,
"step": 127
},
{
"beta_dpo/beta_used": 0.10572034865617752,
"beta_dpo/beta_used_raw": 0.10572034865617752,
"beta_dpo/gap_mean": 1.1918036937713623,
"beta_dpo/gap_std": 2.564605236053467,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.19349962207105065,
"grad_norm": 23.319074630737305,
"learning_rate": 4.875177794352363e-07,
"logits/chosen": 1.6076852083206177,
"logits/rejected": 1.4918580055236816,
"loss": 1.26,
"step": 128
},
{
"beta_dpo/beta_used": 0.0822620838880539,
"beta_dpo/beta_used_raw": 0.0822620838880539,
"beta_dpo/gap_mean": 1.131927251815796,
"beta_dpo/gap_std": 2.708618640899658,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.19501133786848074,
"grad_norm": 16.706241607666016,
"learning_rate": 4.871018828260491e-07,
"logits/chosen": 1.3592138290405273,
"logits/rejected": 1.2817442417144775,
"loss": 1.3246,
"step": 129
},
{
"beta_dpo/beta_used": 0.08625729382038116,
"beta_dpo/beta_used_raw": 0.08625729382038116,
"beta_dpo/gap_mean": 1.1664378643035889,
"beta_dpo/gap_std": 2.7734792232513428,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.1965230536659108,
"grad_norm": 17.94251823425293,
"learning_rate": 4.866793539675126e-07,
"logits/chosen": 1.725219488143921,
"logits/rejected": 1.631973147392273,
"loss": 1.3189,
"step": 130
},
{
"beta_dpo/beta_used": 0.11515168845653534,
"beta_dpo/beta_used_raw": 0.11515168845653534,
"beta_dpo/gap_mean": 1.2549471855163574,
"beta_dpo/gap_std": 2.857564687728882,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.1980347694633409,
"grad_norm": 22.736597061157227,
"learning_rate": 4.86250204678667e-07,
"logits/chosen": 1.5887306928634644,
"logits/rejected": 1.3573846817016602,
"loss": 1.2451,
"step": 131
},
{
"beta_dpo/beta_used": 0.11845074594020844,
"beta_dpo/beta_used_raw": 0.11845074594020844,
"beta_dpo/gap_mean": 1.2605340480804443,
"beta_dpo/gap_std": 2.817857265472412,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.19954648526077098,
"grad_norm": 25.156320571899414,
"learning_rate": 4.858144469637408e-07,
"logits/chosen": 1.8204238414764404,
"logits/rejected": 1.8674492835998535,
"loss": 1.2091,
"step": 132
},
{
"beta_dpo/beta_used": 0.08995058387517929,
"beta_dpo/beta_used_raw": 0.08995058387517929,
"beta_dpo/gap_mean": 1.248791217803955,
"beta_dpo/gap_std": 2.8684797286987305,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.20105820105820105,
"grad_norm": 17.265727996826172,
"learning_rate": 4.853720930118138e-07,
"logits/chosen": 1.4804385900497437,
"logits/rejected": 1.4534518718719482,
"loss": 1.3045,
"step": 133
},
{
"beta_dpo/beta_used": 0.08332835137844086,
"beta_dpo/beta_used_raw": 0.08332835137844086,
"beta_dpo/gap_mean": 1.3323745727539062,
"beta_dpo/gap_std": 2.947547435760498,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.20256991685563114,
"grad_norm": 16.170001983642578,
"learning_rate": 4.849231551964771e-07,
"logits/chosen": 2.1145153045654297,
"logits/rejected": 2.052708148956299,
"loss": 1.31,
"step": 134
},
{
"beta_dpo/beta_used": 0.10449250787496567,
"beta_dpo/beta_used_raw": 0.10449250787496567,
"beta_dpo/gap_mean": 1.3598275184631348,
"beta_dpo/gap_std": 2.9727349281311035,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.20408163265306123,
"grad_norm": 20.1523380279541,
"learning_rate": 4.844676460754862e-07,
"logits/chosen": 1.7255396842956543,
"logits/rejected": 1.7618924379348755,
"loss": 1.2582,
"step": 135
},
{
"beta_dpo/beta_used": 0.10133585333824158,
"beta_dpo/beta_used_raw": 0.10133585333824158,
"beta_dpo/gap_mean": 1.4881207942962646,
"beta_dpo/gap_std": 3.178489923477173,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.20559334845049132,
"grad_norm": 22.073606491088867,
"learning_rate": 4.840055783904106e-07,
"logits/chosen": 1.3234667778015137,
"logits/rejected": 1.4829561710357666,
"loss": 1.2474,
"step": 136
},
{
"beta_dpo/beta_used": 0.09466598182916641,
"beta_dpo/beta_used_raw": 0.09466598182916641,
"beta_dpo/gap_mean": 1.4787802696228027,
"beta_dpo/gap_std": 3.2747902870178223,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.20710506424792138,
"grad_norm": 18.009923934936523,
"learning_rate": 4.835369650662767e-07,
"logits/chosen": 1.520973801612854,
"logits/rejected": 1.3727699518203735,
"loss": 1.2673,
"step": 137
},
{
"beta_dpo/beta_used": 0.08019311726093292,
"beta_dpo/beta_used_raw": 0.08019311726093292,
"beta_dpo/gap_mean": 1.5378533601760864,
"beta_dpo/gap_std": 3.3426733016967773,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.20861678004535147,
"grad_norm": 15.670638084411621,
"learning_rate": 4.830618192112065e-07,
"logits/chosen": 1.321858286857605,
"logits/rejected": 1.368009090423584,
"loss": 1.3035,
"step": 138
},
{
"beta_dpo/beta_used": 0.10574564337730408,
"beta_dpo/beta_used_raw": 0.10574564337730408,
"beta_dpo/gap_mean": 1.5016133785247803,
"beta_dpo/gap_std": 3.4950404167175293,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.21012849584278157,
"grad_norm": 24.677949905395508,
"learning_rate": 4.825801541160509e-07,
"logits/chosen": 1.1677018404006958,
"logits/rejected": 1.1444388628005981,
"loss": 1.2488,
"step": 139
},
{
"beta_dpo/beta_used": 0.12155772745609283,
"beta_dpo/beta_used_raw": 0.12155772745609283,
"beta_dpo/gap_mean": 1.6673638820648193,
"beta_dpo/gap_std": 3.619114398956299,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.21164021164021163,
"grad_norm": 26.427644729614258,
"learning_rate": 4.820919832540181e-07,
"logits/chosen": 1.3720524311065674,
"logits/rejected": 1.4061660766601562,
"loss": 1.2148,
"step": 140
},
{
"beta_dpo/beta_used": 0.12430672347545624,
"beta_dpo/beta_used_raw": 0.12430672347545624,
"beta_dpo/gap_mean": 1.8043220043182373,
"beta_dpo/gap_std": 3.798015594482422,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.21315192743764172,
"grad_norm": 22.859867095947266,
"learning_rate": 4.815973202802966e-07,
"logits/chosen": 1.7817519903182983,
"logits/rejected": 1.6559662818908691,
"loss": 1.1736,
"step": 141
},
{
"beta_dpo/beta_used": 0.0646064430475235,
"beta_dpo/beta_used_raw": 0.0646064430475235,
"beta_dpo/gap_mean": 1.7735939025878906,
"beta_dpo/gap_std": 3.847339630126953,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.2146636432350718,
"grad_norm": 13.155664443969727,
"learning_rate": 4.810961790316729e-07,
"logits/chosen": 1.7617301940917969,
"logits/rejected": 1.6993064880371094,
"loss": 1.3175,
"step": 142
},
{
"beta_dpo/beta_used": 0.10804080963134766,
"beta_dpo/beta_used_raw": 0.10804080963134766,
"beta_dpo/gap_mean": 1.6947863101959229,
"beta_dpo/gap_std": 3.9340009689331055,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.2161753590325019,
"grad_norm": 21.003841400146484,
"learning_rate": 4.805885735261454e-07,
"logits/chosen": 1.9638588428497314,
"logits/rejected": 1.8217556476593018,
"loss": 1.2281,
"step": 143
},
{
"beta_dpo/beta_used": 0.08677056431770325,
"beta_dpo/beta_used_raw": 0.08677056431770325,
"beta_dpo/gap_mean": 1.650296926498413,
"beta_dpo/gap_std": 4.212867259979248,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.21768707482993196,
"grad_norm": 18.106487274169922,
"learning_rate": 4.800745179625307e-07,
"logits/chosen": 1.9297895431518555,
"logits/rejected": 1.861382246017456,
"loss": 1.2954,
"step": 144
},
{
"beta_dpo/beta_used": 0.16803482174873352,
"beta_dpo/beta_used_raw": 0.16803482174873352,
"beta_dpo/gap_mean": 1.6556284427642822,
"beta_dpo/gap_std": 4.449127674102783,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.21919879062736206,
"grad_norm": 31.1252384185791,
"learning_rate": 4.795540267200686e-07,
"logits/chosen": 1.0947413444519043,
"logits/rejected": 1.0834863185882568,
"loss": 1.0266,
"step": 145
},
{
"beta_dpo/beta_used": 0.12755730748176575,
"beta_dpo/beta_used_raw": 0.12755730748176575,
"beta_dpo/gap_mean": 1.6483957767486572,
"beta_dpo/gap_std": 4.460909366607666,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.22071050642479215,
"grad_norm": 23.488847732543945,
"learning_rate": 4.790271143580173e-07,
"logits/chosen": 1.5351951122283936,
"logits/rejected": 1.6117818355560303,
"loss": 1.1962,
"step": 146
},
{
"beta_dpo/beta_used": 0.06719569861888885,
"beta_dpo/beta_used_raw": 0.06719569861888885,
"beta_dpo/gap_mean": 1.6245243549346924,
"beta_dpo/gap_std": 4.596627235412598,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.2222222222222222,
"grad_norm": 15.262714385986328,
"learning_rate": 4.784937956152489e-07,
"logits/chosen": 1.4650212526321411,
"logits/rejected": 1.396628737449646,
"loss": 1.3401,
"step": 147
},
{
"beta_dpo/beta_used": 0.15461790561676025,
"beta_dpo/beta_used_raw": 0.15461790561676025,
"beta_dpo/gap_mean": 1.712737798690796,
"beta_dpo/gap_std": 4.725405216217041,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.2237339380196523,
"grad_norm": 24.29132080078125,
"learning_rate": 4.779540854098347e-07,
"logits/chosen": 2.417107105255127,
"logits/rejected": 2.175968647003174,
"loss": 1.0703,
"step": 148
},
{
"beta_dpo/beta_used": 0.09070044755935669,
"beta_dpo/beta_used_raw": 0.09070044755935669,
"beta_dpo/gap_mean": 1.810120701789856,
"beta_dpo/gap_std": 4.730660438537598,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.2252456538170824,
"grad_norm": 18.492767333984375,
"learning_rate": 4.774079988386296e-07,
"logits/chosen": 1.2221202850341797,
"logits/rejected": 1.3723053932189941,
"loss": 1.2579,
"step": 149
},
{
"beta_dpo/beta_used": 0.11454713344573975,
"beta_dpo/beta_used_raw": 0.11454713344573975,
"beta_dpo/gap_mean": 2.0056028366088867,
"beta_dpo/gap_std": 5.036479949951172,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.22675736961451248,
"grad_norm": 26.087398529052734,
"learning_rate": 4.768555511768486e-07,
"logits/chosen": 1.4887826442718506,
"logits/rejected": 1.5462052822113037,
"loss": 1.2657,
"step": 150
},
{
"beta_dpo/beta_used": 0.1551978588104248,
"beta_dpo/beta_used_raw": 0.1551978588104248,
"beta_dpo/gap_mean": 2.2529079914093018,
"beta_dpo/gap_std": 5.154142379760742,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.22826908541194255,
"grad_norm": 25.759302139282227,
"learning_rate": 4.762967578776406e-07,
"logits/chosen": 1.5137187242507935,
"logits/rejected": 1.3097262382507324,
"loss": 1.0226,
"step": 151
},
{
"beta_dpo/beta_used": 0.10738147795200348,
"beta_dpo/beta_used_raw": 0.10738147795200348,
"beta_dpo/gap_mean": 2.308027744293213,
"beta_dpo/gap_std": 5.2502970695495605,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.22978080120937264,
"grad_norm": 22.590726852416992,
"learning_rate": 4.757316345716553e-07,
"logits/chosen": 1.6765403747558594,
"logits/rejected": 1.669187068939209,
"loss": 1.1676,
"step": 152
},
{
"beta_dpo/beta_used": 0.10762692987918854,
"beta_dpo/beta_used_raw": 0.10762692987918854,
"beta_dpo/gap_mean": 2.3354625701904297,
"beta_dpo/gap_std": 5.303244590759277,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.23129251700680273,
"grad_norm": 19.168458938598633,
"learning_rate": 4.751601970666064e-07,
"logits/chosen": 0.8450142741203308,
"logits/rejected": 0.7212068438529968,
"loss": 1.1869,
"step": 153
},
{
"beta_dpo/beta_used": 0.12070707976818085,
"beta_dpo/beta_used_raw": 0.12070707976818085,
"beta_dpo/gap_mean": 2.360574960708618,
"beta_dpo/gap_std": 5.460031986236572,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.2328042328042328,
"grad_norm": 21.795913696289062,
"learning_rate": 4.745824613468292e-07,
"logits/chosen": 1.0570695400238037,
"logits/rejected": 1.2983663082122803,
"loss": 1.1894,
"step": 154
},
{
"beta_dpo/beta_used": 0.1528300940990448,
"beta_dpo/beta_used_raw": 0.1528300940990448,
"beta_dpo/gap_mean": 2.3790721893310547,
"beta_dpo/gap_std": 5.66038703918457,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.23431594860166288,
"grad_norm": 39.34039306640625,
"learning_rate": 4.7399844357283393e-07,
"logits/chosen": 1.4706714153289795,
"logits/rejected": 1.307586431503296,
"loss": 1.1006,
"step": 155
},
{
"beta_dpo/beta_used": 0.14839857816696167,
"beta_dpo/beta_used_raw": 0.14839857816696167,
"beta_dpo/gap_mean": 2.5533552169799805,
"beta_dpo/gap_std": 5.758601188659668,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.23582766439909297,
"grad_norm": 29.02472686767578,
"learning_rate": 4.7340816008085305e-07,
"logits/chosen": 1.2706935405731201,
"logits/rejected": 1.512930154800415,
"loss": 1.0439,
"step": 156
},
{
"beta_dpo/beta_used": 0.09339036047458649,
"beta_dpo/beta_used_raw": 0.09339036047458649,
"beta_dpo/gap_mean": 2.6657767295837402,
"beta_dpo/gap_std": 5.844965934753418,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.23733938019652306,
"grad_norm": 16.238609313964844,
"learning_rate": 4.728116273823847e-07,
"logits/chosen": 1.0520925521850586,
"logits/rejected": 1.0182958841323853,
"loss": 1.1508,
"step": 157
},
{
"beta_dpo/beta_used": 0.168321892619133,
"beta_dpo/beta_used_raw": 0.168321892619133,
"beta_dpo/gap_mean": 2.6134791374206543,
"beta_dpo/gap_std": 6.089890480041504,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.23885109599395313,
"grad_norm": 28.28655242919922,
"learning_rate": 4.7220886216373085e-07,
"logits/chosen": 1.3058767318725586,
"logits/rejected": 1.2950568199157715,
"loss": 1.0635,
"step": 158
},
{
"beta_dpo/beta_used": 0.08723060041666031,
"beta_dpo/beta_used_raw": 0.08723060041666031,
"beta_dpo/gap_mean": 2.738887310028076,
"beta_dpo/gap_std": 6.282135963439941,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.24036281179138322,
"grad_norm": 18.19098472595215,
"learning_rate": 4.715998812855304e-07,
"logits/chosen": 1.397586464881897,
"logits/rejected": 1.3978208303451538,
"loss": 1.2233,
"step": 159
},
{
"beta_dpo/beta_used": 0.09871069341897964,
"beta_dpo/beta_used_raw": 0.09871069341897964,
"beta_dpo/gap_mean": 2.721683979034424,
"beta_dpo/gap_std": 6.240549087524414,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.2418745275888133,
"grad_norm": 23.070819854736328,
"learning_rate": 4.7098470178228755e-07,
"logits/chosen": 1.0362298488616943,
"logits/rejected": 0.9539611339569092,
"loss": 1.1959,
"step": 160
},
{
"beta_dpo/beta_used": 0.13182277977466583,
"beta_dpo/beta_used_raw": 0.13182277977466583,
"beta_dpo/gap_mean": 2.772425413131714,
"beta_dpo/gap_std": 6.313028335571289,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.24338624338624337,
"grad_norm": 25.8764591217041,
"learning_rate": 4.703633408618955e-07,
"logits/chosen": 1.6497914791107178,
"logits/rejected": 1.4926035404205322,
"loss": 1.0769,
"step": 161
},
{
"beta_dpo/beta_used": 0.15703752636909485,
"beta_dpo/beta_used_raw": 0.15703752636909485,
"beta_dpo/gap_mean": 3.0325098037719727,
"beta_dpo/gap_std": 6.246161460876465,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.24489795918367346,
"grad_norm": 25.955524444580078,
"learning_rate": 4.697358159051549e-07,
"logits/chosen": 1.6864545345306396,
"logits/rejected": 1.713794231414795,
"loss": 0.9487,
"step": 162
},
{
"beta_dpo/beta_used": 0.034870997071266174,
"beta_dpo/beta_used_raw": 0.02417636662721634,
"beta_dpo/gap_mean": 3.10768723487854,
"beta_dpo/gap_std": 6.307939529418945,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.24640967498110355,
"grad_norm": 10.59152603149414,
"learning_rate": 4.691021444652876e-07,
"logits/chosen": 1.5043668746948242,
"logits/rejected": 1.0593593120574951,
"loss": 1.3254,
"step": 163
},
{
"beta_dpo/beta_used": 0.1477426290512085,
"beta_dpo/beta_used_raw": 0.1477426290512085,
"beta_dpo/gap_mean": 3.3669991493225098,
"beta_dpo/gap_std": 6.525307655334473,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.24792139077853365,
"grad_norm": 33.83066177368164,
"learning_rate": 4.6846234426744624e-07,
"logits/chosen": 1.4280340671539307,
"logits/rejected": 1.1328227519989014,
"loss": 1.0885,
"step": 164
},
{
"beta_dpo/beta_used": 0.12049752473831177,
"beta_dpo/beta_used_raw": 0.12049752473831177,
"beta_dpo/gap_mean": 3.529249429702759,
"beta_dpo/gap_std": 6.616786956787109,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.2494331065759637,
"grad_norm": 20.40849494934082,
"learning_rate": 4.678164332082175e-07,
"logits/chosen": 1.7118687629699707,
"logits/rejected": 1.7932038307189941,
"loss": 1.0097,
"step": 165
},
{
"beta_dpo/beta_used": 0.0652805045247078,
"beta_dpo/beta_used_raw": 0.0652805045247078,
"beta_dpo/gap_mean": 3.6699838638305664,
"beta_dpo/gap_std": 6.657036781311035,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.2509448223733938,
"grad_norm": 16.213890075683594,
"learning_rate": 4.6716442935512214e-07,
"logits/chosen": 1.7215876579284668,
"logits/rejected": 1.584639310836792,
"loss": 1.2227,
"step": 166
},
{
"beta_dpo/beta_used": 0.0965694785118103,
"beta_dpo/beta_used_raw": 0.0965694785118103,
"beta_dpo/gap_mean": 3.741748809814453,
"beta_dpo/gap_std": 6.662418365478516,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.25245653817082386,
"grad_norm": 17.435684204101562,
"learning_rate": 4.6650635094610966e-07,
"logits/chosen": 1.4423127174377441,
"logits/rejected": 1.248117446899414,
"loss": 1.1171,
"step": 167
},
{
"beta_dpo/beta_used": 0.08136512339115143,
"beta_dpo/beta_used_raw": 0.06723477691411972,
"beta_dpo/gap_mean": 3.637478828430176,
"beta_dpo/gap_std": 6.633077621459961,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.25396825396825395,
"grad_norm": 16.939830780029297,
"learning_rate": 4.6584221638904767e-07,
"logits/chosen": 1.5010664463043213,
"logits/rejected": 1.5989562273025513,
"loss": 1.1493,
"step": 168
},
{
"beta_dpo/beta_used": 0.10735826194286346,
"beta_dpo/beta_used_raw": 0.10735826194286346,
"beta_dpo/gap_mean": 3.6373238563537598,
"beta_dpo/gap_std": 6.84861421585083,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.25547996976568405,
"grad_norm": 22.096202850341797,
"learning_rate": 4.651720442612075e-07,
"logits/chosen": 1.2934666872024536,
"logits/rejected": 1.2724759578704834,
"loss": 1.1346,
"step": 169
},
{
"beta_dpo/beta_used": 0.09798265993595123,
"beta_dpo/beta_used_raw": 0.09798265993595123,
"beta_dpo/gap_mean": 3.4399917125701904,
"beta_dpo/gap_std": 7.241048812866211,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.25699168556311414,
"grad_norm": 20.32015037536621,
"learning_rate": 4.6449585330874425e-07,
"logits/chosen": 1.449697494506836,
"logits/rejected": 1.5872085094451904,
"loss": 1.182,
"step": 170
},
{
"beta_dpo/beta_used": 0.09298588335514069,
"beta_dpo/beta_used_raw": 0.09298588335514069,
"beta_dpo/gap_mean": 3.642791271209717,
"beta_dpo/gap_std": 7.622129440307617,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.2585034013605442,
"grad_norm": 21.328685760498047,
"learning_rate": 4.6381366244617224e-07,
"logits/chosen": 2.529806613922119,
"logits/rejected": 2.442068099975586,
"loss": 1.1731,
"step": 171
},
{
"beta_dpo/beta_used": 0.053223028779029846,
"beta_dpo/beta_used_raw": 0.053223028779029846,
"beta_dpo/gap_mean": 3.6746082305908203,
"beta_dpo/gap_std": 7.658779144287109,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.2600151171579743,
"grad_norm": 11.974435806274414,
"learning_rate": 4.631254907558365e-07,
"logits/chosen": 2.2132601737976074,
"logits/rejected": 2.062042474746704,
"loss": 1.2614,
"step": 172
},
{
"beta_dpo/beta_used": 0.07246831804513931,
"beta_dpo/beta_used_raw": 0.056058838963508606,
"beta_dpo/gap_mean": 3.904388904571533,
"beta_dpo/gap_std": 7.858163833618164,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.2615268329554044,
"grad_norm": 16.666893005371094,
"learning_rate": 4.624313574873786e-07,
"logits/chosen": 1.2529126405715942,
"logits/rejected": 1.277488112449646,
"loss": 1.2057,
"step": 173
},
{
"beta_dpo/beta_used": 0.14423680305480957,
"beta_dpo/beta_used_raw": 0.14423680305480957,
"beta_dpo/gap_mean": 4.094144344329834,
"beta_dpo/gap_std": 7.942702293395996,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.26303854875283444,
"grad_norm": 29.772438049316406,
"learning_rate": 4.61731282057198e-07,
"logits/chosen": 1.4726738929748535,
"logits/rejected": 1.160088062286377,
"loss": 0.9648,
"step": 174
},
{
"beta_dpo/beta_used": 0.1600879430770874,
"beta_dpo/beta_used_raw": 0.1600879430770874,
"beta_dpo/gap_mean": 4.209630012512207,
"beta_dpo/gap_std": 8.244287490844727,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.26455026455026454,
"grad_norm": 27.950191497802734,
"learning_rate": 4.6102528404790965e-07,
"logits/chosen": 2.0793564319610596,
"logits/rejected": 1.8890061378479004,
"loss": 1.0063,
"step": 175
},
{
"beta_dpo/beta_used": 0.05491591989994049,
"beta_dpo/beta_used_raw": -0.007800232619047165,
"beta_dpo/gap_mean": 4.055308818817139,
"beta_dpo/gap_std": 8.47665023803711,
"beta_dpo/mask_keep_frac": 0.5625,
"epoch": 0.2660619803476946,
"grad_norm": 16.41081428527832,
"learning_rate": 4.603133832077953e-07,
"logits/chosen": 1.8980469703674316,
"logits/rejected": 1.8360246419906616,
"loss": 1.2205,
"step": 176
},
{
"beta_dpo/beta_used": 0.18383970856666565,
"beta_dpo/beta_used_raw": 0.18383970856666565,
"beta_dpo/gap_mean": 4.529065132141113,
"beta_dpo/gap_std": 8.682202339172363,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.2675736961451247,
"grad_norm": 38.8032112121582,
"learning_rate": 4.5959559945025183e-07,
"logits/chosen": 1.898195743560791,
"logits/rejected": 1.8038549423217773,
"loss": 0.9231,
"step": 177
},
{
"beta_dpo/beta_used": 0.19010929763317108,
"beta_dpo/beta_used_raw": 0.19010929763317108,
"beta_dpo/gap_mean": 4.975588798522949,
"beta_dpo/gap_std": 8.499929428100586,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.2690854119425548,
"grad_norm": 33.64447021484375,
"learning_rate": 4.588719528532341e-07,
"logits/chosen": 1.8813047409057617,
"logits/rejected": 1.6573269367218018,
"loss": 0.7533,
"step": 178
},
{
"beta_dpo/beta_used": 0.09621996432542801,
"beta_dpo/beta_used_raw": 0.09621996432542801,
"beta_dpo/gap_mean": 4.808865070343018,
"beta_dpo/gap_std": 8.72989273071289,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.2705971277399849,
"grad_norm": 27.32986831665039,
"learning_rate": 4.581424636586928e-07,
"logits/chosen": 1.7015210390090942,
"logits/rejected": 1.7635328769683838,
"loss": 1.068,
"step": 179
},
{
"beta_dpo/beta_used": 0.025560760870575905,
"beta_dpo/beta_used_raw": 0.006768429651856422,
"beta_dpo/gap_mean": 4.6550421714782715,
"beta_dpo/gap_std": 8.802743911743164,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.272108843537415,
"grad_norm": 6.477420806884766,
"learning_rate": 4.5740715227200897e-07,
"logits/chosen": 1.3054771423339844,
"logits/rejected": 0.9867875576019287,
"loss": 1.3093,
"step": 180
},
{
"beta_dpo/beta_used": 0.08936936408281326,
"beta_dpo/beta_used_raw": 0.08936936408281326,
"beta_dpo/gap_mean": 4.8378005027771,
"beta_dpo/gap_std": 8.81364631652832,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.273620559334845,
"grad_norm": 22.288198471069336,
"learning_rate": 4.566660392614228e-07,
"logits/chosen": 1.4038888216018677,
"logits/rejected": 1.3131110668182373,
"loss": 1.1065,
"step": 181
},
{
"beta_dpo/beta_used": 0.1858215630054474,
"beta_dpo/beta_used_raw": 0.1858215630054474,
"beta_dpo/gap_mean": 5.14285945892334,
"beta_dpo/gap_std": 8.877325057983398,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.2751322751322751,
"grad_norm": 33.52143859863281,
"learning_rate": 4.5591914535745817e-07,
"logits/chosen": 1.562524676322937,
"logits/rejected": 1.2491695880889893,
"loss": 0.8402,
"step": 182
},
{
"beta_dpo/beta_used": 0.04903354123234749,
"beta_dpo/beta_used_raw": 0.020485244691371918,
"beta_dpo/gap_mean": 4.961426734924316,
"beta_dpo/gap_std": 9.019545555114746,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.2766439909297052,
"grad_norm": 12.803986549377441,
"learning_rate": 4.551664914523433e-07,
"logits/chosen": 1.5661842823028564,
"logits/rejected": 1.6295418739318848,
"loss": 1.2587,
"step": 183
},
{
"beta_dpo/beta_used": 0.03347941115498543,
"beta_dpo/beta_used_raw": 0.03347941115498543,
"beta_dpo/gap_mean": 4.973166465759277,
"beta_dpo/gap_std": 8.975120544433594,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.2781557067271353,
"grad_norm": 6.735782623291016,
"learning_rate": 4.544080985994258e-07,
"logits/chosen": 1.7749900817871094,
"logits/rejected": 1.7194840908050537,
"loss": 1.278,
"step": 184
},
{
"beta_dpo/beta_used": 0.1335582137107849,
"beta_dpo/beta_used_raw": 0.09271209686994553,
"beta_dpo/gap_mean": 5.115813255310059,
"beta_dpo/gap_std": 9.346285820007324,
"beta_dpo/mask_keep_frac": 0.5625,
"epoch": 0.2796674225245654,
"grad_norm": 23.257200241088867,
"learning_rate": 4.5364398801258394e-07,
"logits/chosen": 1.6902034282684326,
"logits/rejected": 1.6307165622711182,
"loss": 1.0535,
"step": 185
},
{
"beta_dpo/beta_used": 0.08969143778085709,
"beta_dpo/beta_used_raw": 0.08969143778085709,
"beta_dpo/gap_mean": 5.149080753326416,
"beta_dpo/gap_std": 9.851451873779297,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.2811791383219955,
"grad_norm": 18.274940490722656,
"learning_rate": 4.5287418106563354e-07,
"logits/chosen": 1.2696905136108398,
"logits/rejected": 1.034300446510315,
"loss": 1.1545,
"step": 186
},
{
"beta_dpo/beta_used": 0.19267341494560242,
"beta_dpo/beta_used_raw": 0.19267341494560242,
"beta_dpo/gap_mean": 5.213037490844727,
"beta_dpo/gap_std": 10.063613891601562,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.28269085411942557,
"grad_norm": 36.83070755004883,
"learning_rate": 4.520986992917297e-07,
"logits/chosen": 1.6270906925201416,
"logits/rejected": 1.3702093362808228,
"loss": 0.868,
"step": 187
},
{
"beta_dpo/beta_used": 0.07237689942121506,
"beta_dpo/beta_used_raw": 0.011760570108890533,
"beta_dpo/gap_mean": 5.006385326385498,
"beta_dpo/gap_std": 9.972711563110352,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.2842025699168556,
"grad_norm": 18.037521362304688,
"learning_rate": 4.5131756438276466e-07,
"logits/chosen": 1.9290728569030762,
"logits/rejected": 1.627413034439087,
"loss": 1.2184,
"step": 188
},
{
"beta_dpo/beta_used": 0.13262778520584106,
"beta_dpo/beta_used_raw": 0.08808554708957672,
"beta_dpo/gap_mean": 5.106810092926025,
"beta_dpo/gap_std": 9.941263198852539,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.2857142857142857,
"grad_norm": 49.00212478637695,
"learning_rate": 4.5053079818876096e-07,
"logits/chosen": 1.581903338432312,
"logits/rejected": 1.6713547706604004,
"loss": 1.1338,
"step": 189
},
{
"beta_dpo/beta_used": 0.2871774435043335,
"beta_dpo/beta_used_raw": 0.2871774435043335,
"beta_dpo/gap_mean": 5.393362522125244,
"beta_dpo/gap_std": 9.98210620880127,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.2872260015117158,
"grad_norm": 62.98942947387695,
"learning_rate": 4.4973842271726024e-07,
"logits/chosen": 1.2159141302108765,
"logits/rejected": 0.8774590492248535,
"loss": 0.6566,
"step": 190
},
{
"beta_dpo/beta_used": 0.07902415096759796,
"beta_dpo/beta_used_raw": 0.07902415096759796,
"beta_dpo/gap_mean": 5.325117588043213,
"beta_dpo/gap_std": 10.09085750579834,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.2887377173091459,
"grad_norm": 17.34073257446289,
"learning_rate": 4.48940460132708e-07,
"logits/chosen": 1.7142860889434814,
"logits/rejected": 1.5624032020568848,
"loss": 1.1187,
"step": 191
},
{
"beta_dpo/beta_used": 0.022713923826813698,
"beta_dpo/beta_used_raw": 0.022713923826813698,
"beta_dpo/gap_mean": 4.850440502166748,
"beta_dpo/gap_std": 9.887323379516602,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.29024943310657597,
"grad_norm": 4.582393169403076,
"learning_rate": 4.481369327558329e-07,
"logits/chosen": 1.753859281539917,
"logits/rejected": 1.7524299621582031,
"loss": 1.306,
"step": 192
},
{
"beta_dpo/beta_used": 0.03702099993824959,
"beta_dpo/beta_used_raw": 0.03702099993824959,
"beta_dpo/gap_mean": 4.930882453918457,
"beta_dpo/gap_std": 9.861164093017578,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.29176114890400606,
"grad_norm": 10.907903671264648,
"learning_rate": 4.47327863063023e-07,
"logits/chosen": 1.5122041702270508,
"logits/rejected": 1.3613293170928955,
"loss": 1.2766,
"step": 193
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.030461229383945465,
"beta_dpo/gap_mean": 4.714853286743164,
"beta_dpo/gap_std": 10.037712097167969,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.29327286470143615,
"grad_norm": 0.24061015248298645,
"learning_rate": 4.4651327368569684e-07,
"logits/chosen": 1.5545375347137451,
"logits/rejected": 1.6280193328857422,
"loss": 1.3836,
"step": 194
},
{
"beta_dpo/beta_used": 0.06694042682647705,
"beta_dpo/beta_used_raw": 0.06694042682647705,
"beta_dpo/gap_mean": 4.624754905700684,
"beta_dpo/gap_std": 9.953048706054688,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.2947845804988662,
"grad_norm": 16.01689338684082,
"learning_rate": 4.4569318740967043e-07,
"logits/chosen": 0.9506034851074219,
"logits/rejected": 1.1181230545043945,
"loss": 1.1767,
"step": 195
},
{
"beta_dpo/beta_used": 0.1468542218208313,
"beta_dpo/beta_used_raw": 0.1468542218208313,
"beta_dpo/gap_mean": 4.510015487670898,
"beta_dpo/gap_std": 9.975471496582031,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.2962962962962963,
"grad_norm": 29.909286499023438,
"learning_rate": 4.448676271745197e-07,
"logits/chosen": 1.5411239862442017,
"logits/rejected": 1.5550258159637451,
"loss": 1.0114,
"step": 196
},
{
"beta_dpo/beta_used": 0.1864607036113739,
"beta_dpo/beta_used_raw": 0.1864607036113739,
"beta_dpo/gap_mean": 4.715234756469727,
"beta_dpo/gap_std": 10.445943832397461,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.29780801209372637,
"grad_norm": 40.97339630126953,
"learning_rate": 4.440366160729392e-07,
"logits/chosen": 2.425356149673462,
"logits/rejected": 1.9260857105255127,
"loss": 1.0392,
"step": 197
},
{
"beta_dpo/beta_used": 0.1677129566669464,
"beta_dpo/beta_used_raw": 0.1677129566669464,
"beta_dpo/gap_mean": 4.945888996124268,
"beta_dpo/gap_std": 10.412927627563477,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.29931972789115646,
"grad_norm": 28.744064331054688,
"learning_rate": 4.432001773500957e-07,
"logits/chosen": 1.90618896484375,
"logits/rejected": 1.743265151977539,
"loss": 0.9825,
"step": 198
},
{
"beta_dpo/beta_used": 0.14470118284225464,
"beta_dpo/beta_used_raw": 0.14254896342754364,
"beta_dpo/gap_mean": 4.955746650695801,
"beta_dpo/gap_std": 10.591995239257812,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.30083144368858655,
"grad_norm": 35.50462341308594,
"learning_rate": 4.4235833440297856e-07,
"logits/chosen": 1.899355411529541,
"logits/rejected": 1.5145988464355469,
"loss": 0.9911,
"step": 199
},
{
"beta_dpo/beta_used": 0.19559510052204132,
"beta_dpo/beta_used_raw": 0.19559510052204132,
"beta_dpo/gap_mean": 5.2142181396484375,
"beta_dpo/gap_std": 10.851507186889648,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.30234315948601664,
"grad_norm": 38.11675262451172,
"learning_rate": 4.415111107797445e-07,
"logits/chosen": 1.70686674118042,
"logits/rejected": 1.0834190845489502,
"loss": 1.0452,
"step": 200
},
{
"epoch": 0.30234315948601664,
"eval_beta_dpo/beta_used": 0.1302367001771927,
"eval_beta_dpo/beta_used_raw": 0.12496456503868103,
"eval_beta_dpo/gap_mean": 5.1820478439331055,
"eval_beta_dpo/gap_std": 10.96353816986084,
"eval_beta_dpo/mask_keep_frac": 1.0,
"eval_logits/chosen": 1.6393016576766968,
"eval_logits/rejected": 1.5120911598205566,
"eval_loss": 0.6041610240936279,
"eval_runtime": 42.6034,
"eval_samples_per_second": 54.057,
"eval_steps_per_second": 1.69,
"step": 200
},
{
"beta_dpo/beta_used": 0.05950671434402466,
"beta_dpo/beta_used_raw": 0.05950671434402466,
"beta_dpo/gap_mean": 5.228536605834961,
"beta_dpo/gap_std": 10.815942764282227,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.30385487528344673,
"grad_norm": 18.87432861328125,
"learning_rate": 4.4065853017905953e-07,
"logits/chosen": 2.147465229034424,
"logits/rejected": 2.1975698471069336,
"loss": 1.1811,
"step": 201
},
{
"beta_dpo/beta_used": 0.1301541030406952,
"beta_dpo/beta_used_raw": 0.1301541030406952,
"beta_dpo/gap_mean": 5.4303483963012695,
"beta_dpo/gap_std": 10.73287582397461,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.30536659108087677,
"grad_norm": 34.74480056762695,
"learning_rate": 4.3980061644943575e-07,
"logits/chosen": 1.184483289718628,
"logits/rejected": 0.7701964378356934,
"loss": 1.0818,
"step": 202
},
{
"beta_dpo/beta_used": 0.10333988070487976,
"beta_dpo/beta_used_raw": 0.10333988070487976,
"beta_dpo/gap_mean": 5.64778995513916,
"beta_dpo/gap_std": 10.710124969482422,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.30687830687830686,
"grad_norm": 24.69707679748535,
"learning_rate": 4.3893739358856455e-07,
"logits/chosen": 2.1665844917297363,
"logits/rejected": 1.642435073852539,
"loss": 1.0076,
"step": 203
},
{
"beta_dpo/beta_used": 0.05128917843103409,
"beta_dpo/beta_used_raw": 0.05128917843103409,
"beta_dpo/gap_mean": 5.986999988555908,
"beta_dpo/gap_std": 10.805131912231445,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.30839002267573695,
"grad_norm": 18.80857276916504,
"learning_rate": 4.380688857426449e-07,
"logits/chosen": 1.4700884819030762,
"logits/rejected": 1.0781567096710205,
"loss": 1.2208,
"step": 204
},
{
"beta_dpo/beta_used": 0.10645169019699097,
"beta_dpo/beta_used_raw": 0.09335462003946304,
"beta_dpo/gap_mean": 5.741988182067871,
"beta_dpo/gap_std": 10.960041046142578,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.30990173847316704,
"grad_norm": 24.979455947875977,
"learning_rate": 4.3719511720570814e-07,
"logits/chosen": 2.1651041507720947,
"logits/rejected": 1.954960823059082,
"loss": 1.2116,
"step": 205
},
{
"beta_dpo/beta_used": 0.019261833280324936,
"beta_dpo/beta_used_raw": -0.05605250597000122,
"beta_dpo/gap_mean": 5.41782808303833,
"beta_dpo/gap_std": 11.143302917480469,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.31141345427059713,
"grad_norm": 5.909642219543457,
"learning_rate": 4.363161124189387e-07,
"logits/chosen": 2.501114845275879,
"logits/rejected": 2.2312614917755127,
"loss": 1.3189,
"step": 206
},
{
"beta_dpo/beta_used": 0.04499204084277153,
"beta_dpo/beta_used_raw": 0.04499204084277153,
"beta_dpo/gap_mean": 5.597379207611084,
"beta_dpo/gap_std": 11.20595932006836,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.3129251700680272,
"grad_norm": 16.016199111938477,
"learning_rate": 4.3543189596998986e-07,
"logits/chosen": 1.4587275981903076,
"logits/rejected": 1.1297156810760498,
"loss": 1.2432,
"step": 207
},
{
"beta_dpo/beta_used": 0.15474805235862732,
"beta_dpo/beta_used_raw": 0.15474805235862732,
"beta_dpo/gap_mean": 5.2720513343811035,
"beta_dpo/gap_std": 11.097529411315918,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.3144368858654573,
"grad_norm": 35.47798538208008,
"learning_rate": 4.3454249259229664e-07,
"logits/chosen": 1.320824384689331,
"logits/rejected": 1.2546792030334473,
"loss": 1.0041,
"step": 208
},
{
"beta_dpo/beta_used": 0.24433788657188416,
"beta_dpo/beta_used_raw": 0.24433788657188416,
"beta_dpo/gap_mean": 5.672760486602783,
"beta_dpo/gap_std": 11.376781463623047,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.31594860166288735,
"grad_norm": 56.8420524597168,
"learning_rate": 4.336479271643833e-07,
"logits/chosen": 1.5092371702194214,
"logits/rejected": 1.3591229915618896,
"loss": 1.0248,
"step": 209
},
{
"beta_dpo/beta_used": 0.18511611223220825,
"beta_dpo/beta_used_raw": 0.18511611223220825,
"beta_dpo/gap_mean": 5.985712051391602,
"beta_dpo/gap_std": 11.49533462524414,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.31746031746031744,
"grad_norm": 31.574161529541016,
"learning_rate": 4.327482247091679e-07,
"logits/chosen": 1.6522598266601562,
"logits/rejected": 1.1164844036102295,
"loss": 0.9775,
"step": 210
},
{
"beta_dpo/beta_used": 0.015916820615530014,
"beta_dpo/beta_used_raw": 0.015916820615530014,
"beta_dpo/gap_mean": 6.245479106903076,
"beta_dpo/gap_std": 11.601383209228516,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.31897203325774753,
"grad_norm": 5.433824062347412,
"learning_rate": 4.3184341039326217e-07,
"logits/chosen": 2.0384957790374756,
"logits/rejected": 1.6139662265777588,
"loss": 1.3211,
"step": 211
},
{
"beta_dpo/beta_used": 0.27274635434150696,
"beta_dpo/beta_used_raw": 0.27274635434150696,
"beta_dpo/gap_mean": 6.564366340637207,
"beta_dpo/gap_std": 11.817914962768555,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.3204837490551776,
"grad_norm": 52.9380989074707,
"learning_rate": 4.309335095262675e-07,
"logits/chosen": 1.7562899589538574,
"logits/rejected": 1.825326919555664,
"loss": 0.7789,
"step": 212
},
{
"beta_dpo/beta_used": 0.012187526561319828,
"beta_dpo/beta_used_raw": 0.0010126382112503052,
"beta_dpo/gap_mean": 6.627925872802734,
"beta_dpo/gap_std": 12.03477954864502,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.3219954648526077,
"grad_norm": 4.3769941329956055,
"learning_rate": 4.3001854756006724e-07,
"logits/chosen": 1.3175151348114014,
"logits/rejected": 1.5719773769378662,
"loss": 1.3265,
"step": 213
},
{
"beta_dpo/beta_used": 0.03768792375922203,
"beta_dpo/beta_used_raw": 0.01986430399119854,
"beta_dpo/gap_mean": 6.281346321105957,
"beta_dpo/gap_std": 11.880584716796875,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.3235071806500378,
"grad_norm": 11.269039154052734,
"learning_rate": 4.290985500881143e-07,
"logits/chosen": 1.5611655712127686,
"logits/rejected": 1.757429838180542,
"loss": 1.2343,
"step": 214
},
{
"beta_dpo/beta_used": 0.17131496965885162,
"beta_dpo/beta_used_raw": 0.17131496965885162,
"beta_dpo/gap_mean": 6.383757591247559,
"beta_dpo/gap_std": 11.635639190673828,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.3250188964474679,
"grad_norm": 48.6794548034668,
"learning_rate": 4.281735428447157e-07,
"logits/chosen": 1.0305719375610352,
"logits/rejected": 0.8588269352912903,
"loss": 1.1715,
"step": 215
},
{
"beta_dpo/beta_used": 0.09299275279045105,
"beta_dpo/beta_used_raw": 0.09046853333711624,
"beta_dpo/gap_mean": 6.680278301239014,
"beta_dpo/gap_std": 11.714441299438477,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.32653061224489793,
"grad_norm": 21.09217071533203,
"learning_rate": 4.2724355170431247e-07,
"logits/chosen": 2.2120964527130127,
"logits/rejected": 2.084207534790039,
"loss": 1.1185,
"step": 216
},
{
"beta_dpo/beta_used": 0.04985278844833374,
"beta_dpo/beta_used_raw": 0.04985278844833374,
"beta_dpo/gap_mean": 6.69057559967041,
"beta_dpo/gap_std": 11.938087463378906,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.328042328042328,
"grad_norm": 11.565571784973145,
"learning_rate": 4.26308602680756e-07,
"logits/chosen": 2.1804494857788086,
"logits/rejected": 1.817223072052002,
"loss": 1.1928,
"step": 217
},
{
"beta_dpo/beta_used": 0.12828893959522247,
"beta_dpo/beta_used_raw": 0.11110377311706543,
"beta_dpo/gap_mean": 6.170825958251953,
"beta_dpo/gap_std": 11.909095764160156,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.3295540438397581,
"grad_norm": 34.02565383911133,
"learning_rate": 4.253687219265803e-07,
"logits/chosen": 1.4786970615386963,
"logits/rejected": 1.3133083581924438,
"loss": 1.0952,
"step": 218
},
{
"beta_dpo/beta_used": 0.0129962507635355,
"beta_dpo/beta_used_raw": 0.004371422342956066,
"beta_dpo/gap_mean": 6.15762996673584,
"beta_dpo/gap_std": 11.741506576538086,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.3310657596371882,
"grad_norm": 5.00832986831665,
"learning_rate": 4.2442393573227043e-07,
"logits/chosen": 1.4401828050613403,
"logits/rejected": 1.342416763305664,
"loss": 1.3225,
"step": 219
},
{
"beta_dpo/beta_used": 0.028154663741588593,
"beta_dpo/beta_used_raw": 0.028154663741588593,
"beta_dpo/gap_mean": 6.027561187744141,
"beta_dpo/gap_std": 11.516753196716309,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.3325774754346183,
"grad_norm": 7.318558692932129,
"learning_rate": 4.234742705255272e-07,
"logits/chosen": 1.6872892379760742,
"logits/rejected": 1.3944776058197021,
"loss": 1.2728,
"step": 220
},
{
"beta_dpo/beta_used": 0.09166809916496277,
"beta_dpo/beta_used_raw": 0.09166809916496277,
"beta_dpo/gap_mean": 6.0580058097839355,
"beta_dpo/gap_std": 11.597650527954102,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.3340891912320484,
"grad_norm": 19.652095794677734,
"learning_rate": 4.22519752870528e-07,
"logits/chosen": 1.3477516174316406,
"logits/rejected": 1.0663343667984009,
"loss": 1.089,
"step": 221
},
{
"beta_dpo/beta_used": 0.16691642999649048,
"beta_dpo/beta_used_raw": 0.16691642999649048,
"beta_dpo/gap_mean": 6.380154609680176,
"beta_dpo/gap_std": 11.571673393249512,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.3356009070294785,
"grad_norm": 42.29206085205078,
"learning_rate": 4.2156040946718343e-07,
"logits/chosen": 2.014256477355957,
"logits/rejected": 1.907914161682129,
"loss": 1.0351,
"step": 222
},
{
"beta_dpo/beta_used": 0.06735613942146301,
"beta_dpo/beta_used_raw": 0.06735613942146301,
"beta_dpo/gap_mean": 6.677520751953125,
"beta_dpo/gap_std": 11.566620826721191,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.3371126228269085,
"grad_norm": 16.786890029907227,
"learning_rate": 4.2059626715039065e-07,
"logits/chosen": 1.4429056644439697,
"logits/rejected": 1.3182603120803833,
"loss": 1.1409,
"step": 223
},
{
"beta_dpo/beta_used": 0.06260553002357483,
"beta_dpo/beta_used_raw": 0.06260553002357483,
"beta_dpo/gap_mean": 6.619193077087402,
"beta_dpo/gap_std": 11.379542350769043,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.3386243386243386,
"grad_norm": 15.259867668151855,
"learning_rate": 4.1962735288928304e-07,
"logits/chosen": 2.385403633117676,
"logits/rejected": 2.2249648571014404,
"loss": 1.113,
"step": 224
},
{
"beta_dpo/beta_used": 0.04934084415435791,
"beta_dpo/beta_used_raw": 0.038659606128931046,
"beta_dpo/gap_mean": 6.746703147888184,
"beta_dpo/gap_std": 11.56619644165039,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.3401360544217687,
"grad_norm": 13.229011535644531,
"learning_rate": 4.186536937864752e-07,
"logits/chosen": 1.5810472965240479,
"logits/rejected": 1.0562224388122559,
"loss": 1.1953,
"step": 225
},
{
"beta_dpo/beta_used": 0.07097682356834412,
"beta_dpo/beta_used_raw": 0.07097682356834412,
"beta_dpo/gap_mean": 6.604011535644531,
"beta_dpo/gap_std": 11.779237747192383,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.3416477702191988,
"grad_norm": 16.87116241455078,
"learning_rate": 4.176753170773052e-07,
"logits/chosen": 1.5594934225082397,
"logits/rejected": 1.3470158576965332,
"loss": 1.1094,
"step": 226
},
{
"beta_dpo/beta_used": 0.16113229095935822,
"beta_dpo/beta_used_raw": 0.16113229095935822,
"beta_dpo/gap_mean": 6.743369102478027,
"beta_dpo/gap_std": 12.084405899047852,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.3431594860166289,
"grad_norm": 35.86256790161133,
"learning_rate": 4.166922501290729e-07,
"logits/chosen": 1.400483250617981,
"logits/rejected": 1.3121165037155151,
"loss": 0.9626,
"step": 227
},
{
"beta_dpo/beta_used": 0.05211072787642479,
"beta_dpo/beta_used_raw": 0.018288929015398026,
"beta_dpo/gap_mean": 6.6497087478637695,
"beta_dpo/gap_std": 12.260591506958008,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.34467120181405897,
"grad_norm": 16.787290573120117,
"learning_rate": 4.1570452044027405e-07,
"logits/chosen": 1.9922467470169067,
"logits/rejected": 1.8599324226379395,
"loss": 1.2106,
"step": 228
},
{
"beta_dpo/beta_used": 0.16892960667610168,
"beta_dpo/beta_used_raw": 0.16892960667610168,
"beta_dpo/gap_mean": 6.767346382141113,
"beta_dpo/gap_std": 12.224922180175781,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.34618291761148906,
"grad_norm": 29.138980865478516,
"learning_rate": 4.147121556398312e-07,
"logits/chosen": 1.9473903179168701,
"logits/rejected": 1.702131748199463,
"loss": 0.8738,
"step": 229
},
{
"beta_dpo/beta_used": 0.11095847934484482,
"beta_dpo/beta_used_raw": 0.11095847934484482,
"beta_dpo/gap_mean": 6.441825866699219,
"beta_dpo/gap_std": 12.481451988220215,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.3476946334089191,
"grad_norm": 26.85580062866211,
"learning_rate": 4.137151834863213e-07,
"logits/chosen": 1.0509746074676514,
"logits/rejected": 1.3630282878875732,
"loss": 1.0661,
"step": 230
},
{
"beta_dpo/beta_used": 0.197315976023674,
"beta_dpo/beta_used_raw": 0.197315976023674,
"beta_dpo/gap_mean": 6.801876068115234,
"beta_dpo/gap_std": 12.54133129119873,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.3492063492063492,
"grad_norm": 51.88249206542969,
"learning_rate": 4.1271363186719835e-07,
"logits/chosen": 0.7952204942703247,
"logits/rejected": 0.5976537466049194,
"loss": 0.968,
"step": 231
},
{
"beta_dpo/beta_used": 0.04585336521267891,
"beta_dpo/beta_used_raw": 0.04585336521267891,
"beta_dpo/gap_mean": 6.546322345733643,
"beta_dpo/gap_std": 12.779912948608398,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.3507180650037793,
"grad_norm": 11.413716316223145,
"learning_rate": 4.1170752879801436e-07,
"logits/chosen": 1.2692692279815674,
"logits/rejected": 1.3366895914077759,
"loss": 1.2179,
"step": 232
},
{
"beta_dpo/beta_used": 0.1407492756843567,
"beta_dpo/beta_used_raw": 0.046730317175388336,
"beta_dpo/gap_mean": 6.603410720825195,
"beta_dpo/gap_std": 12.996603012084961,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.35222978080120937,
"grad_norm": 42.801273345947266,
"learning_rate": 4.106969024216348e-07,
"logits/chosen": 1.7770150899887085,
"logits/rejected": 1.4710367918014526,
"loss": 1.186,
"step": 233
},
{
"beta_dpo/beta_used": 0.12140364944934845,
"beta_dpo/beta_used_raw": 0.10727863758802414,
"beta_dpo/gap_mean": 6.156139373779297,
"beta_dpo/gap_std": 13.207222938537598,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.35374149659863946,
"grad_norm": 38.265342712402344,
"learning_rate": 4.09681781007452e-07,
"logits/chosen": 0.5206916332244873,
"logits/rejected": 0.37996482849121094,
"loss": 1.2569,
"step": 234
},
{
"beta_dpo/beta_used": 0.11484545469284058,
"beta_dpo/beta_used_raw": 0.10733015090227127,
"beta_dpo/gap_mean": 6.554360389709473,
"beta_dpo/gap_std": 12.979118347167969,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.35525321239606955,
"grad_norm": 50.358585357666016,
"learning_rate": 4.08662192950594e-07,
"logits/chosen": 1.2737979888916016,
"logits/rejected": 1.3781143426895142,
"loss": 1.2159,
"step": 235
},
{
"beta_dpo/beta_used": 0.19294582307338715,
"beta_dpo/beta_used_raw": 0.18594704568386078,
"beta_dpo/gap_mean": 6.644139289855957,
"beta_dpo/gap_std": 13.24412727355957,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.35676492819349964,
"grad_norm": 50.509952545166016,
"learning_rate": 4.076381667711306e-07,
"logits/chosen": 1.7458560466766357,
"logits/rejected": 1.6359169483184814,
"loss": 1.0928,
"step": 236
},
{
"beta_dpo/beta_used": 0.09629102051258087,
"beta_dpo/beta_used_raw": 0.03170393407344818,
"beta_dpo/gap_mean": 6.329275608062744,
"beta_dpo/gap_std": 13.261556625366211,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.35827664399092973,
"grad_norm": 18.643909454345703,
"learning_rate": 4.066097311132753e-07,
"logits/chosen": 1.4134365320205688,
"logits/rejected": 1.3123092651367188,
"loss": 1.0787,
"step": 237
},
{
"beta_dpo/beta_used": 0.11005407571792603,
"beta_dpo/beta_used_raw": 0.11005407571792603,
"beta_dpo/gap_mean": 6.265144348144531,
"beta_dpo/gap_std": 13.122074127197266,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.35978835978835977,
"grad_norm": 21.7652587890625,
"learning_rate": 4.0557691474458414e-07,
"logits/chosen": 1.703669786453247,
"logits/rejected": 1.675083875656128,
"loss": 1.0768,
"step": 238
},
{
"beta_dpo/beta_used": 0.11901892721652985,
"beta_dpo/beta_used_raw": 0.11901892721652985,
"beta_dpo/gap_mean": 6.37298583984375,
"beta_dpo/gap_std": 13.223270416259766,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.36130007558578986,
"grad_norm": 27.12213134765625,
"learning_rate": 4.045397465551513e-07,
"logits/chosen": 1.5994396209716797,
"logits/rejected": 1.4502242803573608,
"loss": 1.1095,
"step": 239
},
{
"beta_dpo/beta_used": 0.3060862421989441,
"beta_dpo/beta_used_raw": 0.3060862421989441,
"beta_dpo/gap_mean": 7.014960289001465,
"beta_dpo/gap_std": 13.332306861877441,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.36281179138321995,
"grad_norm": 71.83600616455078,
"learning_rate": 4.0349825555680045e-07,
"logits/chosen": 1.3177558183670044,
"logits/rejected": 1.2535611391067505,
"loss": 0.9217,
"step": 240
},
{
"beta_dpo/beta_used": 0.058323122560977936,
"beta_dpo/beta_used_raw": 0.058323122560977936,
"beta_dpo/gap_mean": 7.006235599517822,
"beta_dpo/gap_std": 13.238981246948242,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.36432350718065004,
"grad_norm": 15.872024536132812,
"learning_rate": 4.0245247088227377e-07,
"logits/chosen": 1.207369327545166,
"logits/rejected": 1.034806489944458,
"loss": 1.1805,
"step": 241
},
{
"beta_dpo/beta_used": 0.11554928123950958,
"beta_dpo/beta_used_raw": 0.10966099053621292,
"beta_dpo/gap_mean": 7.3681416511535645,
"beta_dpo/gap_std": 13.11304759979248,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.36583522297808013,
"grad_norm": 21.17530059814453,
"learning_rate": 4.0140242178441665e-07,
"logits/chosen": 0.5199865698814392,
"logits/rejected": 0.37630772590637207,
"loss": 1.0416,
"step": 242
},
{
"beta_dpo/beta_used": 0.0793665200471878,
"beta_dpo/beta_used_raw": 0.06487854570150375,
"beta_dpo/gap_mean": 7.23813533782959,
"beta_dpo/gap_std": 12.889257431030273,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.3673469387755102,
"grad_norm": 24.303476333618164,
"learning_rate": 4.003481376353596e-07,
"logits/chosen": 1.7084109783172607,
"logits/rejected": 1.7079315185546875,
"loss": 1.2085,
"step": 243
},
{
"beta_dpo/beta_used": 0.11602146923542023,
"beta_dpo/beta_used_raw": 0.11602146923542023,
"beta_dpo/gap_mean": 7.612434387207031,
"beta_dpo/gap_std": 12.60782241821289,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.3688586545729403,
"grad_norm": 25.81456184387207,
"learning_rate": 3.9928964792569654e-07,
"logits/chosen": 1.3396780490875244,
"logits/rejected": 1.2662789821624756,
"loss": 0.9081,
"step": 244
},
{
"beta_dpo/beta_used": 0.17276377975940704,
"beta_dpo/beta_used_raw": 0.17276377975940704,
"beta_dpo/gap_mean": 8.035795211791992,
"beta_dpo/gap_std": 12.561846733093262,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.37037037037037035,
"grad_norm": 28.689769744873047,
"learning_rate": 3.982269822636601e-07,
"logits/chosen": 1.5368680953979492,
"logits/rejected": 1.4403884410858154,
"loss": 0.7303,
"step": 245
},
{
"beta_dpo/beta_used": 0.07997963577508926,
"beta_dpo/beta_used_raw": 0.07323883473873138,
"beta_dpo/gap_mean": 8.246414184570312,
"beta_dpo/gap_std": 12.713071823120117,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.37188208616780044,
"grad_norm": 24.240697860717773,
"learning_rate": 3.971601703742932e-07,
"logits/chosen": 1.8209779262542725,
"logits/rejected": 1.883533239364624,
"loss": 1.0866,
"step": 246
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.14714615046977997,
"beta_dpo/gap_mean": 7.606928825378418,
"beta_dpo/gap_std": 12.773536682128906,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.37339380196523053,
"grad_norm": 0.31187567114830017,
"learning_rate": 3.960892420986177e-07,
"logits/chosen": 0.9519743323326111,
"logits/rejected": 0.7347662448883057,
"loss": 1.3828,
"step": 247
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.08996061980724335,
"beta_dpo/gap_mean": 7.225895881652832,
"beta_dpo/gap_std": 12.812955856323242,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.3749055177626606,
"grad_norm": 0.3030962347984314,
"learning_rate": 3.9501422739279953e-07,
"logits/chosen": 1.2478711605072021,
"logits/rejected": 1.4599595069885254,
"loss": 1.3826,
"step": 248
},
{
"beta_dpo/beta_used": 0.038103874772787094,
"beta_dpo/beta_used_raw": 0.012837713584303856,
"beta_dpo/gap_mean": 6.602936744689941,
"beta_dpo/gap_std": 12.938857078552246,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.3764172335600907,
"grad_norm": 11.848731994628906,
"learning_rate": 3.9393515632731094e-07,
"logits/chosen": 2.3792171478271484,
"logits/rejected": 2.3942737579345703,
"loss": 1.2594,
"step": 249
},
{
"beta_dpo/beta_used": 0.195104718208313,
"beta_dpo/beta_used_raw": 0.195104718208313,
"beta_dpo/gap_mean": 6.806901931762695,
"beta_dpo/gap_std": 13.18899917602539,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.3779289493575208,
"grad_norm": 35.285213470458984,
"learning_rate": 3.9285205908608934e-07,
"logits/chosen": 1.5496623516082764,
"logits/rejected": 1.4084426164627075,
"loss": 0.8475,
"step": 250
},
{
"beta_dpo/beta_used": 0.03262628987431526,
"beta_dpo/beta_used_raw": -0.007272530347108841,
"beta_dpo/gap_mean": 6.856196880340576,
"beta_dpo/gap_std": 13.041912078857422,
"beta_dpo/mask_keep_frac": 0.5625,
"epoch": 0.3794406651549509,
"grad_norm": 11.482399940490723,
"learning_rate": 3.9176496596569265e-07,
"logits/chosen": 1.570683240890503,
"logits/rejected": 1.6037306785583496,
"loss": 1.2477,
"step": 251
},
{
"beta_dpo/beta_used": 0.009753710590302944,
"beta_dpo/beta_used_raw": -0.043241649866104126,
"beta_dpo/gap_mean": 6.3600311279296875,
"beta_dpo/gap_std": 12.867057800292969,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.38095238095238093,
"grad_norm": 2.8048999309539795,
"learning_rate": 3.9067390737445254e-07,
"logits/chosen": 1.5606493949890137,
"logits/rejected": 1.436813473701477,
"loss": 1.3433,
"step": 252
},
{
"beta_dpo/beta_used": 0.011095372959971428,
"beta_dpo/beta_used_raw": -0.00485480111092329,
"beta_dpo/gap_mean": 6.311408996582031,
"beta_dpo/gap_std": 12.606225967407227,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.382464096749811,
"grad_norm": 3.6359784603118896,
"learning_rate": 3.8957891383162304e-07,
"logits/chosen": 1.5644431114196777,
"logits/rejected": 1.4657902717590332,
"loss": 1.3373,
"step": 253
},
{
"beta_dpo/beta_used": 0.038476165384054184,
"beta_dpo/beta_used_raw": 0.006513316184282303,
"beta_dpo/gap_mean": 6.202248573303223,
"beta_dpo/gap_std": 12.51596450805664,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.3839758125472411,
"grad_norm": 13.625349998474121,
"learning_rate": 3.884800159665276e-07,
"logits/chosen": 1.114762306213379,
"logits/rejected": 0.9899729490280151,
"loss": 1.2823,
"step": 254
},
{
"beta_dpo/beta_used": 0.21120937168598175,
"beta_dpo/beta_used_raw": 0.16640348732471466,
"beta_dpo/gap_mean": 6.46546745300293,
"beta_dpo/gap_std": 12.447296142578125,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.3854875283446712,
"grad_norm": 41.72684097290039,
"learning_rate": 3.873772445177015e-07,
"logits/chosen": 1.5619221925735474,
"logits/rejected": 1.3914833068847656,
"loss": 1.0468,
"step": 255
},
{
"beta_dpo/beta_used": 0.10800749063491821,
"beta_dpo/beta_used_raw": 0.05630078166723251,
"beta_dpo/gap_mean": 6.492776870727539,
"beta_dpo/gap_std": 12.558290481567383,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.3869992441421013,
"grad_norm": 33.22319030761719,
"learning_rate": 3.862706303320329e-07,
"logits/chosen": 1.068698525428772,
"logits/rejected": 0.9799892902374268,
"loss": 1.2679,
"step": 256
},
{
"beta_dpo/beta_used": 0.023676693439483643,
"beta_dpo/beta_used_raw": 0.015448857098817825,
"beta_dpo/gap_mean": 6.576137065887451,
"beta_dpo/gap_std": 12.788522720336914,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.3885109599395314,
"grad_norm": 7.665389537811279,
"learning_rate": 3.851602043638994e-07,
"logits/chosen": 1.521716594696045,
"logits/rejected": 1.2976162433624268,
"loss": 1.2872,
"step": 257
},
{
"beta_dpo/beta_used": 0.127933531999588,
"beta_dpo/beta_used_raw": 0.11033637076616287,
"beta_dpo/gap_mean": 6.729240417480469,
"beta_dpo/gap_std": 12.479969024658203,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.3900226757369615,
"grad_norm": 31.811519622802734,
"learning_rate": 3.840459976743023e-07,
"logits/chosen": 1.6439337730407715,
"logits/rejected": 1.4948757886886597,
"loss": 0.9875,
"step": 258
},
{
"beta_dpo/beta_used": 0.2876676619052887,
"beta_dpo/beta_used_raw": 0.2876676619052887,
"beta_dpo/gap_mean": 7.496322154998779,
"beta_dpo/gap_std": 12.738725662231445,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.3915343915343915,
"grad_norm": 69.39122772216797,
"learning_rate": 3.8292804142999796e-07,
"logits/chosen": 1.76267409324646,
"logits/rejected": 1.7653789520263672,
"loss": 0.8932,
"step": 259
},
{
"beta_dpo/beta_used": 0.003968134988099337,
"beta_dpo/beta_used_raw": -0.08447183668613434,
"beta_dpo/gap_mean": 7.647377014160156,
"beta_dpo/gap_std": 12.91828441619873,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.3930461073318216,
"grad_norm": 1.4290848970413208,
"learning_rate": 3.818063669026256e-07,
"logits/chosen": 1.6024353504180908,
"logits/rejected": 1.4254289865493774,
"loss": 1.3641,
"step": 260
},
{
"beta_dpo/beta_used": 0.09260217845439911,
"beta_dpo/beta_used_raw": 0.09260217845439911,
"beta_dpo/gap_mean": 7.191786766052246,
"beta_dpo/gap_std": 12.985448837280273,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.3945578231292517,
"grad_norm": 23.355159759521484,
"learning_rate": 3.806810054678331e-07,
"logits/chosen": 1.637736201286316,
"logits/rejected": 2.0598158836364746,
"loss": 1.1513,
"step": 261
},
{
"beta_dpo/beta_used": 0.055604852735996246,
"beta_dpo/beta_used_raw": 0.020321451127529144,
"beta_dpo/gap_mean": 6.929043769836426,
"beta_dpo/gap_std": 12.48460578918457,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.3960695389266818,
"grad_norm": 14.879576683044434,
"learning_rate": 3.7955198860439887e-07,
"logits/chosen": 1.6142950057983398,
"logits/rejected": 1.5240156650543213,
"loss": 1.1614,
"step": 262
},
{
"beta_dpo/beta_used": 0.06638128310441971,
"beta_dpo/beta_used_raw": 0.0491025447845459,
"beta_dpo/gap_mean": 7.156874179840088,
"beta_dpo/gap_std": 12.549823760986328,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.3975812547241119,
"grad_norm": 17.282392501831055,
"learning_rate": 3.784193478933516e-07,
"logits/chosen": 1.7361516952514648,
"logits/rejected": 1.5868427753448486,
"loss": 1.1135,
"step": 263
},
{
"beta_dpo/beta_used": 0.0455821193754673,
"beta_dpo/beta_used_raw": 0.009301692247390747,
"beta_dpo/gap_mean": 7.250235557556152,
"beta_dpo/gap_std": 12.345619201660156,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.39909297052154197,
"grad_norm": 15.046016693115234,
"learning_rate": 3.7728311501708674e-07,
"logits/chosen": 1.4044766426086426,
"logits/rejected": 1.309002161026001,
"loss": 1.1949,
"step": 264
},
{
"beta_dpo/beta_used": 0.27329275012016296,
"beta_dpo/beta_used_raw": 0.27329275012016296,
"beta_dpo/gap_mean": 7.598145961761475,
"beta_dpo/gap_std": 12.671415328979492,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.40060468631897206,
"grad_norm": 47.54646301269531,
"learning_rate": 3.7614332175848027e-07,
"logits/chosen": 1.227288007736206,
"logits/rejected": 1.1892151832580566,
"loss": 0.7855,
"step": 265
},
{
"beta_dpo/beta_used": 0.09071945399045944,
"beta_dpo/beta_used_raw": 0.09071945399045944,
"beta_dpo/gap_mean": 7.997687816619873,
"beta_dpo/gap_std": 12.852567672729492,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.4021164021164021,
"grad_norm": 20.935165405273438,
"learning_rate": 3.75e-07,
"logits/chosen": 2.1720075607299805,
"logits/rejected": 1.9046260118484497,
"loss": 1.0269,
"step": 266
},
{
"beta_dpo/beta_used": 0.1523996889591217,
"beta_dpo/beta_used_raw": 0.1523996889591217,
"beta_dpo/gap_mean": 7.685354709625244,
"beta_dpo/gap_std": 12.811508178710938,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.4036281179138322,
"grad_norm": 20.654251098632812,
"learning_rate": 3.738531817228131e-07,
"logits/chosen": 1.4944243431091309,
"logits/rejected": 1.2840352058410645,
"loss": 0.8099,
"step": 267
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.14010438323020935,
"beta_dpo/gap_mean": 7.252220630645752,
"beta_dpo/gap_std": 12.721076965332031,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.4051398337112623,
"grad_norm": 0.2638804614543915,
"learning_rate": 3.7270289900589204e-07,
"logits/chosen": 1.4701387882232666,
"logits/rejected": 1.3832581043243408,
"loss": 1.3831,
"step": 268
},
{
"beta_dpo/beta_used": 0.16782352328300476,
"beta_dpo/beta_used_raw": 0.16782352328300476,
"beta_dpo/gap_mean": 7.114346504211426,
"beta_dpo/gap_std": 12.626228332519531,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.40665154950869237,
"grad_norm": 31.832950592041016,
"learning_rate": 3.7154918402511714e-07,
"logits/chosen": 1.55270516872406,
"logits/rejected": 1.756973147392273,
"loss": 0.9503,
"step": 269
},
{
"beta_dpo/beta_used": 0.00175630790181458,
"beta_dpo/beta_used_raw": -0.00642303517088294,
"beta_dpo/gap_mean": 7.145930767059326,
"beta_dpo/gap_std": 12.572525024414062,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.40816326530612246,
"grad_norm": 0.5824019312858582,
"learning_rate": 3.7039206905237656e-07,
"logits/chosen": 1.472477674484253,
"logits/rejected": 1.4838604927062988,
"loss": 1.3767,
"step": 270
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.030945777893066406,
"beta_dpo/gap_mean": 6.868946075439453,
"beta_dpo/gap_std": 12.882951736450195,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.40967498110355255,
"grad_norm": 0.305041640996933,
"learning_rate": 3.692315864546635e-07,
"logits/chosen": 1.6203490495681763,
"logits/rejected": 1.2593576908111572,
"loss": 1.382,
"step": 271
},
{
"beta_dpo/beta_used": 0.26175782084465027,
"beta_dpo/beta_used_raw": 0.26175782084465027,
"beta_dpo/gap_mean": 7.0912322998046875,
"beta_dpo/gap_std": 12.92785358428955,
"beta_dpo/mask_keep_frac": 0.5,
"epoch": 0.41118669690098264,
"grad_norm": 46.45206069946289,
"learning_rate": 3.6806776869317067e-07,
"logits/chosen": 2.0279994010925293,
"logits/rejected": 2.015707492828369,
"loss": 0.7908,
"step": 272
},
{
"beta_dpo/beta_used": 0.12874624133110046,
"beta_dpo/beta_used_raw": 0.12874624133110046,
"beta_dpo/gap_mean": 7.3807525634765625,
"beta_dpo/gap_std": 13.080790519714355,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.4126984126984127,
"grad_norm": 32.41847610473633,
"learning_rate": 3.669006483223828e-07,
"logits/chosen": 1.844411849975586,
"logits/rejected": 1.5684620141983032,
"loss": 1.0028,
"step": 273
},
{
"beta_dpo/beta_used": 0.14629867672920227,
"beta_dpo/beta_used_raw": 0.14629867672920227,
"beta_dpo/gap_mean": 7.499863624572754,
"beta_dpo/gap_std": 13.270782470703125,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.41421012849584277,
"grad_norm": 31.732133865356445,
"learning_rate": 3.657302579891656e-07,
"logits/chosen": 1.6515002250671387,
"logits/rejected": 1.8607064485549927,
"loss": 0.9586,
"step": 274
},
{
"beta_dpo/beta_used": 0.09432569891214371,
"beta_dpo/beta_used_raw": 0.06229160353541374,
"beta_dpo/gap_mean": 7.779719829559326,
"beta_dpo/gap_std": 13.27018928527832,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.41572184429327286,
"grad_norm": 23.149499893188477,
"learning_rate": 3.645566304318526e-07,
"logits/chosen": 1.7068809270858765,
"logits/rejected": 1.6265830993652344,
"loss": 1.0502,
"step": 275
},
{
"beta_dpo/beta_used": 0.1937197744846344,
"beta_dpo/beta_used_raw": 0.1937197744846344,
"beta_dpo/gap_mean": 8.17389965057373,
"beta_dpo/gap_std": 13.258670806884766,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.41723356009070295,
"grad_norm": 36.68490982055664,
"learning_rate": 3.633797984793294e-07,
"logits/chosen": 1.0973702669143677,
"logits/rejected": 1.1315345764160156,
"loss": 0.8111,
"step": 276
},
{
"beta_dpo/beta_used": 0.04128948226571083,
"beta_dpo/beta_used_raw": -0.01279013603925705,
"beta_dpo/gap_mean": 7.719527721405029,
"beta_dpo/gap_std": 13.315411567687988,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.41874527588813304,
"grad_norm": 13.151788711547852,
"learning_rate": 3.6219979505011555e-07,
"logits/chosen": 0.9874433875083923,
"logits/rejected": 0.9136591553688049,
"loss": 1.2261,
"step": 277
},
{
"beta_dpo/beta_used": 0.08919985592365265,
"beta_dpo/beta_used_raw": 0.03976030275225639,
"beta_dpo/gap_mean": 7.159448623657227,
"beta_dpo/gap_std": 13.33280086517334,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.42025699168556313,
"grad_norm": 36.67196273803711,
"learning_rate": 3.6101665315144353e-07,
"logits/chosen": 1.772204875946045,
"logits/rejected": 1.61760413646698,
"loss": 1.3171,
"step": 278
},
{
"beta_dpo/beta_used": 0.2875370979309082,
"beta_dpo/beta_used_raw": 0.2875370979309082,
"beta_dpo/gap_mean": 7.724452495574951,
"beta_dpo/gap_std": 13.336029052734375,
"beta_dpo/mask_keep_frac": 0.5625,
"epoch": 0.4217687074829932,
"grad_norm": 42.04952621459961,
"learning_rate": 3.5983040587833563e-07,
"logits/chosen": 1.4997611045837402,
"logits/rejected": 1.4169633388519287,
"loss": 0.6457,
"step": 279
},
{
"beta_dpo/beta_used": 0.1273547261953354,
"beta_dpo/beta_used_raw": 0.1273547261953354,
"beta_dpo/gap_mean": 8.448027610778809,
"beta_dpo/gap_std": 13.09150505065918,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.42328042328042326,
"grad_norm": 22.6016902923584,
"learning_rate": 3.586410864126781e-07,
"logits/chosen": 1.3335264921188354,
"logits/rejected": 1.205428123474121,
"loss": 0.789,
"step": 280
},
{
"beta_dpo/beta_used": 0.09392253309488297,
"beta_dpo/beta_used_raw": 0.09392253309488297,
"beta_dpo/gap_mean": 8.494600296020508,
"beta_dpo/gap_std": 13.018985748291016,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.42479213907785335,
"grad_norm": 17.760608673095703,
"learning_rate": 3.574487280222929e-07,
"logits/chosen": 1.7236804962158203,
"logits/rejected": 1.7596588134765625,
"loss": 0.9312,
"step": 281
},
{
"beta_dpo/beta_used": 0.07099371403455734,
"beta_dpo/beta_used_raw": 0.0519629567861557,
"beta_dpo/gap_mean": 8.544626235961914,
"beta_dpo/gap_std": 13.091127395629883,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.42630385487528344,
"grad_norm": 19.166406631469727,
"learning_rate": 3.562533640600075e-07,
"logits/chosen": 1.3285274505615234,
"logits/rejected": 1.0681095123291016,
"loss": 1.123,
"step": 282
},
{
"beta_dpo/beta_used": 0.05208796635270119,
"beta_dpo/beta_used_raw": 0.05208796635270119,
"beta_dpo/gap_mean": 8.585311889648438,
"beta_dpo/gap_std": 13.22861099243164,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.42781557067271353,
"grad_norm": 14.542259216308594,
"learning_rate": 3.550550279627215e-07,
"logits/chosen": 1.4741332530975342,
"logits/rejected": 1.3183352947235107,
"loss": 1.1539,
"step": 283
},
{
"beta_dpo/beta_used": 0.09346778690814972,
"beta_dpo/beta_used_raw": 0.09346778690814972,
"beta_dpo/gap_mean": 8.709510803222656,
"beta_dpo/gap_std": 13.263925552368164,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.4293272864701436,
"grad_norm": 23.085359573364258,
"learning_rate": 3.5385375325047163e-07,
"logits/chosen": 1.9889543056488037,
"logits/rejected": 1.9929530620574951,
"loss": 0.9258,
"step": 284
},
{
"beta_dpo/beta_used": 0.03014732524752617,
"beta_dpo/beta_used_raw": -0.04297472536563873,
"beta_dpo/gap_mean": 8.226021766662598,
"beta_dpo/gap_std": 13.281122207641602,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.4308390022675737,
"grad_norm": 13.746954917907715,
"learning_rate": 3.5264957352549375e-07,
"logits/chosen": 1.8868443965911865,
"logits/rejected": 1.9052425622940063,
"loss": 1.2401,
"step": 285
},
{
"beta_dpo/beta_used": 0.11650238931179047,
"beta_dpo/beta_used_raw": 0.11650238931179047,
"beta_dpo/gap_mean": 8.26309585571289,
"beta_dpo/gap_std": 13.527618408203125,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.4323507180650038,
"grad_norm": 27.496360778808594,
"learning_rate": 3.514425224712835e-07,
"logits/chosen": 1.469580888748169,
"logits/rejected": 1.596300482749939,
"loss": 0.9446,
"step": 286
},
{
"beta_dpo/beta_used": 0.07653540372848511,
"beta_dpo/beta_used_raw": 0.07653540372848511,
"beta_dpo/gap_mean": 8.673506736755371,
"beta_dpo/gap_std": 13.553279876708984,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.43386243386243384,
"grad_norm": 22.05826187133789,
"learning_rate": 3.502326338516534e-07,
"logits/chosen": 1.3630534410476685,
"logits/rejected": 0.8927639126777649,
"loss": 1.0864,
"step": 287
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.024330193176865578,
"beta_dpo/gap_mean": 8.601228713989258,
"beta_dpo/gap_std": 13.768001556396484,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.43537414965986393,
"grad_norm": 0.32395094633102417,
"learning_rate": 3.490199415097892e-07,
"logits/chosen": 0.8396840691566467,
"logits/rejected": 0.8194477558135986,
"loss": 1.3798,
"step": 288
},
{
"beta_dpo/beta_used": 0.0935235470533371,
"beta_dpo/beta_used_raw": 0.0935235470533371,
"beta_dpo/gap_mean": 8.546222686767578,
"beta_dpo/gap_std": 13.85302448272705,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.436885865457294,
"grad_norm": 19.63606071472168,
"learning_rate": 3.4780447936730247e-07,
"logits/chosen": 1.1697039604187012,
"logits/rejected": 1.3744932413101196,
"loss": 0.9199,
"step": 289
},
{
"beta_dpo/beta_used": 0.0780460461974144,
"beta_dpo/beta_used_raw": 0.0780460461974144,
"beta_dpo/gap_mean": 8.599958419799805,
"beta_dpo/gap_std": 13.72342300415039,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.4383975812547241,
"grad_norm": 17.42572593688965,
"learning_rate": 3.465862814232821e-07,
"logits/chosen": 1.5614783763885498,
"logits/rejected": 1.4205409288406372,
"loss": 1.0388,
"step": 290
},
{
"beta_dpo/beta_used": 0.1943441778421402,
"beta_dpo/beta_used_raw": 0.17343299090862274,
"beta_dpo/gap_mean": 8.35032844543457,
"beta_dpo/gap_std": 13.765388488769531,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.4399092970521542,
"grad_norm": 57.41442108154297,
"learning_rate": 3.4536538175334343e-07,
"logits/chosen": 1.446760892868042,
"logits/rejected": 1.3593605756759644,
"loss": 1.1068,
"step": 291
},
{
"beta_dpo/beta_used": 0.09613867104053497,
"beta_dpo/beta_used_raw": 0.09613867104053497,
"beta_dpo/gap_mean": 8.555745124816895,
"beta_dpo/gap_std": 14.046996116638184,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.4414210128495843,
"grad_norm": 27.844497680664062,
"learning_rate": 3.4414181450867465e-07,
"logits/chosen": 1.3435033559799194,
"logits/rejected": 1.224219560623169,
"loss": 1.0311,
"step": 292
},
{
"beta_dpo/beta_used": 0.275066614151001,
"beta_dpo/beta_used_raw": 0.275066614151001,
"beta_dpo/gap_mean": 8.830119132995605,
"beta_dpo/gap_std": 14.660975456237793,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.4429327286470144,
"grad_norm": 62.09123992919922,
"learning_rate": 3.4291561391508185e-07,
"logits/chosen": 1.338533878326416,
"logits/rejected": 1.4490954875946045,
"loss": 0.8841,
"step": 293
},
{
"beta_dpo/beta_used": 0.14097994565963745,
"beta_dpo/beta_used_raw": 0.07972858846187592,
"beta_dpo/gap_mean": 9.071542739868164,
"beta_dpo/gap_std": 14.639909744262695,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.4444444444444444,
"grad_norm": 29.380847930908203,
"learning_rate": 3.4168681427203153e-07,
"logits/chosen": 1.6176725625991821,
"logits/rejected": 1.4952876567840576,
"loss": 0.9302,
"step": 294
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.07594194263219833,
"beta_dpo/gap_mean": 8.659149169921875,
"beta_dpo/gap_std": 14.577923774719238,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.4459561602418745,
"grad_norm": 0.29677557945251465,
"learning_rate": 3.4045544995169125e-07,
"logits/chosen": 1.670243501663208,
"logits/rejected": 1.410202980041504,
"loss": 1.3802,
"step": 295
},
{
"beta_dpo/beta_used": 0.034500446170568466,
"beta_dpo/beta_used_raw": 0.024874616414308548,
"beta_dpo/gap_mean": 8.963502883911133,
"beta_dpo/gap_std": 14.65359115600586,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.4474678760393046,
"grad_norm": 12.611300468444824,
"learning_rate": 3.392215553979679e-07,
"logits/chosen": 1.4267054796218872,
"logits/rejected": 1.1156208515167236,
"loss": 1.2078,
"step": 296
},
{
"beta_dpo/beta_used": 0.087165467441082,
"beta_dpo/beta_used_raw": 0.087165467441082,
"beta_dpo/gap_mean": 8.99653148651123,
"beta_dpo/gap_std": 14.433828353881836,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.4489795918367347,
"grad_norm": 37.467220306396484,
"learning_rate": 3.3798516512554485e-07,
"logits/chosen": 1.572485089302063,
"logits/rejected": 1.3337376117706299,
"loss": 1.0566,
"step": 297
},
{
"beta_dpo/beta_used": 0.10362537205219269,
"beta_dpo/beta_used_raw": 0.10362537205219269,
"beta_dpo/gap_mean": 9.2714262008667,
"beta_dpo/gap_std": 14.870027542114258,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.4504913076341648,
"grad_norm": 23.68560028076172,
"learning_rate": 3.367463137189156e-07,
"logits/chosen": 2.0254149436950684,
"logits/rejected": 2.0223116874694824,
"loss": 0.9876,
"step": 298
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.07137240469455719,
"beta_dpo/gap_mean": 8.997974395751953,
"beta_dpo/gap_std": 14.866207122802734,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.4520030234315949,
"grad_norm": 0.31590133905410767,
"learning_rate": 3.355050358314172e-07,
"logits/chosen": 1.4713513851165771,
"logits/rejected": 1.425032138824463,
"loss": 1.3806,
"step": 299
},
{
"beta_dpo/beta_used": 0.044669389724731445,
"beta_dpo/beta_used_raw": 0.044669389724731445,
"beta_dpo/gap_mean": 8.637369155883789,
"beta_dpo/gap_std": 15.14453125,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.45351473922902497,
"grad_norm": 13.89785099029541,
"learning_rate": 3.3426136618426043e-07,
"logits/chosen": 1.490415334701538,
"logits/rejected": 1.5728942155838013,
"loss": 1.1502,
"step": 300
},
{
"epoch": 0.45351473922902497,
"eval_beta_dpo/beta_used": 0.14202851057052612,
"eval_beta_dpo/beta_used_raw": 0.12426428496837616,
"eval_beta_dpo/gap_mean": 8.545414924621582,
"eval_beta_dpo/gap_std": 15.385650634765625,
"eval_beta_dpo/mask_keep_frac": 1.0,
"eval_logits/chosen": 1.4621630907058716,
"eval_logits/rejected": 1.3383522033691406,
"eval_loss": 0.6282562613487244,
"eval_runtime": 42.7236,
"eval_samples_per_second": 53.905,
"eval_steps_per_second": 1.685,
"step": 300
},
{
"beta_dpo/beta_used": 0.062314994633197784,
"beta_dpo/beta_used_raw": 0.053756166249513626,
"beta_dpo/gap_mean": 8.569022178649902,
"beta_dpo/gap_std": 15.335243225097656,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.455026455026455,
"grad_norm": 26.874900817871094,
"learning_rate": 3.3301533956555885e-07,
"logits/chosen": 1.6456646919250488,
"logits/rejected": 1.4892668724060059,
"loss": 1.2159,
"step": 301
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.09743010997772217,
"beta_dpo/gap_mean": 7.805597305297852,
"beta_dpo/gap_std": 15.338693618774414,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.4565381708238851,
"grad_norm": 0.30480095744132996,
"learning_rate": 3.317669908293554e-07,
"logits/chosen": 0.9524801969528198,
"logits/rejected": 0.733474612236023,
"loss": 1.382,
"step": 302
},
{
"beta_dpo/beta_used": 0.13179755210876465,
"beta_dpo/beta_used_raw": 0.13179755210876465,
"beta_dpo/gap_mean": 8.191202163696289,
"beta_dpo/gap_std": 15.557994842529297,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.4580498866213152,
"grad_norm": 35.628971099853516,
"learning_rate": 3.3051635489464793e-07,
"logits/chosen": 1.696607232093811,
"logits/rejected": 1.6582088470458984,
"loss": 1.0721,
"step": 303
},
{
"beta_dpo/beta_used": 0.11351722478866577,
"beta_dpo/beta_used_raw": 0.11351722478866577,
"beta_dpo/gap_mean": 8.502336502075195,
"beta_dpo/gap_std": 15.554780960083008,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.4595616024187453,
"grad_norm": 18.954330444335938,
"learning_rate": 3.292634667444117e-07,
"logits/chosen": 1.550957202911377,
"logits/rejected": 1.3838510513305664,
"loss": 0.8149,
"step": 304
},
{
"beta_dpo/beta_used": 0.15219593048095703,
"beta_dpo/beta_used_raw": 0.15219593048095703,
"beta_dpo/gap_mean": 8.879709243774414,
"beta_dpo/gap_std": 15.658042907714844,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.46107331821617537,
"grad_norm": 49.46084213256836,
"learning_rate": 3.280083614246217e-07,
"logits/chosen": 1.1397596597671509,
"logits/rejected": 1.192775011062622,
"loss": 1.0138,
"step": 305
},
{
"beta_dpo/beta_used": 0.0675729289650917,
"beta_dpo/beta_used_raw": -0.029155783355236053,
"beta_dpo/gap_mean": 8.949745178222656,
"beta_dpo/gap_std": 15.603281021118164,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.46258503401360546,
"grad_norm": 14.935590744018555,
"learning_rate": 3.267510740432719e-07,
"logits/chosen": 1.4761111736297607,
"logits/rejected": 1.2490260601043701,
"loss": 1.0791,
"step": 306
},
{
"beta_dpo/beta_used": 0.06433078646659851,
"beta_dpo/beta_used_raw": 0.032508764415979385,
"beta_dpo/gap_mean": 8.469474792480469,
"beta_dpo/gap_std": 15.57625961303711,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.46409674981103555,
"grad_norm": 17.987796783447266,
"learning_rate": 3.2549163976939285e-07,
"logits/chosen": 1.498106598854065,
"logits/rejected": 1.4667487144470215,
"loss": 1.2054,
"step": 307
},
{
"beta_dpo/beta_used": 0.10494223982095718,
"beta_dpo/beta_used_raw": 0.08697421848773956,
"beta_dpo/gap_mean": 8.70065975189209,
"beta_dpo/gap_std": 15.584355354309082,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.4656084656084656,
"grad_norm": 33.857913970947266,
"learning_rate": 3.2423009383206874e-07,
"logits/chosen": 1.12371826171875,
"logits/rejected": 1.2730367183685303,
"loss": 1.1937,
"step": 308
},
{
"beta_dpo/beta_used": 0.05916978791356087,
"beta_dpo/beta_used_raw": 0.05702737346291542,
"beta_dpo/gap_mean": 8.90184211730957,
"beta_dpo/gap_std": 15.37250804901123,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.4671201814058957,
"grad_norm": 15.79340648651123,
"learning_rate": 3.229664715194511e-07,
"logits/chosen": 1.3931810855865479,
"logits/rejected": 1.3838417530059814,
"loss": 1.1177,
"step": 309
},
{
"beta_dpo/beta_used": 0.07162805646657944,
"beta_dpo/beta_used_raw": 0.029773060232400894,
"beta_dpo/gap_mean": 8.199593544006348,
"beta_dpo/gap_std": 15.24665641784668,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.46863189720332576,
"grad_norm": 95.10002899169922,
"learning_rate": 3.2170080817777257e-07,
"logits/chosen": 1.5242080688476562,
"logits/rejected": 1.6771780252456665,
"loss": 1.1889,
"step": 310
},
{
"beta_dpo/beta_used": 0.11595961451530457,
"beta_dpo/beta_used_raw": 0.06024138256907463,
"beta_dpo/gap_mean": 8.104015350341797,
"beta_dpo/gap_std": 15.395170211791992,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.47014361300075586,
"grad_norm": 29.715646743774414,
"learning_rate": 3.204331392103574e-07,
"logits/chosen": 1.322619915008545,
"logits/rejected": 1.3766727447509766,
"loss": 1.0216,
"step": 311
},
{
"beta_dpo/beta_used": 0.047369327396154404,
"beta_dpo/beta_used_raw": 0.038489848375320435,
"beta_dpo/gap_mean": 8.375673294067383,
"beta_dpo/gap_std": 15.524256706237793,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.47165532879818595,
"grad_norm": 14.85205078125,
"learning_rate": 3.1916350007663176e-07,
"logits/chosen": 1.7834012508392334,
"logits/rejected": 1.7123432159423828,
"loss": 1.192,
"step": 312
},
{
"beta_dpo/beta_used": 0.17071092128753662,
"beta_dpo/beta_used_raw": 0.17071092128753662,
"beta_dpo/gap_mean": 8.300642967224121,
"beta_dpo/gap_std": 15.795055389404297,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.47316704459561604,
"grad_norm": 38.206764221191406,
"learning_rate": 3.178919262911314e-07,
"logits/chosen": 1.3894063234329224,
"logits/rejected": 1.470247507095337,
"loss": 0.9714,
"step": 313
},
{
"beta_dpo/beta_used": 0.19798541069030762,
"beta_dpo/beta_used_raw": 0.19798541069030762,
"beta_dpo/gap_mean": 8.686609268188477,
"beta_dpo/gap_std": 15.914053916931152,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.47467876039304613,
"grad_norm": 43.56782531738281,
"learning_rate": 3.166184534225087e-07,
"logits/chosen": 1.5680770874023438,
"logits/rejected": 1.4430394172668457,
"loss": 0.9567,
"step": 314
},
{
"beta_dpo/beta_used": 0.03711218759417534,
"beta_dpo/beta_used_raw": -0.0003358498215675354,
"beta_dpo/gap_mean": 8.705648422241211,
"beta_dpo/gap_std": 15.65288257598877,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.47619047619047616,
"grad_norm": 12.367003440856934,
"learning_rate": 3.1534311709253723e-07,
"logits/chosen": 1.3505761623382568,
"logits/rejected": 1.321984052658081,
"loss": 1.2002,
"step": 315
},
{
"beta_dpo/beta_used": 0.13538314402103424,
"beta_dpo/beta_used_raw": 0.08412972092628479,
"beta_dpo/gap_mean": 8.707979202270508,
"beta_dpo/gap_std": 15.149721145629883,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.47770219198790626,
"grad_norm": 23.324026107788086,
"learning_rate": 3.1406595297511564e-07,
"logits/chosen": 1.2432329654693604,
"logits/rejected": 0.7631069421768188,
"loss": 0.9318,
"step": 316
},
{
"beta_dpo/beta_used": 0.09405739605426788,
"beta_dpo/beta_used_raw": 0.09405739605426788,
"beta_dpo/gap_mean": 9.40979290008545,
"beta_dpo/gap_std": 15.358884811401367,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.47921390778533635,
"grad_norm": 24.697673797607422,
"learning_rate": 3.1278699679526975e-07,
"logits/chosen": 1.263979196548462,
"logits/rejected": 1.1331511735916138,
"loss": 1.0274,
"step": 317
},
{
"beta_dpo/beta_used": 0.01103425957262516,
"beta_dpo/beta_used_raw": -0.07546316087245941,
"beta_dpo/gap_mean": 9.228424072265625,
"beta_dpo/gap_std": 15.684703826904297,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.48072562358276644,
"grad_norm": 3.974104642868042,
"learning_rate": 3.1150628432815336e-07,
"logits/chosen": 1.6318895816802979,
"logits/rejected": 1.7524826526641846,
"loss": 1.3091,
"step": 318
},
{
"beta_dpo/beta_used": 0.059235621243715286,
"beta_dpo/beta_used_raw": 0.03933485597372055,
"beta_dpo/gap_mean": 9.031213760375977,
"beta_dpo/gap_std": 15.591960906982422,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.48223733938019653,
"grad_norm": 15.467484474182129,
"learning_rate": 3.1022385139804707e-07,
"logits/chosen": 1.1446490287780762,
"logits/rejected": 0.9336162805557251,
"loss": 1.1188,
"step": 319
},
{
"beta_dpo/beta_used": 0.039282217621803284,
"beta_dpo/beta_used_raw": -0.03736276924610138,
"beta_dpo/gap_mean": 8.335182189941406,
"beta_dpo/gap_std": 15.608378410339355,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.4837490551776266,
"grad_norm": 13.889800071716309,
"learning_rate": 3.0893973387735683e-07,
"logits/chosen": 1.0464937686920166,
"logits/rejected": 0.97780442237854,
"loss": 1.2383,
"step": 320
},
{
"beta_dpo/beta_used": 0.15540650486946106,
"beta_dpo/beta_used_raw": 0.0922960415482521,
"beta_dpo/gap_mean": 8.612838745117188,
"beta_dpo/gap_std": 15.559803009033203,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.4852607709750567,
"grad_norm": 41.25529098510742,
"learning_rate": 3.0765396768561004e-07,
"logits/chosen": 1.2193944454193115,
"logits/rejected": 1.1687374114990234,
"loss": 0.9509,
"step": 321
},
{
"beta_dpo/beta_used": 0.2929460108280182,
"beta_dpo/beta_used_raw": 0.2929460108280182,
"beta_dpo/gap_mean": 8.876199722290039,
"beta_dpo/gap_std": 15.44024658203125,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.48677248677248675,
"grad_norm": 59.753841400146484,
"learning_rate": 3.063665887884511e-07,
"logits/chosen": 1.8868658542633057,
"logits/rejected": 1.554215431213379,
"loss": 0.7392,
"step": 322
},
{
"beta_dpo/beta_used": 0.11718940734863281,
"beta_dpo/beta_used_raw": 0.11718940734863281,
"beta_dpo/gap_mean": 8.979276657104492,
"beta_dpo/gap_std": 15.77180290222168,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.48828420256991684,
"grad_norm": 33.23453903198242,
"learning_rate": 3.0507763319663517e-07,
"logits/chosen": 1.370314121246338,
"logits/rejected": 1.3247946500778198,
"loss": 0.969,
"step": 323
},
{
"beta_dpo/beta_used": 0.1240943893790245,
"beta_dpo/beta_used_raw": 0.1240943893790245,
"beta_dpo/gap_mean": 9.192289352416992,
"beta_dpo/gap_std": 16.003286361694336,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.4897959183673469,
"grad_norm": 24.16273307800293,
"learning_rate": 3.0378713696502097e-07,
"logits/chosen": 1.34813392162323,
"logits/rejected": 1.3388067483901978,
"loss": 0.9585,
"step": 324
},
{
"beta_dpo/beta_used": 0.18239615857601166,
"beta_dpo/beta_used_raw": 0.17694588005542755,
"beta_dpo/gap_mean": 9.277650833129883,
"beta_dpo/gap_std": 15.912029266357422,
"beta_dpo/mask_keep_frac": 1.0,
"epoch": 0.491307634164777,
"grad_norm": 62.159393310546875,
"learning_rate": 3.0249513619156206e-07,
"logits/chosen": 1.887860655784607,
"logits/rejected": 1.854949951171875,
"loss": 1.0449,
"step": 325
},
{
"beta_dpo/beta_used": 0.05071749910712242,
"beta_dpo/beta_used_raw": -0.0035161487758159637,
"beta_dpo/gap_mean": 8.763511657714844,
"beta_dpo/gap_std": 15.70359992980957,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.4928193499622071,
"grad_norm": 21.598426818847656,
"learning_rate": 3.012016670162977e-07,
"logits/chosen": 1.5172902345657349,
"logits/rejected": 1.4291995763778687,
"loss": 1.2608,
"step": 326
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.13349460065364838,
"beta_dpo/gap_mean": 8.288639068603516,
"beta_dpo/gap_std": 15.90619945526123,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.4943310657596372,
"grad_norm": 0.3719576895236969,
"learning_rate": 2.99906765620341e-07,
"logits/chosen": 0.9552278518676758,
"logits/rejected": 0.8223298788070679,
"loss": 1.3816,
"step": 327
},
{
"beta_dpo/beta_used": 0.03803815692663193,
"beta_dpo/beta_used_raw": -0.006140265613794327,
"beta_dpo/gap_mean": 8.157548904418945,
"beta_dpo/gap_std": 15.70623779296875,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.4958427815570673,
"grad_norm": 14.157182693481445,
"learning_rate": 2.9861046822486766e-07,
"logits/chosen": 1.2303402423858643,
"logits/rejected": 1.089834451675415,
"loss": 1.2513,
"step": 328
},
{
"beta_dpo/beta_used": 0.030700990930199623,
"beta_dpo/beta_used_raw": -0.0006328783929347992,
"beta_dpo/gap_mean": 8.243142127990723,
"beta_dpo/gap_std": 15.664817810058594,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.4973544973544973,
"grad_norm": 11.445138931274414,
"learning_rate": 2.9731281109010253e-07,
"logits/chosen": 1.5622519254684448,
"logits/rejected": 1.2833863496780396,
"loss": 1.2369,
"step": 329
},
{
"beta_dpo/beta_used": 0.1286730319261551,
"beta_dpo/beta_used_raw": 0.10966426879167557,
"beta_dpo/gap_mean": 8.6826171875,
"beta_dpo/gap_std": 16.049409866333008,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.4988662131519274,
"grad_norm": 29.91504669189453,
"learning_rate": 2.9601383051430505e-07,
"logits/chosen": 1.5348610877990723,
"logits/rejected": 1.484304666519165,
"loss": 0.9868,
"step": 330
},
{
"beta_dpo/beta_used": 0.12080587446689606,
"beta_dpo/beta_used_raw": 0.12080587446689606,
"beta_dpo/gap_mean": 9.420382499694824,
"beta_dpo/gap_std": 16.184450149536133,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5003779289493575,
"grad_norm": 27.50879669189453,
"learning_rate": 2.947135628327544e-07,
"logits/chosen": 1.2042312622070312,
"logits/rejected": 0.9803166389465332,
"loss": 0.8804,
"step": 331
},
{
"beta_dpo/beta_used": 0.02131238579750061,
"beta_dpo/beta_used_raw": -0.05307789891958237,
"beta_dpo/gap_mean": 9.491806030273438,
"beta_dpo/gap_std": 16.1578369140625,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.5018896447467877,
"grad_norm": 8.015897750854492,
"learning_rate": 2.934120444167326e-07,
"logits/chosen": 1.0138837099075317,
"logits/rejected": 0.9243895411491394,
"loss": 1.2392,
"step": 332
},
{
"beta_dpo/beta_used": 0.08504879474639893,
"beta_dpo/beta_used_raw": 0.07206695526838303,
"beta_dpo/gap_mean": 9.9852294921875,
"beta_dpo/gap_std": 15.994035720825195,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.5034013605442177,
"grad_norm": 20.979957580566406,
"learning_rate": 2.921093116725076e-07,
"logits/chosen": 1.406402349472046,
"logits/rejected": 1.2631025314331055,
"loss": 0.9934,
"step": 333
},
{
"beta_dpo/beta_used": 0.07348217070102692,
"beta_dpo/beta_used_raw": 0.04673399776220322,
"beta_dpo/gap_mean": 9.785324096679688,
"beta_dpo/gap_std": 16.334577560424805,
"beta_dpo/mask_keep_frac": 0.5625,
"epoch": 0.5049130763416477,
"grad_norm": 17.75541877746582,
"learning_rate": 2.9080540104031484e-07,
"logits/chosen": 1.5521423816680908,
"logits/rejected": 1.1226956844329834,
"loss": 1.1227,
"step": 334
},
{
"beta_dpo/beta_used": 0.08314938098192215,
"beta_dpo/beta_used_raw": 0.08314938098192215,
"beta_dpo/gap_mean": 9.443593978881836,
"beta_dpo/gap_std": 17.190528869628906,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.5064247921390779,
"grad_norm": 21.105512619018555,
"learning_rate": 2.895003489933375e-07,
"logits/chosen": 1.9064218997955322,
"logits/rejected": 1.5738611221313477,
"loss": 1.169,
"step": 335
},
{
"beta_dpo/beta_used": 0.02810182236135006,
"beta_dpo/beta_used_raw": -0.02267879620194435,
"beta_dpo/gap_mean": 9.678264617919922,
"beta_dpo/gap_std": 17.16312026977539,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5079365079365079,
"grad_norm": 10.10340404510498,
"learning_rate": 2.8819419203668675e-07,
"logits/chosen": 1.4943532943725586,
"logits/rejected": 1.4970781803131104,
"loss": 1.2165,
"step": 336
},
{
"beta_dpo/beta_used": 0.06397874653339386,
"beta_dpo/beta_used_raw": 0.06397874653339386,
"beta_dpo/gap_mean": 9.417184829711914,
"beta_dpo/gap_std": 17.194931030273438,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.509448223733938,
"grad_norm": 17.00838851928711,
"learning_rate": 2.8688696670638053e-07,
"logits/chosen": 1.0973150730133057,
"logits/rejected": 1.0407588481903076,
"loss": 1.1465,
"step": 337
},
{
"beta_dpo/beta_used": 0.007970977574586868,
"beta_dpo/beta_used_raw": -0.026147443801164627,
"beta_dpo/gap_mean": 8.84959602355957,
"beta_dpo/gap_std": 16.847551345825195,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.5109599395313681,
"grad_norm": 3.7325870990753174,
"learning_rate": 2.8557870956832133e-07,
"logits/chosen": 1.1141670942306519,
"logits/rejected": 1.0108463764190674,
"loss": 1.3312,
"step": 338
},
{
"beta_dpo/beta_used": 0.21938511729240417,
"beta_dpo/beta_used_raw": 0.21938511729240417,
"beta_dpo/gap_mean": 9.198416709899902,
"beta_dpo/gap_std": 16.58599090576172,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.5124716553287982,
"grad_norm": 52.04912567138672,
"learning_rate": 2.842694572172736e-07,
"logits/chosen": 1.6343212127685547,
"logits/rejected": 1.2077702283859253,
"loss": 1.0037,
"step": 339
},
{
"beta_dpo/beta_used": 0.01670524850487709,
"beta_dpo/beta_used_raw": 0.01670524850487709,
"beta_dpo/gap_mean": 9.110252380371094,
"beta_dpo/gap_std": 16.77898406982422,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.5139833711262283,
"grad_norm": 5.165824890136719,
"learning_rate": 2.8295924627584004e-07,
"logits/chosen": 1.2082417011260986,
"logits/rejected": 1.0188452005386353,
"loss": 1.2743,
"step": 340
},
{
"beta_dpo/beta_used": 0.3238562047481537,
"beta_dpo/beta_used_raw": 0.24568364024162292,
"beta_dpo/gap_mean": 9.692229270935059,
"beta_dpo/gap_std": 16.962947845458984,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.5154950869236583,
"grad_norm": 52.99232864379883,
"learning_rate": 2.816481133934373e-07,
"logits/chosen": 1.5995168685913086,
"logits/rejected": 1.4972131252288818,
"loss": 0.9498,
"step": 341
},
{
"beta_dpo/beta_used": 0.15345560014247894,
"beta_dpo/beta_used_raw": 0.07748877257108688,
"beta_dpo/gap_mean": 9.49228286743164,
"beta_dpo/gap_std": 16.987442016601562,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5170068027210885,
"grad_norm": 44.00324630737305,
"learning_rate": 2.8033609524527046e-07,
"logits/chosen": 1.4441843032836914,
"logits/rejected": 1.491701364517212,
"loss": 1.3024,
"step": 342
},
{
"beta_dpo/beta_used": 0.08163314312696457,
"beta_dpo/beta_used_raw": -0.1123107373714447,
"beta_dpo/gap_mean": 9.09318733215332,
"beta_dpo/gap_std": 16.66823387145996,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5185185185185185,
"grad_norm": 20.944377899169922,
"learning_rate": 2.7902322853130753e-07,
"logits/chosen": 1.295729637145996,
"logits/rejected": 1.4093396663665771,
"loss": 1.0645,
"step": 343
},
{
"beta_dpo/beta_used": 0.09156259149312973,
"beta_dpo/beta_used_raw": 0.09156259149312973,
"beta_dpo/gap_mean": 9.121692657470703,
"beta_dpo/gap_std": 16.57402229309082,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5200302343159486,
"grad_norm": 24.541534423828125,
"learning_rate": 2.7770954997525274e-07,
"logits/chosen": 1.721388339996338,
"logits/rejected": 1.4250373840332031,
"loss": 1.1417,
"step": 344
},
{
"beta_dpo/beta_used": 0.19141808152198792,
"beta_dpo/beta_used_raw": 0.19141808152198792,
"beta_dpo/gap_mean": 9.139965057373047,
"beta_dpo/gap_std": 16.651588439941406,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.5215419501133787,
"grad_norm": 40.367469787597656,
"learning_rate": 2.7639509632351927e-07,
"logits/chosen": 1.4737789630889893,
"logits/rejected": 1.4216864109039307,
"loss": 0.7941,
"step": 345
},
{
"beta_dpo/beta_used": 0.09768233448266983,
"beta_dpo/beta_used_raw": 0.09768233448266983,
"beta_dpo/gap_mean": 9.070549011230469,
"beta_dpo/gap_std": 16.83832550048828,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5230536659108088,
"grad_norm": 27.266891479492188,
"learning_rate": 2.7507990434420123e-07,
"logits/chosen": 1.2734328508377075,
"logits/rejected": 1.149863839149475,
"loss": 1.2551,
"step": 346
},
{
"beta_dpo/beta_used": 0.02022167667746544,
"beta_dpo/beta_used_raw": -0.12329346686601639,
"beta_dpo/gap_mean": 8.993759155273438,
"beta_dpo/gap_std": 17.17070198059082,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5245653817082389,
"grad_norm": 7.088632583618164,
"learning_rate": 2.737640108260456e-07,
"logits/chosen": 1.9005743265151978,
"logits/rejected": 1.7757856845855713,
"loss": 1.2606,
"step": 347
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.09922761470079422,
"beta_dpo/gap_mean": 9.317571640014648,
"beta_dpo/gap_std": 17.419286727905273,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.5260770975056689,
"grad_norm": 0.33556678891181946,
"learning_rate": 2.724474525774229e-07,
"logits/chosen": 1.6129817962646484,
"logits/rejected": 1.5542514324188232,
"loss": 1.3806,
"step": 348
},
{
"beta_dpo/beta_used": 0.17093956470489502,
"beta_dpo/beta_used_raw": 0.17093956470489502,
"beta_dpo/gap_mean": 9.49870777130127,
"beta_dpo/gap_std": 17.64126205444336,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.527588813303099,
"grad_norm": 42.406131744384766,
"learning_rate": 2.711302664252973e-07,
"logits/chosen": 1.3581299781799316,
"logits/rejected": 1.0760269165039062,
"loss": 1.1208,
"step": 349
},
{
"beta_dpo/beta_used": 0.16534699499607086,
"beta_dpo/beta_used_raw": 0.16534699499607086,
"beta_dpo/gap_mean": 10.416524887084961,
"beta_dpo/gap_std": 17.57219886779785,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5291005291005291,
"grad_norm": 27.851696014404297,
"learning_rate": 2.698124892141971e-07,
"logits/chosen": 1.536478042602539,
"logits/rejected": 1.4709566831588745,
"loss": 0.8406,
"step": 350
},
{
"beta_dpo/beta_used": 0.19340933859348297,
"beta_dpo/beta_used_raw": 0.19340933859348297,
"beta_dpo/gap_mean": 10.431373596191406,
"beta_dpo/gap_std": 17.373821258544922,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.5306122448979592,
"grad_norm": 49.59145736694336,
"learning_rate": 2.6849415780518357e-07,
"logits/chosen": 1.3625681400299072,
"logits/rejected": 0.9931057691574097,
"loss": 0.7915,
"step": 351
},
{
"beta_dpo/beta_used": 0.06435239315032959,
"beta_dpo/beta_used_raw": 0.02895892783999443,
"beta_dpo/gap_mean": 10.19567584991455,
"beta_dpo/gap_std": 17.5516357421875,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.5321239606953893,
"grad_norm": 24.568038940429688,
"learning_rate": 2.6717530907482024e-07,
"logits/chosen": 1.1959900856018066,
"logits/rejected": 1.2047438621520996,
"loss": 1.2503,
"step": 352
},
{
"beta_dpo/beta_used": 0.20792317390441895,
"beta_dpo/beta_used_raw": 0.20792317390441895,
"beta_dpo/gap_mean": 9.921865463256836,
"beta_dpo/gap_std": 17.66985321044922,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.5336356764928194,
"grad_norm": 59.17499923706055,
"learning_rate": 2.658559799141411e-07,
"logits/chosen": 1.4057915210723877,
"logits/rejected": 1.0512161254882812,
"loss": 0.9536,
"step": 353
},
{
"beta_dpo/beta_used": 0.11257414519786835,
"beta_dpo/beta_used_raw": 0.11257414519786835,
"beta_dpo/gap_mean": 10.141077041625977,
"beta_dpo/gap_std": 17.43115997314453,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5351473922902494,
"grad_norm": 26.042173385620117,
"learning_rate": 2.6453620722761895e-07,
"logits/chosen": 1.1837090253829956,
"logits/rejected": 1.1246318817138672,
"loss": 0.9353,
"step": 354
},
{
"beta_dpo/beta_used": 0.05702915042638779,
"beta_dpo/beta_used_raw": 0.05702915042638779,
"beta_dpo/gap_mean": 10.287775039672852,
"beta_dpo/gap_std": 17.543479919433594,
"beta_dpo/mask_keep_frac": 1.0,
"epoch": 0.5366591080876795,
"grad_norm": 12.683091163635254,
"learning_rate": 2.632160279321328e-07,
"logits/chosen": 1.833913803100586,
"logits/rejected": 1.4571876525878906,
"loss": 1.0623,
"step": 355
},
{
"beta_dpo/beta_used": 0.1175190806388855,
"beta_dpo/beta_used_raw": 0.1175190806388855,
"beta_dpo/gap_mean": 10.312297821044922,
"beta_dpo/gap_std": 17.510921478271484,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5381708238851096,
"grad_norm": 24.64614486694336,
"learning_rate": 2.618954789559356e-07,
"logits/chosen": 1.7809038162231445,
"logits/rejected": 1.408195972442627,
"loss": 0.9348,
"step": 356
},
{
"beta_dpo/beta_used": 0.027612989768385887,
"beta_dpo/beta_used_raw": -0.05110103636980057,
"beta_dpo/gap_mean": 10.247565269470215,
"beta_dpo/gap_std": 17.473857879638672,
"beta_dpo/mask_keep_frac": 0.5625,
"epoch": 0.5396825396825397,
"grad_norm": 10.66695499420166,
"learning_rate": 2.6057459723762076e-07,
"logits/chosen": 1.309868335723877,
"logits/rejected": 0.9117208123207092,
"loss": 1.2033,
"step": 357
},
{
"beta_dpo/beta_used": 0.10322414338588715,
"beta_dpo/beta_used_raw": 0.10322414338588715,
"beta_dpo/gap_mean": 10.111295700073242,
"beta_dpo/gap_std": 17.269624710083008,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.5411942554799698,
"grad_norm": 29.352130889892578,
"learning_rate": 2.5925341972508954e-07,
"logits/chosen": 0.8893525004386902,
"logits/rejected": 0.9518415927886963,
"loss": 0.9401,
"step": 358
},
{
"beta_dpo/beta_used": 0.011187026277184486,
"beta_dpo/beta_used_raw": -0.0717354491353035,
"beta_dpo/gap_mean": 9.60980224609375,
"beta_dpo/gap_std": 16.95635986328125,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5427059712773998,
"grad_norm": 6.215968608856201,
"learning_rate": 2.579319833745169e-07,
"logits/chosen": 1.6677427291870117,
"logits/rejected": 1.8107473850250244,
"loss": 1.3071,
"step": 359
},
{
"beta_dpo/beta_used": 0.052680958062410355,
"beta_dpo/beta_used_raw": -0.013556074351072311,
"beta_dpo/gap_mean": 9.709366798400879,
"beta_dpo/gap_std": 16.987445831298828,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.54421768707483,
"grad_norm": 16.22953987121582,
"learning_rate": 2.5661032514931834e-07,
"logits/chosen": 1.0773403644561768,
"logits/rejected": 0.6769781708717346,
"loss": 1.1578,
"step": 360
},
{
"beta_dpo/beta_used": 0.08674684911966324,
"beta_dpo/beta_used_raw": 0.0780373364686966,
"beta_dpo/gap_mean": 9.90941333770752,
"beta_dpo/gap_std": 17.054189682006836,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.54572940287226,
"grad_norm": 20.603309631347656,
"learning_rate": 2.552884820191154e-07,
"logits/chosen": 1.3290486335754395,
"logits/rejected": 1.2161469459533691,
"loss": 1.0395,
"step": 361
},
{
"beta_dpo/beta_used": 0.13358384370803833,
"beta_dpo/beta_used_raw": 0.09427288174629211,
"beta_dpo/gap_mean": 9.608295440673828,
"beta_dpo/gap_std": 16.78176498413086,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.54724111866969,
"grad_norm": 25.390155792236328,
"learning_rate": 2.53966490958702e-07,
"logits/chosen": 1.6379358768463135,
"logits/rejected": 1.2990127801895142,
"loss": 1.0946,
"step": 362
},
{
"beta_dpo/beta_used": 0.12207494676113129,
"beta_dpo/beta_used_raw": 0.12207494676113129,
"beta_dpo/gap_mean": 10.08150577545166,
"beta_dpo/gap_std": 16.658737182617188,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5487528344671202,
"grad_norm": 32.736759185791016,
"learning_rate": 2.526443889470099e-07,
"logits/chosen": 1.4099113941192627,
"logits/rejected": 0.814749002456665,
"loss": 0.843,
"step": 363
},
{
"beta_dpo/beta_used": 0.13381703197956085,
"beta_dpo/beta_used_raw": 0.13381703197956085,
"beta_dpo/gap_mean": 10.593009948730469,
"beta_dpo/gap_std": 17.01114845275879,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.5502645502645502,
"grad_norm": 44.58602523803711,
"learning_rate": 2.513222129660744e-07,
"logits/chosen": 1.493070125579834,
"logits/rejected": 1.1450066566467285,
"loss": 1.0415,
"step": 364
},
{
"beta_dpo/beta_used": 0.04936742037534714,
"beta_dpo/beta_used_raw": 0.04936742037534714,
"beta_dpo/gap_mean": 10.426152229309082,
"beta_dpo/gap_std": 16.404680252075195,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5517762660619804,
"grad_norm": 12.130597114562988,
"learning_rate": 2.5e-07,
"logits/chosen": 1.6267120838165283,
"logits/rejected": 1.6452577114105225,
"loss": 1.0652,
"step": 365
},
{
"beta_dpo/beta_used": 0.07397685199975967,
"beta_dpo/beta_used_raw": 0.07397685199975967,
"beta_dpo/gap_mean": 10.14107894897461,
"beta_dpo/gap_std": 16.657485961914062,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5532879818594104,
"grad_norm": 18.45604133605957,
"learning_rate": 2.486777870339255e-07,
"logits/chosen": 1.6944191455841064,
"logits/rejected": 1.6772571802139282,
"loss": 1.0363,
"step": 366
},
{
"beta_dpo/beta_used": 0.07414257526397705,
"beta_dpo/beta_used_raw": 0.0723227709531784,
"beta_dpo/gap_mean": 9.737115859985352,
"beta_dpo/gap_std": 16.248645782470703,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.5547996976568406,
"grad_norm": 17.768054962158203,
"learning_rate": 2.4735561105299014e-07,
"logits/chosen": 1.6562645435333252,
"logits/rejected": 1.3367321491241455,
"loss": 1.044,
"step": 367
},
{
"beta_dpo/beta_used": 0.12580986320972443,
"beta_dpo/beta_used_raw": 0.1125003844499588,
"beta_dpo/gap_mean": 9.598726272583008,
"beta_dpo/gap_std": 16.235118865966797,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5563114134542706,
"grad_norm": 30.600473403930664,
"learning_rate": 2.46033509041298e-07,
"logits/chosen": 1.194272518157959,
"logits/rejected": 1.42368483543396,
"loss": 0.9831,
"step": 368
},
{
"beta_dpo/beta_used": 0.022077616304159164,
"beta_dpo/beta_used_raw": 0.022077616304159164,
"beta_dpo/gap_mean": 9.324756622314453,
"beta_dpo/gap_std": 16.485767364501953,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5578231292517006,
"grad_norm": 7.461435794830322,
"learning_rate": 2.447115179808846e-07,
"logits/chosen": 1.2820395231246948,
"logits/rejected": 0.9497278332710266,
"loss": 1.2487,
"step": 369
},
{
"beta_dpo/beta_used": 0.23217812180519104,
"beta_dpo/beta_used_raw": 0.23217812180519104,
"beta_dpo/gap_mean": 9.887712478637695,
"beta_dpo/gap_std": 16.934602737426758,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5593348450491308,
"grad_norm": 57.79582977294922,
"learning_rate": 2.4338967485068164e-07,
"logits/chosen": 1.8189184665679932,
"logits/rejected": 1.7541735172271729,
"loss": 0.9118,
"step": 370
},
{
"beta_dpo/beta_used": 0.11088813841342926,
"beta_dpo/beta_used_raw": 0.09978704899549484,
"beta_dpo/gap_mean": 9.728986740112305,
"beta_dpo/gap_std": 17.203359603881836,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5608465608465608,
"grad_norm": 31.766374588012695,
"learning_rate": 2.420680166254831e-07,
"logits/chosen": 2.12520432472229,
"logits/rejected": 2.003981113433838,
"loss": 1.1124,
"step": 371
},
{
"beta_dpo/beta_used": 0.12042045593261719,
"beta_dpo/beta_used_raw": 0.08532939851284027,
"beta_dpo/gap_mean": 9.579994201660156,
"beta_dpo/gap_std": 17.36013412475586,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.562358276643991,
"grad_norm": 43.5960578918457,
"learning_rate": 2.4074658027491044e-07,
"logits/chosen": 1.1307945251464844,
"logits/rejected": 0.8450255990028381,
"loss": 1.3893,
"step": 372
},
{
"beta_dpo/beta_used": 0.015285984613001347,
"beta_dpo/beta_used_raw": 0.011251095682382584,
"beta_dpo/gap_mean": 9.891489028930664,
"beta_dpo/gap_std": 17.593887329101562,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.563869992441421,
"grad_norm": 6.121160507202148,
"learning_rate": 2.394254027623792e-07,
"logits/chosen": 1.755456805229187,
"logits/rejected": 1.4722647666931152,
"loss": 1.2721,
"step": 373
},
{
"beta_dpo/beta_used": 0.3823484182357788,
"beta_dpo/beta_used_raw": 0.3823484182357788,
"beta_dpo/gap_mean": 10.362098693847656,
"beta_dpo/gap_std": 17.728212356567383,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.5653817082388511,
"grad_norm": 103.98133087158203,
"learning_rate": 2.381045210440644e-07,
"logits/chosen": 1.1939418315887451,
"logits/rejected": 0.8221108913421631,
"loss": 1.0492,
"step": 374
},
{
"beta_dpo/beta_used": 0.10127855837345123,
"beta_dpo/beta_used_raw": 0.10127855837345123,
"beta_dpo/gap_mean": 10.388755798339844,
"beta_dpo/gap_std": 17.460376739501953,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.5668934240362812,
"grad_norm": 22.008392333984375,
"learning_rate": 2.3678397206786715e-07,
"logits/chosen": 1.6911749839782715,
"logits/rejected": 1.3473389148712158,
"loss": 0.8548,
"step": 375
},
{
"beta_dpo/beta_used": 0.07813645899295807,
"beta_dpo/beta_used_raw": 0.07813645899295807,
"beta_dpo/gap_mean": 10.727385520935059,
"beta_dpo/gap_std": 17.87335205078125,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5684051398337112,
"grad_norm": 21.66082000732422,
"learning_rate": 2.3546379277238103e-07,
"logits/chosen": 0.9075419902801514,
"logits/rejected": 1.0352015495300293,
"loss": 1.15,
"step": 376
},
{
"beta_dpo/beta_used": 0.043544746935367584,
"beta_dpo/beta_used_raw": 0.043544746935367584,
"beta_dpo/gap_mean": 10.423905372619629,
"beta_dpo/gap_std": 17.556154251098633,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5699168556311414,
"grad_norm": 10.022980690002441,
"learning_rate": 2.3414402008585886e-07,
"logits/chosen": 1.7869625091552734,
"logits/rejected": 1.7410027980804443,
"loss": 1.1213,
"step": 377
},
{
"beta_dpo/beta_used": 0.028150945901870728,
"beta_dpo/beta_used_raw": 0.023979444056749344,
"beta_dpo/gap_mean": 9.919803619384766,
"beta_dpo/gap_std": 17.070709228515625,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.5714285714285714,
"grad_norm": 9.40400505065918,
"learning_rate": 2.3282469092517977e-07,
"logits/chosen": 1.5019217729568481,
"logits/rejected": 1.3452924489974976,
"loss": 1.2086,
"step": 378
},
{
"beta_dpo/beta_used": 0.29144594073295593,
"beta_dpo/beta_used_raw": 0.29144594073295593,
"beta_dpo/gap_mean": 10.127754211425781,
"beta_dpo/gap_std": 17.333236694335938,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5729402872260015,
"grad_norm": 100.93724060058594,
"learning_rate": 2.3150584219481643e-07,
"logits/chosen": 1.4389901161193848,
"logits/rejected": 1.281882882118225,
"loss": 0.9985,
"step": 379
},
{
"beta_dpo/beta_used": 0.37376725673675537,
"beta_dpo/beta_used_raw": 0.37376725673675537,
"beta_dpo/gap_mean": 10.577753067016602,
"beta_dpo/gap_std": 17.541908264160156,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.5744520030234316,
"grad_norm": 66.6102523803711,
"learning_rate": 2.3018751078580283e-07,
"logits/chosen": 1.1558235883712769,
"logits/rejected": 1.2484815120697021,
"loss": 0.8812,
"step": 380
},
{
"beta_dpo/beta_used": 0.06865327805280685,
"beta_dpo/beta_used_raw": 0.010333731770515442,
"beta_dpo/gap_mean": 10.033830642700195,
"beta_dpo/gap_std": 17.42238998413086,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.5759637188208617,
"grad_norm": 19.617332458496094,
"learning_rate": 2.288697335747027e-07,
"logits/chosen": 1.6343696117401123,
"logits/rejected": 1.4790246486663818,
"loss": 1.1353,
"step": 381
},
{
"beta_dpo/beta_used": 0.06595531105995178,
"beta_dpo/beta_used_raw": -0.021138787269592285,
"beta_dpo/gap_mean": 9.706841468811035,
"beta_dpo/gap_std": 17.24261474609375,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5774754346182918,
"grad_norm": 16.492521286010742,
"learning_rate": 2.2755254742257706e-07,
"logits/chosen": 1.7128905057907104,
"logits/rejected": 1.4657937288284302,
"loss": 1.1008,
"step": 382
},
{
"beta_dpo/beta_used": 0.19009645283222198,
"beta_dpo/beta_used_raw": 0.19009645283222198,
"beta_dpo/gap_mean": 9.920913696289062,
"beta_dpo/gap_std": 17.623497009277344,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.5789871504157218,
"grad_norm": 43.94319534301758,
"learning_rate": 2.2623598917395436e-07,
"logits/chosen": 1.3148654699325562,
"logits/rejected": 1.2381043434143066,
"loss": 0.926,
"step": 383
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.06760846078395844,
"beta_dpo/gap_mean": 10.044574737548828,
"beta_dpo/gap_std": 17.557830810546875,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5804988662131519,
"grad_norm": 0.30568212270736694,
"learning_rate": 2.2492009565579875e-07,
"logits/chosen": 1.3191230297088623,
"logits/rejected": 1.470552921295166,
"loss": 1.379,
"step": 384
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.08156859129667282,
"beta_dpo/gap_mean": 10.185813903808594,
"beta_dpo/gap_std": 17.263328552246094,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.582010582010582,
"grad_norm": 0.2859506607055664,
"learning_rate": 2.2360490367648084e-07,
"logits/chosen": 1.271431565284729,
"logits/rejected": 1.160420298576355,
"loss": 1.3792,
"step": 385
},
{
"beta_dpo/beta_used": 0.0168894175440073,
"beta_dpo/beta_used_raw": 0.012856299057602882,
"beta_dpo/gap_mean": 9.815888404846191,
"beta_dpo/gap_std": 17.33496856689453,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5835222978080121,
"grad_norm": 6.143118858337402,
"learning_rate": 2.2229045002474724e-07,
"logits/chosen": 1.4685890674591064,
"logits/rejected": 1.161041259765625,
"loss": 1.2644,
"step": 386
},
{
"beta_dpo/beta_used": 0.03634551912546158,
"beta_dpo/beta_used_raw": -0.024081122130155563,
"beta_dpo/gap_mean": 9.894031524658203,
"beta_dpo/gap_std": 17.30755615234375,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.5850340136054422,
"grad_norm": 15.074189186096191,
"learning_rate": 2.209767714686924e-07,
"logits/chosen": 1.6589672565460205,
"logits/rejected": 1.4474884271621704,
"loss": 1.2562,
"step": 387
},
{
"beta_dpo/beta_used": 0.16004110872745514,
"beta_dpo/beta_used_raw": 0.08152688294649124,
"beta_dpo/gap_mean": 9.766632080078125,
"beta_dpo/gap_std": 17.252300262451172,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.5865457294028723,
"grad_norm": 66.86634063720703,
"learning_rate": 2.1966390475472954e-07,
"logits/chosen": 1.8072469234466553,
"logits/rejected": 1.4911160469055176,
"loss": 1.1245,
"step": 388
},
{
"beta_dpo/beta_used": 0.09485035389661789,
"beta_dpo/beta_used_raw": 0.050514545291662216,
"beta_dpo/gap_mean": 9.639822006225586,
"beta_dpo/gap_std": 16.98550796508789,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5880574452003023,
"grad_norm": 32.758907318115234,
"learning_rate": 2.1835188660656265e-07,
"logits/chosen": 1.597560167312622,
"logits/rejected": 1.378977656364441,
"loss": 1.3376,
"step": 389
},
{
"beta_dpo/beta_used": 0.02094288542866707,
"beta_dpo/beta_used_raw": -0.003135114908218384,
"beta_dpo/gap_mean": 9.789543151855469,
"beta_dpo/gap_std": 16.734346389770508,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.5895691609977324,
"grad_norm": 7.311458587646484,
"learning_rate": 2.170407537241599e-07,
"logits/chosen": 1.0474040508270264,
"logits/rejected": 0.9567930698394775,
"loss": 1.2509,
"step": 390
},
{
"beta_dpo/beta_used": 0.20918205380439758,
"beta_dpo/beta_used_raw": 0.20918205380439758,
"beta_dpo/gap_mean": 10.458446502685547,
"beta_dpo/gap_std": 17.252222061157227,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.5910808767951625,
"grad_norm": 55.30534362792969,
"learning_rate": 2.1573054278272636e-07,
"logits/chosen": 1.5581355094909668,
"logits/rejected": 1.4264538288116455,
"loss": 1.0945,
"step": 391
},
{
"beta_dpo/beta_used": 0.16335958242416382,
"beta_dpo/beta_used_raw": 0.16335958242416382,
"beta_dpo/gap_mean": 11.235108375549316,
"beta_dpo/gap_std": 17.644351959228516,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.5925925925925926,
"grad_norm": 34.14118576049805,
"learning_rate": 2.1442129043167873e-07,
"logits/chosen": 1.8860807418823242,
"logits/rejected": 1.8984272480010986,
"loss": 0.841,
"step": 392
},
{
"beta_dpo/beta_used": 0.006513515952974558,
"beta_dpo/beta_used_raw": -0.04768542945384979,
"beta_dpo/gap_mean": 11.507149696350098,
"beta_dpo/gap_std": 17.389968872070312,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.5941043083900227,
"grad_norm": 2.505648612976074,
"learning_rate": 2.131130332936195e-07,
"logits/chosen": 0.96453857421875,
"logits/rejected": 0.9612942934036255,
"loss": 1.3245,
"step": 393
},
{
"beta_dpo/beta_used": 0.09355347603559494,
"beta_dpo/beta_used_raw": 0.07451394945383072,
"beta_dpo/gap_mean": 11.174118041992188,
"beta_dpo/gap_std": 16.89433479309082,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.5956160241874527,
"grad_norm": 21.494396209716797,
"learning_rate": 2.1180580796331323e-07,
"logits/chosen": 1.8636196851730347,
"logits/rejected": 1.477508783340454,
"loss": 0.964,
"step": 394
},
{
"beta_dpo/beta_used": 0.055694933980703354,
"beta_dpo/beta_used_raw": -0.04353347793221474,
"beta_dpo/gap_mean": 10.723880767822266,
"beta_dpo/gap_std": 16.56464385986328,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.5971277399848829,
"grad_norm": 14.470325469970703,
"learning_rate": 2.104996510066625e-07,
"logits/chosen": 1.576015830039978,
"logits/rejected": 1.2373056411743164,
"loss": 1.1004,
"step": 395
},
{
"beta_dpo/beta_used": 0.1036173403263092,
"beta_dpo/beta_used_raw": 0.1036173403263092,
"beta_dpo/gap_mean": 10.984663963317871,
"beta_dpo/gap_std": 16.140155792236328,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.5986394557823129,
"grad_norm": 19.261430740356445,
"learning_rate": 2.0919459895968517e-07,
"logits/chosen": 1.497571587562561,
"logits/rejected": 1.4676814079284668,
"loss": 0.6805,
"step": 396
},
{
"beta_dpo/beta_used": 0.04165812209248543,
"beta_dpo/beta_used_raw": 0.029148761183023453,
"beta_dpo/gap_mean": 10.157581329345703,
"beta_dpo/gap_std": 15.98454475402832,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.600151171579743,
"grad_norm": 14.16849136352539,
"learning_rate": 2.078906883274924e-07,
"logits/chosen": 1.41060209274292,
"logits/rejected": 1.3652551174163818,
"loss": 1.1823,
"step": 397
},
{
"beta_dpo/beta_used": 0.07533486187458038,
"beta_dpo/beta_used_raw": 0.07533486187458038,
"beta_dpo/gap_mean": 10.367633819580078,
"beta_dpo/gap_std": 16.406509399414062,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6016628873771731,
"grad_norm": 20.259742736816406,
"learning_rate": 2.065879555832674e-07,
"logits/chosen": 1.3536689281463623,
"logits/rejected": 0.9888167381286621,
"loss": 1.118,
"step": 398
},
{
"beta_dpo/beta_used": 0.042631130665540695,
"beta_dpo/beta_used_raw": -0.033679697662591934,
"beta_dpo/gap_mean": 10.726426124572754,
"beta_dpo/gap_std": 16.663000106811523,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.6031746031746031,
"grad_norm": 14.0131254196167,
"learning_rate": 2.052864371672457e-07,
"logits/chosen": 1.7245910167694092,
"logits/rejected": 1.4539391994476318,
"loss": 1.1571,
"step": 399
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.17764988541603088,
"beta_dpo/gap_mean": 10.223685264587402,
"beta_dpo/gap_std": 16.621475219726562,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.6046863189720333,
"grad_norm": 0.3649911880493164,
"learning_rate": 2.0398616948569493e-07,
"logits/chosen": 1.7496578693389893,
"logits/rejected": 1.7105956077575684,
"loss": 1.3806,
"step": 400
},
{
"epoch": 0.6046863189720333,
"eval_beta_dpo/beta_used": 0.1405337005853653,
"eval_beta_dpo/beta_used_raw": 0.1188855767250061,
"eval_beta_dpo/gap_mean": 9.9655179977417,
"eval_beta_dpo/gap_std": 16.57029914855957,
"eval_beta_dpo/mask_keep_frac": 1.0,
"eval_logits/chosen": 1.4474022388458252,
"eval_logits/rejected": 1.3215140104293823,
"eval_loss": 0.6464195251464844,
"eval_runtime": 42.5646,
"eval_samples_per_second": 54.106,
"eval_steps_per_second": 1.692,
"step": 400
},
{
"beta_dpo/beta_used": 0.17774954438209534,
"beta_dpo/beta_used_raw": 0.17774954438209534,
"beta_dpo/gap_mean": 10.540786743164062,
"beta_dpo/gap_std": 16.417646408081055,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.6061980347694633,
"grad_norm": 23.48154640197754,
"learning_rate": 2.0268718890989752e-07,
"logits/chosen": 0.7174030542373657,
"logits/rejected": 0.683144211769104,
"loss": 0.6649,
"step": 401
},
{
"beta_dpo/beta_used": 0.14793431758880615,
"beta_dpo/beta_used_raw": 0.14793431758880615,
"beta_dpo/gap_mean": 10.927159309387207,
"beta_dpo/gap_std": 16.666133880615234,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.6077097505668935,
"grad_norm": 36.50631332397461,
"learning_rate": 2.013895317751323e-07,
"logits/chosen": 1.9912834167480469,
"logits/rejected": 1.5349533557891846,
"loss": 0.8941,
"step": 402
},
{
"beta_dpo/beta_used": 0.07585098594427109,
"beta_dpo/beta_used_raw": -0.03452427685260773,
"beta_dpo/gap_mean": 11.218865394592285,
"beta_dpo/gap_std": 17.455312728881836,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.6092214663643235,
"grad_norm": 17.68917465209961,
"learning_rate": 2.0009323437965898e-07,
"logits/chosen": 1.6365692615509033,
"logits/rejected": 1.36814284324646,
"loss": 1.0721,
"step": 403
},
{
"beta_dpo/beta_used": 0.16760532557964325,
"beta_dpo/beta_used_raw": 0.16760532557964325,
"beta_dpo/gap_mean": 11.889843940734863,
"beta_dpo/gap_std": 17.469863891601562,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6107331821617535,
"grad_norm": 30.121400833129883,
"learning_rate": 1.9879833298370237e-07,
"logits/chosen": 1.6164308786392212,
"logits/rejected": 1.756433129310608,
"loss": 0.8641,
"step": 404
},
{
"beta_dpo/beta_used": 0.05700894072651863,
"beta_dpo/beta_used_raw": -0.1042499840259552,
"beta_dpo/gap_mean": 11.69076156616211,
"beta_dpo/gap_std": 17.205629348754883,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.6122448979591837,
"grad_norm": 18.09614372253418,
"learning_rate": 1.975048638084379e-07,
"logits/chosen": 1.9099351167678833,
"logits/rejected": 1.6809766292572021,
"loss": 1.0989,
"step": 405
},
{
"beta_dpo/beta_used": 0.2577747702598572,
"beta_dpo/beta_used_raw": 0.2577747702598572,
"beta_dpo/gap_mean": 11.609317779541016,
"beta_dpo/gap_std": 17.342086791992188,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6137566137566137,
"grad_norm": 38.84489059448242,
"learning_rate": 1.9621286303497914e-07,
"logits/chosen": 1.4694623947143555,
"logits/rejected": 0.9219260215759277,
"loss": 0.7517,
"step": 406
},
{
"beta_dpo/beta_used": 0.09797775745391846,
"beta_dpo/beta_used_raw": -0.01702454686164856,
"beta_dpo/gap_mean": 11.227970123291016,
"beta_dpo/gap_std": 17.10640525817871,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6152683295540439,
"grad_norm": 30.631189346313477,
"learning_rate": 1.9492236680336483e-07,
"logits/chosen": 1.7332323789596558,
"logits/rejected": 1.6841402053833008,
"loss": 1.0343,
"step": 407
},
{
"beta_dpo/beta_used": 0.020529722794890404,
"beta_dpo/beta_used_raw": -0.0022693034261465073,
"beta_dpo/gap_mean": 11.435039520263672,
"beta_dpo/gap_std": 16.896324157714844,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6167800453514739,
"grad_norm": 7.630633354187012,
"learning_rate": 1.9363341121154895e-07,
"logits/chosen": 1.5824682712554932,
"logits/rejected": 1.3653960227966309,
"loss": 1.2268,
"step": 408
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.31050288677215576,
"beta_dpo/gap_mean": 11.112115859985352,
"beta_dpo/gap_std": 17.006160736083984,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.618291761148904,
"grad_norm": 0.287369966506958,
"learning_rate": 1.9234603231438994e-07,
"logits/chosen": 1.9061976671218872,
"logits/rejected": 2.0974600315093994,
"loss": 1.3822,
"step": 409
},
{
"beta_dpo/beta_used": 0.07312033325433731,
"beta_dpo/beta_used_raw": 0.04204032942652702,
"beta_dpo/gap_mean": 11.199286460876465,
"beta_dpo/gap_std": 16.997974395751953,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6198034769463341,
"grad_norm": 22.923328399658203,
"learning_rate": 1.9106026612264315e-07,
"logits/chosen": 1.3687880039215088,
"logits/rejected": 1.413557529449463,
"loss": 1.08,
"step": 410
},
{
"beta_dpo/beta_used": 0.07877589762210846,
"beta_dpo/beta_used_raw": 0.04541406035423279,
"beta_dpo/gap_mean": 11.319759368896484,
"beta_dpo/gap_std": 17.231704711914062,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6213151927437641,
"grad_norm": 25.793495178222656,
"learning_rate": 1.8977614860195296e-07,
"logits/chosen": 1.318861961364746,
"logits/rejected": 1.349341869354248,
"loss": 1.1873,
"step": 411
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.09030976891517639,
"beta_dpo/gap_mean": 11.505082130432129,
"beta_dpo/gap_std": 17.38372802734375,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6228269085411943,
"grad_norm": 0.42393141984939575,
"learning_rate": 1.8849371567184662e-07,
"logits/chosen": 1.7599655389785767,
"logits/rejected": 1.9180841445922852,
"loss": 1.378,
"step": 412
},
{
"beta_dpo/beta_used": 0.05450423061847687,
"beta_dpo/beta_used_raw": -0.029762066900730133,
"beta_dpo/gap_mean": 11.014404296875,
"beta_dpo/gap_std": 17.597949981689453,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6243386243386243,
"grad_norm": 19.526386260986328,
"learning_rate": 1.872130032047302e-07,
"logits/chosen": 0.9658557772636414,
"logits/rejected": 0.658934473991394,
"loss": 1.1757,
"step": 413
},
{
"beta_dpo/beta_used": 0.0588601678609848,
"beta_dpo/beta_used_raw": 0.021088402718305588,
"beta_dpo/gap_mean": 11.073143005371094,
"beta_dpo/gap_std": 17.767539978027344,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.6258503401360545,
"grad_norm": 17.184946060180664,
"learning_rate": 1.8593404702488436e-07,
"logits/chosen": 1.2476868629455566,
"logits/rejected": 1.127249002456665,
"loss": 1.1053,
"step": 414
},
{
"beta_dpo/beta_used": 0.11692694574594498,
"beta_dpo/beta_used_raw": 0.06981240957975388,
"beta_dpo/gap_mean": 11.076013565063477,
"beta_dpo/gap_std": 18.022043228149414,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6273620559334845,
"grad_norm": 56.64924621582031,
"learning_rate": 1.846568829074628e-07,
"logits/chosen": 1.4461565017700195,
"logits/rejected": 1.6651735305786133,
"loss": 1.5016,
"step": 415
},
{
"beta_dpo/beta_used": 0.01583676040172577,
"beta_dpo/beta_used_raw": -0.06813767552375793,
"beta_dpo/gap_mean": 10.323577880859375,
"beta_dpo/gap_std": 18.21762466430664,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6288737717309146,
"grad_norm": 5.505179405212402,
"learning_rate": 1.8338154657749128e-07,
"logits/chosen": 1.4125094413757324,
"logits/rejected": 1.162198781967163,
"loss": 1.2861,
"step": 416
},
{
"beta_dpo/beta_used": 0.15743154287338257,
"beta_dpo/beta_used_raw": 0.11534958332777023,
"beta_dpo/gap_mean": 10.844054222106934,
"beta_dpo/gap_std": 18.56551742553711,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6303854875283447,
"grad_norm": 34.31191635131836,
"learning_rate": 1.8210807370886849e-07,
"logits/chosen": 1.694962501525879,
"logits/rejected": 1.656688928604126,
"loss": 1.2438,
"step": 417
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.11418265104293823,
"beta_dpo/gap_mean": 10.605875015258789,
"beta_dpo/gap_std": 18.41242790222168,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.6318972033257747,
"grad_norm": 0.30966585874557495,
"learning_rate": 1.8083649992336825e-07,
"logits/chosen": 2.298833131790161,
"logits/rejected": 2.1432628631591797,
"loss": 1.3801,
"step": 418
},
{
"beta_dpo/beta_used": 0.22758902609348297,
"beta_dpo/beta_used_raw": 0.22758902609348297,
"beta_dpo/gap_mean": 11.043691635131836,
"beta_dpo/gap_std": 18.33002281188965,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.6334089191232048,
"grad_norm": 28.69251823425293,
"learning_rate": 1.7956686078964255e-07,
"logits/chosen": 1.6633756160736084,
"logits/rejected": 1.2387137413024902,
"loss": 0.7578,
"step": 419
},
{
"beta_dpo/beta_used": 0.038611479103565216,
"beta_dpo/beta_used_raw": -0.06987833231687546,
"beta_dpo/gap_mean": 10.993532180786133,
"beta_dpo/gap_std": 18.756433486938477,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.6349206349206349,
"grad_norm": 11.58281421661377,
"learning_rate": 1.782991918222275e-07,
"logits/chosen": 1.3712520599365234,
"logits/rejected": 1.3558213710784912,
"loss": 1.2225,
"step": 420
},
{
"beta_dpo/beta_used": 0.2914969325065613,
"beta_dpo/beta_used_raw": 0.2914969325065613,
"beta_dpo/gap_mean": 10.698333740234375,
"beta_dpo/gap_std": 19.29578399658203,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.636432350718065,
"grad_norm": 79.03998565673828,
"learning_rate": 1.7703352848054887e-07,
"logits/chosen": 2.0994133949279785,
"logits/rejected": 1.548837661743164,
"loss": 1.6998,
"step": 421
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.10875105112791061,
"beta_dpo/gap_mean": 10.545480728149414,
"beta_dpo/gap_std": 19.398765563964844,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6379440665154951,
"grad_norm": 0.35578790307044983,
"learning_rate": 1.7576990616793137e-07,
"logits/chosen": 1.771589756011963,
"logits/rejected": 1.5337142944335938,
"loss": 1.3794,
"step": 422
},
{
"beta_dpo/beta_used": 0.20271146297454834,
"beta_dpo/beta_used_raw": 0.20271146297454834,
"beta_dpo/gap_mean": 10.905920028686523,
"beta_dpo/gap_std": 19.073719024658203,
"beta_dpo/mask_keep_frac": 0.5625,
"epoch": 0.6394557823129252,
"grad_norm": 37.11328125,
"learning_rate": 1.745083602306071e-07,
"logits/chosen": 1.7254886627197266,
"logits/rejected": 1.634531021118164,
"loss": 0.8132,
"step": 423
},
{
"beta_dpo/beta_used": 0.08102002739906311,
"beta_dpo/beta_used_raw": 0.05924910679459572,
"beta_dpo/gap_mean": 11.04400634765625,
"beta_dpo/gap_std": 18.683273315429688,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.6409674981103552,
"grad_norm": 20.051164627075195,
"learning_rate": 1.7324892595672804e-07,
"logits/chosen": 1.4589695930480957,
"logits/rejected": 1.436366319656372,
"loss": 1.0276,
"step": 424
},
{
"beta_dpo/beta_used": 0.19659112393856049,
"beta_dpo/beta_used_raw": 0.15749159455299377,
"beta_dpo/gap_mean": 11.374787330627441,
"beta_dpo/gap_std": 18.452198028564453,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.6424792139077853,
"grad_norm": 60.02362823486328,
"learning_rate": 1.7199163857537824e-07,
"logits/chosen": 1.6291189193725586,
"logits/rejected": 1.6020748615264893,
"loss": 1.4147,
"step": 425
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.274090051651001,
"beta_dpo/gap_mean": 10.909863471984863,
"beta_dpo/gap_std": 18.784690856933594,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6439909297052154,
"grad_norm": 0.31475549936294556,
"learning_rate": 1.7073653325558828e-07,
"logits/chosen": 1.2150731086730957,
"logits/rejected": 1.204681634902954,
"loss": 1.3823,
"step": 426
},
{
"beta_dpo/beta_used": 0.022039199247956276,
"beta_dpo/beta_used_raw": -0.037459395825862885,
"beta_dpo/gap_mean": 10.606595039367676,
"beta_dpo/gap_std": 18.83213996887207,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.6455026455026455,
"grad_norm": 8.438140869140625,
"learning_rate": 1.6948364510535218e-07,
"logits/chosen": 0.9421446919441223,
"logits/rejected": 0.9893728494644165,
"loss": 1.2392,
"step": 427
},
{
"beta_dpo/beta_used": 0.09935323894023895,
"beta_dpo/beta_used_raw": 0.09935323894023895,
"beta_dpo/gap_mean": 10.691535949707031,
"beta_dpo/gap_std": 18.80581283569336,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6470143613000756,
"grad_norm": 27.139223098754883,
"learning_rate": 1.6823300917064458e-07,
"logits/chosen": 1.3903647661209106,
"logits/rejected": 1.6309527158737183,
"loss": 1.0183,
"step": 428
},
{
"beta_dpo/beta_used": 0.30730801820755005,
"beta_dpo/beta_used_raw": 0.30730801820755005,
"beta_dpo/gap_mean": 10.742720603942871,
"beta_dpo/gap_std": 18.884178161621094,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.6485260770975056,
"grad_norm": 71.05332946777344,
"learning_rate": 1.669846604344412e-07,
"logits/chosen": 1.3571021556854248,
"logits/rejected": 1.5711731910705566,
"loss": 1.1584,
"step": 429
},
{
"beta_dpo/beta_used": 0.19743552803993225,
"beta_dpo/beta_used_raw": 0.19743552803993225,
"beta_dpo/gap_mean": 11.363592147827148,
"beta_dpo/gap_std": 19.35413360595703,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6500377928949358,
"grad_norm": 39.574615478515625,
"learning_rate": 1.6573863381573954e-07,
"logits/chosen": 1.2930231094360352,
"logits/rejected": 1.2674870491027832,
"loss": 0.8395,
"step": 430
},
{
"beta_dpo/beta_used": 0.060200098901987076,
"beta_dpo/beta_used_raw": 0.05172666907310486,
"beta_dpo/gap_mean": 11.444803237915039,
"beta_dpo/gap_std": 19.580089569091797,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6515495086923658,
"grad_norm": 18.198490142822266,
"learning_rate": 1.6449496416858282e-07,
"logits/chosen": 0.9209311604499817,
"logits/rejected": 0.6936602592468262,
"loss": 1.1647,
"step": 431
},
{
"beta_dpo/beta_used": 0.1908065676689148,
"beta_dpo/beta_used_raw": 0.1908065676689148,
"beta_dpo/gap_mean": 11.477436065673828,
"beta_dpo/gap_std": 19.80697250366211,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6530612244897959,
"grad_norm": 57.51634979248047,
"learning_rate": 1.632536862810844e-07,
"logits/chosen": 1.5439039468765259,
"logits/rejected": 1.8144121170043945,
"loss": 0.8417,
"step": 432
},
{
"beta_dpo/beta_used": 0.15694357454776764,
"beta_dpo/beta_used_raw": 0.15694357454776764,
"beta_dpo/gap_mean": 11.754386901855469,
"beta_dpo/gap_std": 20.23770523071289,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.654572940287226,
"grad_norm": 51.7684440612793,
"learning_rate": 1.6201483487445515e-07,
"logits/chosen": 1.8874328136444092,
"logits/rejected": 1.811736822128296,
"loss": 1.1361,
"step": 433
},
{
"beta_dpo/beta_used": 0.17589187622070312,
"beta_dpo/beta_used_raw": 0.17589187622070312,
"beta_dpo/gap_mean": 12.395316123962402,
"beta_dpo/gap_std": 20.479772567749023,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.656084656084656,
"grad_norm": 76.38113403320312,
"learning_rate": 1.6077844460203204e-07,
"logits/chosen": 1.633279800415039,
"logits/rejected": 1.4938979148864746,
"loss": 1.4175,
"step": 434
},
{
"beta_dpo/beta_used": 0.05628956854343414,
"beta_dpo/beta_used_raw": 0.011582344770431519,
"beta_dpo/gap_mean": 11.693523406982422,
"beta_dpo/gap_std": 20.01717758178711,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6575963718820862,
"grad_norm": 19.633525848388672,
"learning_rate": 1.5954455004830878e-07,
"logits/chosen": 1.6384367942810059,
"logits/rejected": 1.6007449626922607,
"loss": 1.1721,
"step": 435
},
{
"beta_dpo/beta_used": 0.11185856908559799,
"beta_dpo/beta_used_raw": 0.11185856908559799,
"beta_dpo/gap_mean": 11.502742767333984,
"beta_dpo/gap_std": 19.742431640625,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6591080876795162,
"grad_norm": 28.773508071899414,
"learning_rate": 1.5831318572796847e-07,
"logits/chosen": 1.2981607913970947,
"logits/rejected": 1.4774749279022217,
"loss": 1.1959,
"step": 436
},
{
"beta_dpo/beta_used": 0.0465235635638237,
"beta_dpo/beta_used_raw": 0.0465235635638237,
"beta_dpo/gap_mean": 11.57576847076416,
"beta_dpo/gap_std": 19.905479431152344,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6606198034769464,
"grad_norm": 11.688758850097656,
"learning_rate": 1.5708438608491815e-07,
"logits/chosen": 1.605779767036438,
"logits/rejected": 1.3303096294403076,
"loss": 1.0708,
"step": 437
},
{
"beta_dpo/beta_used": 0.17932583391666412,
"beta_dpo/beta_used_raw": 0.11518719792366028,
"beta_dpo/gap_mean": 11.271271705627441,
"beta_dpo/gap_std": 19.939411163330078,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6621315192743764,
"grad_norm": 51.175689697265625,
"learning_rate": 1.558581854913253e-07,
"logits/chosen": 1.687558889389038,
"logits/rejected": 1.3356046676635742,
"loss": 1.0644,
"step": 438
},
{
"beta_dpo/beta_used": 0.04887852445244789,
"beta_dpo/beta_used_raw": 0.04887852445244789,
"beta_dpo/gap_mean": 11.624393463134766,
"beta_dpo/gap_std": 19.695316314697266,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.6636432350718064,
"grad_norm": 12.64301872253418,
"learning_rate": 1.5463461824665658e-07,
"logits/chosen": 1.8754761219024658,
"logits/rejected": 1.7036700248718262,
"loss": 1.0783,
"step": 439
},
{
"beta_dpo/beta_used": 0.194175124168396,
"beta_dpo/beta_used_raw": 0.194175124168396,
"beta_dpo/gap_mean": 12.504231452941895,
"beta_dpo/gap_std": 19.355581283569336,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6651549508692366,
"grad_norm": 42.490901947021484,
"learning_rate": 1.534137185767178e-07,
"logits/chosen": 1.1825112104415894,
"logits/rejected": 0.6874880790710449,
"loss": 0.7751,
"step": 440
},
{
"beta_dpo/beta_used": 0.1130920946598053,
"beta_dpo/beta_used_raw": -0.008399426937103271,
"beta_dpo/gap_mean": 12.945587158203125,
"beta_dpo/gap_std": 19.07444953918457,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.6666666666666666,
"grad_norm": 29.434162139892578,
"learning_rate": 1.521955206326976e-07,
"logits/chosen": 1.2224631309509277,
"logits/rejected": 0.8368812799453735,
"loss": 1.2321,
"step": 441
},
{
"beta_dpo/beta_used": 0.12853366136550903,
"beta_dpo/beta_used_raw": -0.03535076975822449,
"beta_dpo/gap_mean": 12.509725570678711,
"beta_dpo/gap_std": 18.713966369628906,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.6681783824640968,
"grad_norm": 40.52631759643555,
"learning_rate": 1.5098005849021078e-07,
"logits/chosen": 1.8482825756072998,
"logits/rejected": 1.715338945388794,
"loss": 1.0151,
"step": 442
},
{
"beta_dpo/beta_used": 0.02449873648583889,
"beta_dpo/beta_used_raw": 0.022881096228957176,
"beta_dpo/gap_mean": 12.855720520019531,
"beta_dpo/gap_std": 19.116792678833008,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.6696900982615268,
"grad_norm": 9.385747909545898,
"learning_rate": 1.4976736614834662e-07,
"logits/chosen": 1.205538272857666,
"logits/rejected": 1.0337432622909546,
"loss": 1.1801,
"step": 443
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.18675100803375244,
"beta_dpo/gap_mean": 12.065677642822266,
"beta_dpo/gap_std": 19.102123260498047,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.671201814058957,
"grad_norm": 0.38620585203170776,
"learning_rate": 1.4855747752871654e-07,
"logits/chosen": 1.5884000062942505,
"logits/rejected": 1.4578423500061035,
"loss": 1.3789,
"step": 444
},
{
"beta_dpo/beta_used": 0.09681466966867447,
"beta_dpo/beta_used_raw": 0.09337137639522552,
"beta_dpo/gap_mean": 12.104242324829102,
"beta_dpo/gap_std": 19.160350799560547,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.672713529856387,
"grad_norm": 29.804115295410156,
"learning_rate": 1.473504264745062e-07,
"logits/chosen": 1.7354657649993896,
"logits/rejected": 1.8415591716766357,
"loss": 1.1427,
"step": 445
},
{
"beta_dpo/beta_used": 0.2682816982269287,
"beta_dpo/beta_used_raw": 0.2269507795572281,
"beta_dpo/gap_mean": 12.445560455322266,
"beta_dpo/gap_std": 18.5366153717041,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.674225245653817,
"grad_norm": 55.44277572631836,
"learning_rate": 1.461462467495284e-07,
"logits/chosen": 1.4882698059082031,
"logits/rejected": 1.455931544303894,
"loss": 1.0786,
"step": 446
},
{
"beta_dpo/beta_used": 0.07030583918094635,
"beta_dpo/beta_used_raw": 0.07030583918094635,
"beta_dpo/gap_mean": 12.943078994750977,
"beta_dpo/gap_std": 18.398422241210938,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.6757369614512472,
"grad_norm": 16.960407257080078,
"learning_rate": 1.4494497203727843e-07,
"logits/chosen": 1.478388786315918,
"logits/rejected": 1.0553447008132935,
"loss": 0.918,
"step": 447
},
{
"beta_dpo/beta_used": 0.11920321732759476,
"beta_dpo/beta_used_raw": 0.11920321732759476,
"beta_dpo/gap_mean": 12.699634552001953,
"beta_dpo/gap_std": 18.55364227294922,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.6772486772486772,
"grad_norm": 20.711158752441406,
"learning_rate": 1.4374663593999256e-07,
"logits/chosen": 1.8860962390899658,
"logits/rejected": 1.671408772468567,
"loss": 0.9085,
"step": 448
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.20916791260242462,
"beta_dpo/gap_mean": 11.895469665527344,
"beta_dpo/gap_std": 18.483585357666016,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6787603930461074,
"grad_norm": 0.3559426963329315,
"learning_rate": 1.4255127197770707e-07,
"logits/chosen": 0.9508600831031799,
"logits/rejected": 0.7793235182762146,
"loss": 1.3797,
"step": 449
},
{
"beta_dpo/beta_used": 0.12843473255634308,
"beta_dpo/beta_used_raw": 0.12843473255634308,
"beta_dpo/gap_mean": 11.255586624145508,
"beta_dpo/gap_std": 18.451894760131836,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.6802721088435374,
"grad_norm": 21.258712768554688,
"learning_rate": 1.4135891358732205e-07,
"logits/chosen": 1.2503652572631836,
"logits/rejected": 0.761991560459137,
"loss": 0.8848,
"step": 450
},
{
"beta_dpo/beta_used": 0.0015382266137748957,
"beta_dpo/beta_used_raw": -0.05390516668558121,
"beta_dpo/gap_mean": 10.862272262573242,
"beta_dpo/gap_std": 18.184444427490234,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.6817838246409675,
"grad_norm": 0.5277694463729858,
"learning_rate": 1.4016959412166437e-07,
"logits/chosen": 1.491100549697876,
"logits/rejected": 1.0314542055130005,
"loss": 1.3738,
"step": 451
},
{
"beta_dpo/beta_used": 0.033183373510837555,
"beta_dpo/beta_used_raw": -0.02967868000268936,
"beta_dpo/gap_mean": 10.793105125427246,
"beta_dpo/gap_std": 18.46420669555664,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6832955404383976,
"grad_norm": 12.034192085266113,
"learning_rate": 1.3898334684855645e-07,
"logits/chosen": 1.0540153980255127,
"logits/rejected": 0.7840179204940796,
"loss": 1.2231,
"step": 452
},
{
"beta_dpo/beta_used": 0.06273314356803894,
"beta_dpo/beta_used_raw": 0.06273314356803894,
"beta_dpo/gap_mean": 10.497642517089844,
"beta_dpo/gap_std": 18.56969451904297,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6848072562358276,
"grad_norm": 16.02577781677246,
"learning_rate": 1.3780020494988445e-07,
"logits/chosen": 1.416991949081421,
"logits/rejected": 1.2110953330993652,
"loss": 1.1344,
"step": 453
},
{
"beta_dpo/beta_used": 0.1319224089384079,
"beta_dpo/beta_used_raw": 0.1319224089384079,
"beta_dpo/gap_mean": 10.963911056518555,
"beta_dpo/gap_std": 19.008258819580078,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6863189720332578,
"grad_norm": 30.299396514892578,
"learning_rate": 1.366202015206706e-07,
"logits/chosen": 1.5142550468444824,
"logits/rejected": 1.539805293083191,
"loss": 1.1613,
"step": 454
},
{
"beta_dpo/beta_used": 0.0735812857747078,
"beta_dpo/beta_used_raw": 0.0735812857747078,
"beta_dpo/gap_mean": 11.458322525024414,
"beta_dpo/gap_std": 18.766376495361328,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.6878306878306878,
"grad_norm": 19.040212631225586,
"learning_rate": 1.354433695681474e-07,
"logits/chosen": 1.2673579454421997,
"logits/rejected": 1.1849486827850342,
"loss": 0.9185,
"step": 455
},
{
"beta_dpo/beta_used": 0.09154469519853592,
"beta_dpo/beta_used_raw": 0.04548892751336098,
"beta_dpo/gap_mean": 11.764167785644531,
"beta_dpo/gap_std": 18.304719924926758,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.6893424036281179,
"grad_norm": 17.924081802368164,
"learning_rate": 1.3426974201083439e-07,
"logits/chosen": 1.1485925912857056,
"logits/rejected": 0.867099404335022,
"loss": 0.9742,
"step": 456
},
{
"beta_dpo/beta_used": 0.002911860356107354,
"beta_dpo/beta_used_raw": -0.021841388195753098,
"beta_dpo/gap_mean": 11.507458686828613,
"beta_dpo/gap_std": 17.799766540527344,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.690854119425548,
"grad_norm": 1.2298791408538818,
"learning_rate": 1.3309935167761717e-07,
"logits/chosen": 1.556707501411438,
"logits/rejected": 1.3103752136230469,
"loss": 1.3574,
"step": 457
},
{
"beta_dpo/beta_used": 0.06613724678754807,
"beta_dpo/beta_used_raw": 0.06613724678754807,
"beta_dpo/gap_mean": 11.640986442565918,
"beta_dpo/gap_std": 17.569747924804688,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.6923658352229781,
"grad_norm": 15.145212173461914,
"learning_rate": 1.3193223130682936e-07,
"logits/chosen": 1.1925835609436035,
"logits/rejected": 0.9567406177520752,
"loss": 0.9984,
"step": 458
},
{
"beta_dpo/beta_used": 0.1462525725364685,
"beta_dpo/beta_used_raw": 0.06357168406248093,
"beta_dpo/gap_mean": 11.50616455078125,
"beta_dpo/gap_std": 17.23088836669922,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.6938775510204082,
"grad_norm": 33.99520492553711,
"learning_rate": 1.3076841354533658e-07,
"logits/chosen": 1.9023911952972412,
"logits/rejected": 1.8868764638900757,
"loss": 0.9019,
"step": 459
},
{
"beta_dpo/beta_used": 0.05953259766101837,
"beta_dpo/beta_used_raw": 0.035434067249298096,
"beta_dpo/gap_mean": 12.219179153442383,
"beta_dpo/gap_std": 17.305801391601562,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.6953892668178382,
"grad_norm": 20.19550895690918,
"learning_rate": 1.2960793094762345e-07,
"logits/chosen": 1.5630019903182983,
"logits/rejected": 1.032307505607605,
"loss": 1.1575,
"step": 460
},
{
"beta_dpo/beta_used": 0.15666146576404572,
"beta_dpo/beta_used_raw": 0.05879899859428406,
"beta_dpo/gap_mean": 12.488428115844727,
"beta_dpo/gap_std": 17.192520141601562,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.6969009826152683,
"grad_norm": 33.026939392089844,
"learning_rate": 1.2845081597488286e-07,
"logits/chosen": 1.8508528470993042,
"logits/rejected": 1.592889428138733,
"loss": 0.8799,
"step": 461
},
{
"beta_dpo/beta_used": 0.25797462463378906,
"beta_dpo/beta_used_raw": 0.25797462463378906,
"beta_dpo/gap_mean": 12.744247436523438,
"beta_dpo/gap_std": 17.581214904785156,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.6984126984126984,
"grad_norm": 68.12858581542969,
"learning_rate": 1.27297100994108e-07,
"logits/chosen": 1.4536336660385132,
"logits/rejected": 1.4048317670822144,
"loss": 0.9012,
"step": 462
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.036650676280260086,
"beta_dpo/gap_mean": 12.505157470703125,
"beta_dpo/gap_std": 17.675983428955078,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.6999244142101285,
"grad_norm": 0.3852461278438568,
"learning_rate": 1.2614681827718695e-07,
"logits/chosen": 1.7947087287902832,
"logits/rejected": 1.8371453285217285,
"loss": 1.3769,
"step": 463
},
{
"beta_dpo/beta_used": 0.10625768452882767,
"beta_dpo/beta_used_raw": 0.10625768452882767,
"beta_dpo/gap_mean": 12.234790802001953,
"beta_dpo/gap_std": 18.48796272277832,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.7014361300075586,
"grad_norm": 24.380142211914062,
"learning_rate": 1.2500000000000005e-07,
"logits/chosen": 1.3279081583023071,
"logits/rejected": 1.2735958099365234,
"loss": 0.9684,
"step": 464
},
{
"beta_dpo/beta_used": 0.0729127824306488,
"beta_dpo/beta_used_raw": 0.0729127824306488,
"beta_dpo/gap_mean": 12.408645629882812,
"beta_dpo/gap_std": 18.752695083618164,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.7029478458049887,
"grad_norm": 20.065946578979492,
"learning_rate": 1.238566782415197e-07,
"logits/chosen": 1.4167413711547852,
"logits/rejected": 1.2189738750457764,
"loss": 1.037,
"step": 465
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.14980760216712952,
"beta_dpo/gap_mean": 11.742490768432617,
"beta_dpo/gap_std": 18.520854949951172,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.7044595616024187,
"grad_norm": 0.4699816405773163,
"learning_rate": 1.2271688498291334e-07,
"logits/chosen": 1.2016394138336182,
"logits/rejected": 1.34425950050354,
"loss": 1.3787,
"step": 466
},
{
"beta_dpo/beta_used": 0.0710422620177269,
"beta_dpo/beta_used_raw": 0.07022541761398315,
"beta_dpo/gap_mean": 11.67038345336914,
"beta_dpo/gap_std": 18.223949432373047,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7059712773998488,
"grad_norm": 20.220792770385742,
"learning_rate": 1.2158065210664848e-07,
"logits/chosen": 0.9855274558067322,
"logits/rejected": 0.5498029589653015,
"loss": 1.1138,
"step": 467
},
{
"beta_dpo/beta_used": 0.13923662900924683,
"beta_dpo/beta_used_raw": 0.037644751369953156,
"beta_dpo/gap_mean": 11.826444625854492,
"beta_dpo/gap_std": 18.259021759033203,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.7074829931972789,
"grad_norm": 93.8460693359375,
"learning_rate": 1.204480113956011e-07,
"logits/chosen": 1.6846306324005127,
"logits/rejected": 1.574007511138916,
"loss": 1.3031,
"step": 468
},
{
"beta_dpo/beta_used": 0.1386784166097641,
"beta_dpo/beta_used_raw": 0.0496968999505043,
"beta_dpo/gap_mean": 11.931732177734375,
"beta_dpo/gap_std": 17.935604095458984,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.708994708994709,
"grad_norm": 50.79655456542969,
"learning_rate": 1.1931899453216697e-07,
"logits/chosen": 1.7242679595947266,
"logits/rejected": 1.4564831256866455,
"loss": 1.1023,
"step": 469
},
{
"beta_dpo/beta_used": 0.12113356590270996,
"beta_dpo/beta_used_raw": 0.10899336636066437,
"beta_dpo/gap_mean": 11.499549865722656,
"beta_dpo/gap_std": 17.669376373291016,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.7105064247921391,
"grad_norm": 30.518417358398438,
"learning_rate": 1.1819363309737438e-07,
"logits/chosen": 1.467283844947815,
"logits/rejected": 1.138906478881836,
"loss": 1.0102,
"step": 470
},
{
"beta_dpo/beta_used": 0.2756751775741577,
"beta_dpo/beta_used_raw": 0.2756751775741577,
"beta_dpo/gap_mean": 11.841851234436035,
"beta_dpo/gap_std": 17.771278381347656,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.7120181405895691,
"grad_norm": 46.136444091796875,
"learning_rate": 1.1707195857000215e-07,
"logits/chosen": 1.4691767692565918,
"logits/rejected": 1.3501659631729126,
"loss": 0.7718,
"step": 471
},
{
"beta_dpo/beta_used": 0.04809433966875076,
"beta_dpo/beta_used_raw": -0.059414975345134735,
"beta_dpo/gap_mean": 12.08486557006836,
"beta_dpo/gap_std": 18.567523956298828,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7135298563869993,
"grad_norm": 19.517423629760742,
"learning_rate": 1.1595400232569768e-07,
"logits/chosen": 1.2809163331985474,
"logits/rejected": 1.3538299798965454,
"loss": 1.215,
"step": 472
},
{
"beta_dpo/beta_used": 0.16872605681419373,
"beta_dpo/beta_used_raw": 0.16872605681419373,
"beta_dpo/gap_mean": 12.269996643066406,
"beta_dpo/gap_std": 19.072513580322266,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7150415721844293,
"grad_norm": 46.318756103515625,
"learning_rate": 1.1483979563610069e-07,
"logits/chosen": 1.4568700790405273,
"logits/rejected": 0.9656409025192261,
"loss": 0.8288,
"step": 473
},
{
"beta_dpo/beta_used": 0.01852474734187126,
"beta_dpo/beta_used_raw": -0.02408505789935589,
"beta_dpo/gap_mean": 11.981610298156738,
"beta_dpo/gap_std": 18.998138427734375,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.7165532879818595,
"grad_norm": 7.194610118865967,
"learning_rate": 1.1372936966796709e-07,
"logits/chosen": 2.111870288848877,
"logits/rejected": 1.9162969589233398,
"loss": 1.234,
"step": 474
},
{
"beta_dpo/beta_used": 0.2989564538002014,
"beta_dpo/beta_used_raw": 0.2989564538002014,
"beta_dpo/gap_mean": 12.381606101989746,
"beta_dpo/gap_std": 18.669628143310547,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.7180650037792895,
"grad_norm": 51.27621841430664,
"learning_rate": 1.126227554822985e-07,
"logits/chosen": 0.9997602701187134,
"logits/rejected": 1.075520396232605,
"loss": 0.5417,
"step": 475
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.08130905032157898,
"beta_dpo/gap_mean": 12.294593811035156,
"beta_dpo/gap_std": 18.475555419921875,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7195767195767195,
"grad_norm": 0.35033127665519714,
"learning_rate": 1.1151998403347243e-07,
"logits/chosen": 1.256063461303711,
"logits/rejected": 1.1085220575332642,
"loss": 1.3772,
"step": 476
},
{
"beta_dpo/beta_used": 0.05871342495083809,
"beta_dpo/beta_used_raw": -0.007116403430700302,
"beta_dpo/gap_mean": 11.625802040100098,
"beta_dpo/gap_std": 18.592151641845703,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7210884353741497,
"grad_norm": 20.247365951538086,
"learning_rate": 1.1042108616837692e-07,
"logits/chosen": 1.5608341693878174,
"logits/rejected": 1.4778110980987549,
"loss": 1.1472,
"step": 477
},
{
"beta_dpo/beta_used": 0.012469051405787468,
"beta_dpo/beta_used_raw": 0.011093566194176674,
"beta_dpo/gap_mean": 11.136640548706055,
"beta_dpo/gap_std": 19.246501922607422,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.7226001511715797,
"grad_norm": 4.416438579559326,
"learning_rate": 1.0932609262554746e-07,
"logits/chosen": 1.4383872747421265,
"logits/rejected": 1.3677079677581787,
"loss": 1.2937,
"step": 478
},
{
"beta_dpo/beta_used": 0.07337600737810135,
"beta_dpo/beta_used_raw": 0.07337600737810135,
"beta_dpo/gap_mean": 10.765281677246094,
"beta_dpo/gap_std": 19.35091209411621,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7241118669690099,
"grad_norm": 19.005462646484375,
"learning_rate": 1.0823503403430734e-07,
"logits/chosen": 0.7993252277374268,
"logits/rejected": 0.21372252702713013,
"loss": 1.0513,
"step": 479
},
{
"beta_dpo/beta_used": 0.2798859179019928,
"beta_dpo/beta_used_raw": 0.2798859179019928,
"beta_dpo/gap_mean": 10.71806526184082,
"beta_dpo/gap_std": 19.444379806518555,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.7256235827664399,
"grad_norm": 71.94794464111328,
"learning_rate": 1.0714794091391072e-07,
"logits/chosen": 1.1766527891159058,
"logits/rejected": 1.1873457431793213,
"loss": 1.2252,
"step": 480
},
{
"beta_dpo/beta_used": 0.058205485343933105,
"beta_dpo/beta_used_raw": 0.058205485343933105,
"beta_dpo/gap_mean": 10.897520065307617,
"beta_dpo/gap_std": 19.466140747070312,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.72713529856387,
"grad_norm": 15.312066078186035,
"learning_rate": 1.0606484367268906e-07,
"logits/chosen": 1.0759093761444092,
"logits/rejected": 1.222165584564209,
"loss": 1.0585,
"step": 481
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.010358155705034733,
"beta_dpo/gap_mean": 10.749324798583984,
"beta_dpo/gap_std": 19.947269439697266,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.7286470143613001,
"grad_norm": 0.4036189615726471,
"learning_rate": 1.0498577260720048e-07,
"logits/chosen": 1.2716319561004639,
"logits/rejected": 1.175731897354126,
"loss": 1.3772,
"step": 482
},
{
"beta_dpo/beta_used": 0.0638606920838356,
"beta_dpo/beta_used_raw": 0.057676542550325394,
"beta_dpo/gap_mean": 11.183818817138672,
"beta_dpo/gap_std": 19.975425720214844,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.7301587301587301,
"grad_norm": 22.85965347290039,
"learning_rate": 1.0391075790138232e-07,
"logits/chosen": 1.5366387367248535,
"logits/rejected": 1.613889217376709,
"loss": 1.1919,
"step": 483
},
{
"beta_dpo/beta_used": 0.040944814682006836,
"beta_dpo/beta_used_raw": 0.019066521897912025,
"beta_dpo/gap_mean": 11.034358978271484,
"beta_dpo/gap_std": 19.370399475097656,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.7316704459561603,
"grad_norm": 15.710288047790527,
"learning_rate": 1.0283982962570681e-07,
"logits/chosen": 1.4977807998657227,
"logits/rejected": 1.473586082458496,
"loss": 1.1728,
"step": 484
},
{
"beta_dpo/beta_used": 0.22265413403511047,
"beta_dpo/beta_used_raw": 0.2221376746892929,
"beta_dpo/gap_mean": 11.262885093688965,
"beta_dpo/gap_std": 18.92700958251953,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7331821617535903,
"grad_norm": 79.26393127441406,
"learning_rate": 1.0177301773633992e-07,
"logits/chosen": 1.5991811752319336,
"logits/rejected": 1.406498908996582,
"loss": 1.0811,
"step": 485
},
{
"beta_dpo/beta_used": 0.18111911416053772,
"beta_dpo/beta_used_raw": 0.13571885228157043,
"beta_dpo/gap_mean": 11.03097152709961,
"beta_dpo/gap_std": 19.104738235473633,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.7346938775510204,
"grad_norm": 62.48919677734375,
"learning_rate": 1.007103520743035e-07,
"logits/chosen": 1.260819911956787,
"logits/rejected": 0.939678430557251,
"loss": 1.2927,
"step": 486
},
{
"beta_dpo/beta_used": 0.16297666728496552,
"beta_dpo/beta_used_raw": 0.16297666728496552,
"beta_dpo/gap_mean": 11.155773162841797,
"beta_dpo/gap_std": 19.169147491455078,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.7362055933484505,
"grad_norm": 59.966793060302734,
"learning_rate": 9.965186236464046e-08,
"logits/chosen": 1.0779931545257568,
"logits/rejected": 1.3338478803634644,
"loss": 1.0975,
"step": 487
},
{
"beta_dpo/beta_used": 0.2435801774263382,
"beta_dpo/beta_used_raw": 0.23413166403770447,
"beta_dpo/gap_mean": 11.18185806274414,
"beta_dpo/gap_std": 19.392467498779297,
"beta_dpo/mask_keep_frac": 0.5625,
"epoch": 0.7377173091458806,
"grad_norm": 92.96924591064453,
"learning_rate": 9.859757821558337e-08,
"logits/chosen": 1.9946186542510986,
"logits/rejected": 1.7588841915130615,
"loss": 1.2888,
"step": 488
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.1315448135137558,
"beta_dpo/gap_mean": 10.578689575195312,
"beta_dpo/gap_std": 18.989961624145508,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.7392290249433107,
"grad_norm": 0.4756178855895996,
"learning_rate": 9.754752911772615e-08,
"logits/chosen": 1.6091415882110596,
"logits/rejected": 1.572596549987793,
"loss": 1.3803,
"step": 489
},
{
"beta_dpo/beta_used": 0.27745407819747925,
"beta_dpo/beta_used_raw": 0.27745407819747925,
"beta_dpo/gap_mean": 10.56434440612793,
"beta_dpo/gap_std": 19.33395767211914,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.7407407407407407,
"grad_norm": 55.287017822265625,
"learning_rate": 9.650174444319956e-08,
"logits/chosen": 2.1048507690429688,
"logits/rejected": 2.1106972694396973,
"loss": 0.9254,
"step": 490
},
{
"beta_dpo/beta_used": 0.15799355506896973,
"beta_dpo/beta_used_raw": 0.05918142944574356,
"beta_dpo/gap_mean": 11.070003509521484,
"beta_dpo/gap_std": 19.092586517333984,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.7422524565381708,
"grad_norm": 38.35470199584961,
"learning_rate": 9.546025344484868e-08,
"logits/chosen": 1.3452179431915283,
"logits/rejected": 1.4264013767242432,
"loss": 1.0867,
"step": 491
},
{
"beta_dpo/beta_used": 0.09264776110649109,
"beta_dpo/beta_used_raw": -0.04411589354276657,
"beta_dpo/gap_mean": 10.30017375946045,
"beta_dpo/gap_std": 18.976940155029297,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.7437641723356009,
"grad_norm": 41.40193557739258,
"learning_rate": 9.442308525541589e-08,
"logits/chosen": 1.6056393384933472,
"logits/rejected": 1.0043350458145142,
"loss": 1.148,
"step": 492
},
{
"beta_dpo/beta_used": 0.22295749187469482,
"beta_dpo/beta_used_raw": 0.22295749187469482,
"beta_dpo/gap_mean": 10.42473030090332,
"beta_dpo/gap_std": 19.106109619140625,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.745275888133031,
"grad_norm": 57.83147430419922,
"learning_rate": 9.339026888672468e-08,
"logits/chosen": 1.8143576383590698,
"logits/rejected": 1.6363078355789185,
"loss": 1.0346,
"step": 493
},
{
"beta_dpo/beta_used": 0.07032950222492218,
"beta_dpo/beta_used_raw": 0.07032950222492218,
"beta_dpo/gap_mean": 10.655120849609375,
"beta_dpo/gap_std": 19.331012725830078,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7467876039304611,
"grad_norm": 29.111852645874023,
"learning_rate": 9.236183322886945e-08,
"logits/chosen": 0.8167870044708252,
"logits/rejected": 0.7540128231048584,
"loss": 1.1794,
"step": 494
},
{
"beta_dpo/beta_used": 0.1115037351846695,
"beta_dpo/beta_used_raw": 0.028124667704105377,
"beta_dpo/gap_mean": 10.31930160522461,
"beta_dpo/gap_std": 19.684932708740234,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7482993197278912,
"grad_norm": 31.925792694091797,
"learning_rate": 9.133780704940594e-08,
"logits/chosen": 1.4023044109344482,
"logits/rejected": 1.3672239780426025,
"loss": 1.1475,
"step": 495
},
{
"beta_dpo/beta_used": 0.08872908353805542,
"beta_dpo/beta_used_raw": 0.08872908353805542,
"beta_dpo/gap_mean": 10.631675720214844,
"beta_dpo/gap_std": 19.648696899414062,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.7498110355253212,
"grad_norm": 24.766550064086914,
"learning_rate": 9.031821899254797e-08,
"logits/chosen": 1.710012435913086,
"logits/rejected": 1.3257718086242676,
"loss": 0.9777,
"step": 496
},
{
"beta_dpo/beta_used": 0.10120611637830734,
"beta_dpo/beta_used_raw": 0.07404671609401703,
"beta_dpo/gap_mean": 10.990039825439453,
"beta_dpo/gap_std": 19.455825805664062,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.7513227513227513,
"grad_norm": 28.4753475189209,
"learning_rate": 8.930309757836516e-08,
"logits/chosen": 1.7872216701507568,
"logits/rejected": 1.4520567655563354,
"loss": 1.1345,
"step": 497
},
{
"beta_dpo/beta_used": 0.12970580160617828,
"beta_dpo/beta_used_raw": 0.07501335442066193,
"beta_dpo/gap_mean": 11.042081832885742,
"beta_dpo/gap_std": 19.401321411132812,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.7528344671201814,
"grad_norm": 53.85708236694336,
"learning_rate": 8.829247120198563e-08,
"logits/chosen": 1.6962220668792725,
"logits/rejected": 1.4640264511108398,
"loss": 1.1085,
"step": 498
},
{
"beta_dpo/beta_used": 0.22401860356330872,
"beta_dpo/beta_used_raw": 0.16938845813274384,
"beta_dpo/gap_mean": 11.08438491821289,
"beta_dpo/gap_std": 19.733095169067383,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.7543461829176115,
"grad_norm": 50.10066223144531,
"learning_rate": 8.728636813280163e-08,
"logits/chosen": 1.4561519622802734,
"logits/rejected": 1.1343849897384644,
"loss": 1.5875,
"step": 499
},
{
"beta_dpo/beta_used": 0.10568296164274216,
"beta_dpo/beta_used_raw": 0.09129762649536133,
"beta_dpo/gap_mean": 10.93176555633545,
"beta_dpo/gap_std": 19.578086853027344,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.7558578987150416,
"grad_norm": 42.66399383544922,
"learning_rate": 8.628481651367875e-08,
"logits/chosen": 0.674248218536377,
"logits/rejected": 0.703514814376831,
"loss": 1.3396,
"step": 500
},
{
"epoch": 0.7558578987150416,
"eval_beta_dpo/beta_used": 0.1532546579837799,
"eval_beta_dpo/beta_used_raw": 0.12686675786972046,
"eval_beta_dpo/gap_mean": 11.020356178283691,
"eval_beta_dpo/gap_std": 19.520551681518555,
"eval_beta_dpo/mask_keep_frac": 1.0,
"eval_logits/chosen": 1.398350715637207,
"eval_logits/rejected": 1.2734830379486084,
"eval_loss": 0.6756347417831421,
"eval_runtime": 42.7215,
"eval_samples_per_second": 53.907,
"eval_steps_per_second": 1.685,
"step": 500
},
{
"beta_dpo/beta_used": 0.10934046655893326,
"beta_dpo/beta_used_raw": 0.01821669191122055,
"beta_dpo/gap_mean": 10.712957382202148,
"beta_dpo/gap_std": 18.95332908630371,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.7573696145124716,
"grad_norm": 32.02262496948242,
"learning_rate": 8.528784436016878e-08,
"logits/chosen": 1.116631269454956,
"logits/rejected": 1.147378921508789,
"loss": 1.1377,
"step": 501
},
{
"beta_dpo/beta_used": 0.0046331086196005344,
"beta_dpo/beta_used_raw": -0.068320132791996,
"beta_dpo/gap_mean": 10.554143905639648,
"beta_dpo/gap_std": 18.39632225036621,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.7588813303099018,
"grad_norm": 2.11391019821167,
"learning_rate": 8.4295479559726e-08,
"logits/chosen": 1.6380161046981812,
"logits/rejected": 1.426564335823059,
"loss": 1.3467,
"step": 502
},
{
"beta_dpo/beta_used": 0.2738369405269623,
"beta_dpo/beta_used_raw": 0.2738369405269623,
"beta_dpo/gap_mean": 10.669290542602539,
"beta_dpo/gap_std": 18.408435821533203,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7603930461073318,
"grad_norm": 65.43000793457031,
"learning_rate": 8.330774987092712e-08,
"logits/chosen": 1.4922473430633545,
"logits/rejected": 1.5633766651153564,
"loss": 1.2771,
"step": 503
},
{
"beta_dpo/beta_used": 0.13454070687294006,
"beta_dpo/beta_used_raw": 0.13454070687294006,
"beta_dpo/gap_mean": 11.21607780456543,
"beta_dpo/gap_std": 18.295108795166016,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.7619047619047619,
"grad_norm": 33.310768127441406,
"learning_rate": 8.232468292269479e-08,
"logits/chosen": 1.6152197122573853,
"logits/rejected": 1.4100103378295898,
"loss": 0.7788,
"step": 504
},
{
"beta_dpo/beta_used": 0.11502599716186523,
"beta_dpo/beta_used_raw": 0.09766162186861038,
"beta_dpo/gap_mean": 11.323755264282227,
"beta_dpo/gap_std": 18.809341430664062,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.763416477702192,
"grad_norm": 64.08597564697266,
"learning_rate": 8.134630621352483e-08,
"logits/chosen": 1.4845128059387207,
"logits/rejected": 1.2584877014160156,
"loss": 1.5933,
"step": 505
},
{
"beta_dpo/beta_used": 0.15312659740447998,
"beta_dpo/beta_used_raw": 0.15312659740447998,
"beta_dpo/gap_mean": 10.811168670654297,
"beta_dpo/gap_std": 19.353378295898438,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.764928193499622,
"grad_norm": 45.99757385253906,
"learning_rate": 8.037264711071698e-08,
"logits/chosen": 1.3323204517364502,
"logits/rejected": 1.4588356018066406,
"loss": 1.3287,
"step": 506
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.04676675423979759,
"beta_dpo/gap_mean": 10.543416976928711,
"beta_dpo/gap_std": 19.518844604492188,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7664399092970522,
"grad_norm": 0.33119961619377136,
"learning_rate": 7.940373284960933e-08,
"logits/chosen": 1.1214375495910645,
"logits/rejected": 1.2219690084457397,
"loss": 1.378,
"step": 507
},
{
"beta_dpo/beta_used": 0.13913913071155548,
"beta_dpo/beta_used_raw": 0.006052389740943909,
"beta_dpo/gap_mean": 10.641485214233398,
"beta_dpo/gap_std": 19.354970932006836,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.7679516250944822,
"grad_norm": 53.24369430541992,
"learning_rate": 7.843959053281663e-08,
"logits/chosen": 1.505273699760437,
"logits/rejected": 1.071250081062317,
"loss": 1.2156,
"step": 508
},
{
"beta_dpo/beta_used": 0.06729910522699356,
"beta_dpo/beta_used_raw": 0.06729910522699356,
"beta_dpo/gap_mean": 11.085270881652832,
"beta_dpo/gap_std": 18.95514678955078,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.7694633408919124,
"grad_norm": 21.316387176513672,
"learning_rate": 7.748024712947204e-08,
"logits/chosen": 0.9079450368881226,
"logits/rejected": 1.0837373733520508,
"loss": 1.0235,
"step": 509
},
{
"beta_dpo/beta_used": 0.09236538410186768,
"beta_dpo/beta_used_raw": 0.035030972212553024,
"beta_dpo/gap_mean": 11.523921012878418,
"beta_dpo/gap_std": 18.953319549560547,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7709750566893424,
"grad_norm": 39.52785110473633,
"learning_rate": 7.652572947447272e-08,
"logits/chosen": 1.3003780841827393,
"logits/rejected": 1.1664865016937256,
"loss": 1.1283,
"step": 510
},
{
"beta_dpo/beta_used": 0.13791456818580627,
"beta_dpo/beta_used_raw": 0.13791456818580627,
"beta_dpo/gap_mean": 11.82453441619873,
"beta_dpo/gap_std": 19.1085205078125,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7724867724867724,
"grad_norm": 33.120235443115234,
"learning_rate": 7.557606426772961e-08,
"logits/chosen": 1.7288661003112793,
"logits/rejected": 1.3858253955841064,
"loss": 0.9967,
"step": 511
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.3171396851539612,
"beta_dpo/gap_mean": 11.691259384155273,
"beta_dpo/gap_std": 19.083370208740234,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.7739984882842026,
"grad_norm": 0.3699307441711426,
"learning_rate": 7.463127807341966e-08,
"logits/chosen": 1.2584737539291382,
"logits/rejected": 1.4007148742675781,
"loss": 1.3816,
"step": 512
},
{
"beta_dpo/beta_used": 0.05727185308933258,
"beta_dpo/beta_used_raw": 0.05727185308933258,
"beta_dpo/gap_mean": 11.749606132507324,
"beta_dpo/gap_std": 18.947818756103516,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7755102040816326,
"grad_norm": 17.115013122558594,
"learning_rate": 7.369139731924401e-08,
"logits/chosen": 1.8472414016723633,
"logits/rejected": 1.7471710443496704,
"loss": 1.0532,
"step": 513
},
{
"beta_dpo/beta_used": 0.16688939929008484,
"beta_dpo/beta_used_raw": 0.16688939929008484,
"beta_dpo/gap_mean": 11.778924942016602,
"beta_dpo/gap_std": 18.789505004882812,
"beta_dpo/mask_keep_frac": 1.0,
"epoch": 0.7770219198790628,
"grad_norm": 32.289146423339844,
"learning_rate": 7.275644829568747e-08,
"logits/chosen": 1.782091498374939,
"logits/rejected": 1.713914394378662,
"loss": 0.8214,
"step": 514
},
{
"beta_dpo/beta_used": 0.00235101324506104,
"beta_dpo/beta_used_raw": -0.041592229157686234,
"beta_dpo/gap_mean": 11.654163360595703,
"beta_dpo/gap_std": 18.874595642089844,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.7785336356764928,
"grad_norm": 0.9957330226898193,
"learning_rate": 7.182645715528435e-08,
"logits/chosen": 2.1653988361358643,
"logits/rejected": 1.9160587787628174,
"loss": 1.3631,
"step": 515
},
{
"beta_dpo/beta_used": 0.020925289019942284,
"beta_dpo/beta_used_raw": 0.004274457693099976,
"beta_dpo/gap_mean": 11.362092018127441,
"beta_dpo/gap_std": 18.48135757446289,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.780045351473923,
"grad_norm": 9.73312759399414,
"learning_rate": 7.090144991188568e-08,
"logits/chosen": 1.586578369140625,
"logits/rejected": 1.3872929811477661,
"loss": 1.2347,
"step": 516
},
{
"beta_dpo/beta_used": 0.06535185873508453,
"beta_dpo/beta_used_raw": -0.04223699867725372,
"beta_dpo/gap_mean": 11.007303237915039,
"beta_dpo/gap_std": 18.780548095703125,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.781557067271353,
"grad_norm": 22.2569637298584,
"learning_rate": 6.998145243993284e-08,
"logits/chosen": 1.4606246948242188,
"logits/rejected": 1.458913803100586,
"loss": 1.1673,
"step": 517
},
{
"beta_dpo/beta_used": 0.039747219532728195,
"beta_dpo/beta_used_raw": 0.017948148772120476,
"beta_dpo/gap_mean": 10.854924201965332,
"beta_dpo/gap_std": 18.901779174804688,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.783068783068783,
"grad_norm": 13.430063247680664,
"learning_rate": 6.906649047373245e-08,
"logits/chosen": 1.23757004737854,
"logits/rejected": 1.2449841499328613,
"loss": 1.1996,
"step": 518
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.16081054508686066,
"beta_dpo/gap_mean": 10.096181869506836,
"beta_dpo/gap_std": 19.545801162719727,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7845804988662132,
"grad_norm": 0.32356882095336914,
"learning_rate": 6.815658960673781e-08,
"logits/chosen": 1.456130862236023,
"logits/rejected": 1.342390537261963,
"loss": 1.3808,
"step": 519
},
{
"beta_dpo/beta_used": 0.17143839597702026,
"beta_dpo/beta_used_raw": 0.1322009265422821,
"beta_dpo/gap_mean": 10.353086471557617,
"beta_dpo/gap_std": 18.939144134521484,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7860922146636432,
"grad_norm": 22.188499450683594,
"learning_rate": 6.725177529083209e-08,
"logits/chosen": 1.7241880893707275,
"logits/rejected": 1.1929926872253418,
"loss": 0.8542,
"step": 520
},
{
"beta_dpo/beta_used": 0.1159067451953888,
"beta_dpo/beta_used_raw": 0.1159067451953888,
"beta_dpo/gap_mean": 10.21092414855957,
"beta_dpo/gap_std": 18.545848846435547,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.7876039304610734,
"grad_norm": 26.739612579345703,
"learning_rate": 6.63520728356167e-08,
"logits/chosen": 1.4143714904785156,
"logits/rejected": 1.3485612869262695,
"loss": 1.0487,
"step": 521
},
{
"beta_dpo/beta_used": 0.12564007937908173,
"beta_dpo/beta_used_raw": 0.12564007937908173,
"beta_dpo/gap_mean": 10.022201538085938,
"beta_dpo/gap_std": 18.765472412109375,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7891156462585034,
"grad_norm": 59.06966018676758,
"learning_rate": 6.545750740770336e-08,
"logits/chosen": 1.6190658807754517,
"logits/rejected": 1.4547877311706543,
"loss": 1.3958,
"step": 522
},
{
"beta_dpo/beta_used": 0.23686088621616364,
"beta_dpo/beta_used_raw": 0.23686088621616364,
"beta_dpo/gap_mean": 10.224916458129883,
"beta_dpo/gap_std": 18.703235626220703,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.7906273620559335,
"grad_norm": 61.67593765258789,
"learning_rate": 6.456810403001012e-08,
"logits/chosen": 1.7736190557479858,
"logits/rejected": 0.9705901145935059,
"loss": 0.9546,
"step": 523
},
{
"beta_dpo/beta_used": 0.10322294384241104,
"beta_dpo/beta_used_raw": 0.10322294384241104,
"beta_dpo/gap_mean": 9.980840682983398,
"beta_dpo/gap_std": 18.50246810913086,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.7921390778533636,
"grad_norm": 37.48660659790039,
"learning_rate": 6.368388758106134e-08,
"logits/chosen": 1.1455576419830322,
"logits/rejected": 1.1552529335021973,
"loss": 1.0655,
"step": 524
},
{
"beta_dpo/beta_used": 0.003437028033658862,
"beta_dpo/beta_used_raw": -0.013644227758049965,
"beta_dpo/gap_mean": 9.956314086914062,
"beta_dpo/gap_std": 18.658798217773438,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.7936507936507936,
"grad_norm": 1.9367986917495728,
"learning_rate": 6.280488279429185e-08,
"logits/chosen": 0.9969067573547363,
"logits/rejected": 0.8848774433135986,
"loss": 1.3582,
"step": 525
},
{
"beta_dpo/beta_used": 0.11064125597476959,
"beta_dpo/beta_used_raw": 0.008903838694095612,
"beta_dpo/gap_mean": 9.922897338867188,
"beta_dpo/gap_std": 18.31914520263672,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.7951625094482238,
"grad_norm": 26.800683975219727,
"learning_rate": 6.193111425735515e-08,
"logits/chosen": 1.173614740371704,
"logits/rejected": 0.848638653755188,
"loss": 1.1171,
"step": 526
},
{
"beta_dpo/beta_used": 0.06301558017730713,
"beta_dpo/beta_used_raw": 0.04940624535083771,
"beta_dpo/gap_mean": 9.1787691116333,
"beta_dpo/gap_std": 17.924055099487305,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7966742252456538,
"grad_norm": 24.338993072509766,
"learning_rate": 6.106260641143546e-08,
"logits/chosen": 1.8160502910614014,
"logits/rejected": 1.425750970840454,
"loss": 1.192,
"step": 527
},
{
"beta_dpo/beta_used": 0.06259048730134964,
"beta_dpo/beta_used_raw": 0.06259048730134964,
"beta_dpo/gap_mean": 9.066558837890625,
"beta_dpo/gap_std": 18.2850399017334,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.7981859410430839,
"grad_norm": 18.2799129486084,
"learning_rate": 6.019938355056422e-08,
"logits/chosen": 1.1618304252624512,
"logits/rejected": 1.5348981618881226,
"loss": 1.1168,
"step": 528
},
{
"beta_dpo/beta_used": 0.35698869824409485,
"beta_dpo/beta_used_raw": 0.35698869824409485,
"beta_dpo/gap_mean": 10.158662796020508,
"beta_dpo/gap_std": 18.37487030029297,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.799697656840514,
"grad_norm": 77.24592590332031,
"learning_rate": 5.934146982094049e-08,
"logits/chosen": 1.2812597751617432,
"logits/rejected": 1.2239878177642822,
"loss": 1.1347,
"step": 529
},
{
"beta_dpo/beta_used": 0.14690837264060974,
"beta_dpo/beta_used_raw": 0.11383026838302612,
"beta_dpo/gap_mean": 10.558483123779297,
"beta_dpo/gap_std": 18.502506256103516,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8012093726379441,
"grad_norm": 49.202369689941406,
"learning_rate": 5.848888922025552e-08,
"logits/chosen": 1.660964012145996,
"logits/rejected": 1.2829644680023193,
"loss": 1.172,
"step": 530
},
{
"beta_dpo/beta_used": 0.11664751917123795,
"beta_dpo/beta_used_raw": 0.05959582328796387,
"beta_dpo/gap_mean": 10.151885032653809,
"beta_dpo/gap_std": 17.962663650512695,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8027210884353742,
"grad_norm": 28.00320053100586,
"learning_rate": 5.7641665597021435e-08,
"logits/chosen": 2.1570699214935303,
"logits/rejected": 1.9092918634414673,
"loss": 0.9562,
"step": 531
},
{
"beta_dpo/beta_used": 0.0765802264213562,
"beta_dpo/beta_used_raw": 0.05343026667833328,
"beta_dpo/gap_mean": 10.470291137695312,
"beta_dpo/gap_std": 17.93800926208496,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8042328042328042,
"grad_norm": 29.33611297607422,
"learning_rate": 5.679982264990424e-08,
"logits/chosen": 1.5656516551971436,
"logits/rejected": 1.12631356716156,
"loss": 1.306,
"step": 532
},
{
"beta_dpo/beta_used": 0.01579258404672146,
"beta_dpo/beta_used_raw": -0.00625237263739109,
"beta_dpo/gap_mean": 10.281692504882812,
"beta_dpo/gap_std": 18.202903747558594,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.8057445200302343,
"grad_norm": 5.443710803985596,
"learning_rate": 5.596338392706076e-08,
"logits/chosen": 1.9019914865493774,
"logits/rejected": 1.5987591743469238,
"loss": 1.2761,
"step": 533
},
{
"beta_dpo/beta_used": 0.25557124614715576,
"beta_dpo/beta_used_raw": 0.1648273766040802,
"beta_dpo/gap_mean": 10.804718971252441,
"beta_dpo/gap_std": 18.391155242919922,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.8072562358276644,
"grad_norm": 96.26758575439453,
"learning_rate": 5.513237282548033e-08,
"logits/chosen": 1.1472864151000977,
"logits/rejected": 0.6411304473876953,
"loss": 1.3409,
"step": 534
},
{
"beta_dpo/beta_used": 0.15714676678180695,
"beta_dpo/beta_used_raw": 0.1257384568452835,
"beta_dpo/gap_mean": 10.44320011138916,
"beta_dpo/gap_std": 18.42446517944336,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8087679516250945,
"grad_norm": 38.07860565185547,
"learning_rate": 5.430681259032957e-08,
"logits/chosen": 1.16520094871521,
"logits/rejected": 0.858991801738739,
"loss": 1.0452,
"step": 535
},
{
"beta_dpo/beta_used": 0.17811733484268188,
"beta_dpo/beta_used_raw": 0.1541111320257187,
"beta_dpo/gap_mean": 10.847288131713867,
"beta_dpo/gap_std": 18.489604949951172,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8102796674225246,
"grad_norm": 51.901405334472656,
"learning_rate": 5.3486726314303175e-08,
"logits/chosen": 1.6397348642349243,
"logits/rejected": 1.5184638500213623,
"loss": 1.1259,
"step": 536
},
{
"beta_dpo/beta_used": 0.06095781922340393,
"beta_dpo/beta_used_raw": 0.06095781922340393,
"beta_dpo/gap_mean": 10.72732162475586,
"beta_dpo/gap_std": 18.210926055908203,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8117913832199547,
"grad_norm": 14.593372344970703,
"learning_rate": 5.267213693697695e-08,
"logits/chosen": 1.3957045078277588,
"logits/rejected": 1.092875599861145,
"loss": 1.0809,
"step": 537
},
{
"beta_dpo/beta_used": 0.09433559328317642,
"beta_dpo/beta_used_raw": 0.06990790367126465,
"beta_dpo/gap_mean": 10.813741683959961,
"beta_dpo/gap_std": 18.126432418823242,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.8133030990173847,
"grad_norm": 22.37622833251953,
"learning_rate": 5.1863067244167144e-08,
"logits/chosen": 1.5632539987564087,
"logits/rejected": 1.6953120231628418,
"loss": 1.0197,
"step": 538
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.04334234446287155,
"beta_dpo/gap_mean": 10.80185317993164,
"beta_dpo/gap_std": 17.788105010986328,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.8148148148148148,
"grad_norm": 0.34440314769744873,
"learning_rate": 5.105953986729195e-08,
"logits/chosen": 1.4430513381958008,
"logits/rejected": 1.2813853025436401,
"loss": 1.3784,
"step": 539
},
{
"beta_dpo/beta_used": 0.2005300521850586,
"beta_dpo/beta_used_raw": 0.2005300521850586,
"beta_dpo/gap_mean": 11.087736129760742,
"beta_dpo/gap_std": 17.963241577148438,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8163265306122449,
"grad_norm": 67.64325714111328,
"learning_rate": 5.026157728273966e-08,
"logits/chosen": 1.9035638570785522,
"logits/rejected": 1.7133615016937256,
"loss": 0.9523,
"step": 540
},
{
"beta_dpo/beta_used": 0.1668683886528015,
"beta_dpo/beta_used_raw": 0.13656221330165863,
"beta_dpo/gap_mean": 11.58251667022705,
"beta_dpo/gap_std": 17.98819351196289,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.817838246409675,
"grad_norm": 57.01007843017578,
"learning_rate": 4.9469201811239035e-08,
"logits/chosen": 1.6449933052062988,
"logits/rejected": 1.851159930229187,
"loss": 1.1774,
"step": 541
},
{
"beta_dpo/beta_used": 0.2715034484863281,
"beta_dpo/beta_used_raw": 0.2715034484863281,
"beta_dpo/gap_mean": 12.08657455444336,
"beta_dpo/gap_std": 18.468975067138672,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.8193499622071051,
"grad_norm": 71.46245574951172,
"learning_rate": 4.868243561723534e-08,
"logits/chosen": 1.407960057258606,
"logits/rejected": 1.1887967586517334,
"loss": 1.0508,
"step": 542
},
{
"beta_dpo/beta_used": 0.12943723797798157,
"beta_dpo/beta_used_raw": 0.12943723797798157,
"beta_dpo/gap_mean": 12.076602935791016,
"beta_dpo/gap_std": 18.590787887573242,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.8208616780045351,
"grad_norm": 29.349206924438477,
"learning_rate": 4.790130070827028e-08,
"logits/chosen": 1.7124477624893188,
"logits/rejected": 1.4872949123382568,
"loss": 0.9852,
"step": 543
},
{
"beta_dpo/beta_used": 0.029429566115140915,
"beta_dpo/beta_used_raw": 0.023324094712734222,
"beta_dpo/gap_mean": 12.406301498413086,
"beta_dpo/gap_std": 19.229873657226562,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8223733938019653,
"grad_norm": 13.987939834594727,
"learning_rate": 4.7125818934366454e-08,
"logits/chosen": 1.2513970136642456,
"logits/rejected": 1.3183088302612305,
"loss": 1.1907,
"step": 544
},
{
"beta_dpo/beta_used": 0.1011621505022049,
"beta_dpo/beta_used_raw": 0.1011621505022049,
"beta_dpo/gap_mean": 12.334243774414062,
"beta_dpo/gap_std": 19.540382385253906,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8238851095993953,
"grad_norm": 15.821413040161133,
"learning_rate": 4.635601198741607e-08,
"logits/chosen": 1.6381149291992188,
"logits/rejected": 1.4946880340576172,
"loss": 0.823,
"step": 545
},
{
"beta_dpo/beta_used": 0.1270730048418045,
"beta_dpo/beta_used_raw": 0.11782974004745483,
"beta_dpo/gap_mean": 12.052278518676758,
"beta_dpo/gap_std": 19.115442276000977,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8253968253968254,
"grad_norm": 42.763160705566406,
"learning_rate": 4.559190140057428e-08,
"logits/chosen": 1.3137729167938232,
"logits/rejected": 1.331726312637329,
"loss": 1.117,
"step": 546
},
{
"beta_dpo/beta_used": 0.11487125605344772,
"beta_dpo/beta_used_raw": 0.11487125605344772,
"beta_dpo/gap_mean": 12.46994400024414,
"beta_dpo/gap_std": 19.053627014160156,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.8269085411942555,
"grad_norm": 18.704116821289062,
"learning_rate": 4.483350854765672e-08,
"logits/chosen": 1.0267034769058228,
"logits/rejected": 0.6374800801277161,
"loss": 0.8122,
"step": 547
},
{
"beta_dpo/beta_used": 0.03800983354449272,
"beta_dpo/beta_used_raw": 0.00829574279487133,
"beta_dpo/gap_mean": 11.772872924804688,
"beta_dpo/gap_std": 18.622325897216797,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.8284202569916855,
"grad_norm": 19.206010818481445,
"learning_rate": 4.4080854642541826e-08,
"logits/chosen": 1.4546637535095215,
"logits/rejected": 1.3802220821380615,
"loss": 1.2243,
"step": 548
},
{
"beta_dpo/beta_used": 0.09710898995399475,
"beta_dpo/beta_used_raw": 0.07338780164718628,
"beta_dpo/gap_mean": 11.424276351928711,
"beta_dpo/gap_std": 18.64669418334961,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8299319727891157,
"grad_norm": 38.650184631347656,
"learning_rate": 4.333396073857723e-08,
"logits/chosen": 1.831252098083496,
"logits/rejected": 1.8664486408233643,
"loss": 1.1348,
"step": 549
},
{
"beta_dpo/beta_used": 0.1007058173418045,
"beta_dpo/beta_used_raw": 0.1007058173418045,
"beta_dpo/gap_mean": 11.098159790039062,
"beta_dpo/gap_std": 18.92105484008789,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8314436885865457,
"grad_norm": 33.382164001464844,
"learning_rate": 4.259284772799099e-08,
"logits/chosen": 1.819729208946228,
"logits/rejected": 1.856877088546753,
"loss": 1.0106,
"step": 550
},
{
"beta_dpo/beta_used": 0.044430945068597794,
"beta_dpo/beta_used_raw": 0.03116544708609581,
"beta_dpo/gap_mean": 10.615936279296875,
"beta_dpo/gap_std": 18.659568786621094,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.8329554043839759,
"grad_norm": 14.599547386169434,
"learning_rate": 4.1857536341307176e-08,
"logits/chosen": 1.8848488330841064,
"logits/rejected": 1.6397110223770142,
"loss": 1.1842,
"step": 551
},
{
"beta_dpo/beta_used": 0.19954092800617218,
"beta_dpo/beta_used_raw": 0.19954092800617218,
"beta_dpo/gap_mean": 10.359419822692871,
"beta_dpo/gap_std": 18.463096618652344,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.8344671201814059,
"grad_norm": 42.67924118041992,
"learning_rate": 4.112804714676593e-08,
"logits/chosen": 1.7389799356460571,
"logits/rejected": 1.3862097263336182,
"loss": 1.0244,
"step": 552
},
{
"beta_dpo/beta_used": 0.2873495817184448,
"beta_dpo/beta_used_raw": 0.2873495817184448,
"beta_dpo/gap_mean": 10.555915832519531,
"beta_dpo/gap_std": 18.88970184326172,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8359788359788359,
"grad_norm": 65.49226379394531,
"learning_rate": 4.0404400549748144e-08,
"logits/chosen": 1.8026375770568848,
"logits/rejected": 1.2519030570983887,
"loss": 1.1918,
"step": 553
},
{
"beta_dpo/beta_used": 0.05840389430522919,
"beta_dpo/beta_used_raw": 0.033750779926776886,
"beta_dpo/gap_mean": 10.615468978881836,
"beta_dpo/gap_std": 18.631549835205078,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8374905517762661,
"grad_norm": 17.503585815429688,
"learning_rate": 3.968661679220467e-08,
"logits/chosen": 1.0402394533157349,
"logits/rejected": 0.9546246528625488,
"loss": 1.1337,
"step": 554
},
{
"beta_dpo/beta_used": 0.13094915449619293,
"beta_dpo/beta_used_raw": 0.13006287813186646,
"beta_dpo/gap_mean": 10.578245162963867,
"beta_dpo/gap_std": 18.960113525390625,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.8390022675736961,
"grad_norm": 42.747737884521484,
"learning_rate": 3.89747159520904e-08,
"logits/chosen": 1.5776338577270508,
"logits/rejected": 1.3985557556152344,
"loss": 1.1577,
"step": 555
},
{
"beta_dpo/beta_used": 0.005854336079210043,
"beta_dpo/beta_used_raw": -0.03035161830484867,
"beta_dpo/gap_mean": 10.757535934448242,
"beta_dpo/gap_std": 18.839813232421875,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.8405139833711263,
"grad_norm": 2.7324094772338867,
"learning_rate": 3.826871794280192e-08,
"logits/chosen": 1.3301312923431396,
"logits/rejected": 1.2104971408843994,
"loss": 1.339,
"step": 556
},
{
"beta_dpo/beta_used": 0.12279447913169861,
"beta_dpo/beta_used_raw": 0.09184837341308594,
"beta_dpo/gap_mean": 11.148019790649414,
"beta_dpo/gap_std": 18.934059143066406,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8420256991685563,
"grad_norm": 28.103437423706055,
"learning_rate": 3.756864251262143e-08,
"logits/chosen": 1.197859525680542,
"logits/rejected": 0.6767659187316895,
"loss": 1.0637,
"step": 557
},
{
"beta_dpo/beta_used": 0.08440352976322174,
"beta_dpo/beta_used_raw": 0.05450304225087166,
"beta_dpo/gap_mean": 11.180231094360352,
"beta_dpo/gap_std": 19.118072509765625,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8435374149659864,
"grad_norm": 24.796546936035156,
"learning_rate": 3.687450924416341e-08,
"logits/chosen": 1.7133653163909912,
"logits/rejected": 1.6142797470092773,
"loss": 1.1398,
"step": 558
},
{
"beta_dpo/beta_used": 0.10217013955116272,
"beta_dpo/beta_used_raw": 0.0785236731171608,
"beta_dpo/gap_mean": 11.172683715820312,
"beta_dpo/gap_std": 19.399667739868164,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8450491307634165,
"grad_norm": 28.550260543823242,
"learning_rate": 3.6186337553827743e-08,
"logits/chosen": 1.3882708549499512,
"logits/rejected": 0.937119722366333,
"loss": 1.0391,
"step": 559
},
{
"beta_dpo/beta_used": 0.1818804293870926,
"beta_dpo/beta_used_raw": 0.07039390504360199,
"beta_dpo/gap_mean": 11.01672649383545,
"beta_dpo/gap_std": 19.380664825439453,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8465608465608465,
"grad_norm": 51.63716125488281,
"learning_rate": 3.550414669125573e-08,
"logits/chosen": 1.3455142974853516,
"logits/rejected": 1.3270020484924316,
"loss": 1.1142,
"step": 560
},
{
"beta_dpo/beta_used": 0.08334767073392868,
"beta_dpo/beta_used_raw": 0.012797832489013672,
"beta_dpo/gap_mean": 11.576347351074219,
"beta_dpo/gap_std": 19.41046905517578,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.8480725623582767,
"grad_norm": 28.351787567138672,
"learning_rate": 3.482795573879241e-08,
"logits/chosen": 1.6244086027145386,
"logits/rejected": 1.520763874053955,
"loss": 1.0719,
"step": 561
},
{
"beta_dpo/beta_used": 0.053987935185432434,
"beta_dpo/beta_used_raw": -0.038065314292907715,
"beta_dpo/gap_mean": 11.818717956542969,
"beta_dpo/gap_std": 19.462326049804688,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8495842781557067,
"grad_norm": 18.966503143310547,
"learning_rate": 3.415778361095226e-08,
"logits/chosen": 1.7821249961853027,
"logits/rejected": 1.3919886350631714,
"loss": 1.1546,
"step": 562
},
{
"beta_dpo/beta_used": 0.11155681312084198,
"beta_dpo/beta_used_raw": 0.11155681312084198,
"beta_dpo/gap_mean": 11.847363471984863,
"beta_dpo/gap_std": 20.0205020904541,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8510959939531368,
"grad_norm": 25.363344192504883,
"learning_rate": 3.349364905389032e-08,
"logits/chosen": 1.2508881092071533,
"logits/rejected": 0.9743169546127319,
"loss": 0.9877,
"step": 563
},
{
"beta_dpo/beta_used": 0.16263367235660553,
"beta_dpo/beta_used_raw": 0.16263367235660553,
"beta_dpo/gap_mean": 11.87989616394043,
"beta_dpo/gap_std": 20.273239135742188,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8526077097505669,
"grad_norm": 38.15361785888672,
"learning_rate": 3.283557064487785e-08,
"logits/chosen": 1.650681734085083,
"logits/rejected": 1.6767246723175049,
"loss": 0.9964,
"step": 564
},
{
"beta_dpo/beta_used": 0.03271957114338875,
"beta_dpo/beta_used_raw": -0.13078060746192932,
"beta_dpo/gap_mean": 11.701870918273926,
"beta_dpo/gap_std": 19.98330307006836,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.854119425547997,
"grad_norm": 13.035989761352539,
"learning_rate": 3.218356679178252e-08,
"logits/chosen": 1.7115111351013184,
"logits/rejected": 1.2233140468597412,
"loss": 1.1906,
"step": 565
},
{
"beta_dpo/beta_used": 0.061468079686164856,
"beta_dpo/beta_used_raw": -0.002200111746788025,
"beta_dpo/gap_mean": 11.254524230957031,
"beta_dpo/gap_std": 19.918685913085938,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.8556311413454271,
"grad_norm": 36.61579132080078,
"learning_rate": 3.1537655732553764e-08,
"logits/chosen": 1.8470666408538818,
"logits/rejected": 1.5994932651519775,
"loss": 1.2811,
"step": 566
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.07913199067115784,
"beta_dpo/gap_mean": 11.579656600952148,
"beta_dpo/gap_std": 19.583362579345703,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8571428571428571,
"grad_norm": 0.510140597820282,
"learning_rate": 3.089785553471233e-08,
"logits/chosen": 0.953148603439331,
"logits/rejected": 1.038599967956543,
"loss": 1.3768,
"step": 567
},
{
"beta_dpo/beta_used": 0.05987339839339256,
"beta_dpo/beta_used_raw": -0.008187372237443924,
"beta_dpo/gap_mean": 11.487663269042969,
"beta_dpo/gap_std": 18.96971893310547,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8586545729402872,
"grad_norm": 23.292221069335938,
"learning_rate": 3.026418409484513e-08,
"logits/chosen": 1.5052483081817627,
"logits/rejected": 1.313591718673706,
"loss": 1.1959,
"step": 568
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.05208010971546173,
"beta_dpo/gap_mean": 11.568532943725586,
"beta_dpo/gap_std": 18.769332885742188,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.8601662887377173,
"grad_norm": 0.33875489234924316,
"learning_rate": 2.963665913810451e-08,
"logits/chosen": 1.5558233261108398,
"logits/rejected": 1.5793402194976807,
"loss": 1.378,
"step": 569
},
{
"beta_dpo/beta_used": 0.2890852391719818,
"beta_dpo/beta_used_raw": 0.2890852391719818,
"beta_dpo/gap_mean": 11.699023246765137,
"beta_dpo/gap_std": 18.902652740478516,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8616780045351474,
"grad_norm": 49.38574981689453,
"learning_rate": 2.9015298217712453e-08,
"logits/chosen": 1.6536730527877808,
"logits/rejected": 1.5479531288146973,
"loss": 0.6619,
"step": 570
},
{
"beta_dpo/beta_used": 0.12220169603824615,
"beta_dpo/beta_used_raw": 0.07312116771936417,
"beta_dpo/gap_mean": 11.507964134216309,
"beta_dpo/gap_std": 18.88650894165039,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.8631897203325775,
"grad_norm": 22.84241485595703,
"learning_rate": 2.840011871446962e-08,
"logits/chosen": 1.3720738887786865,
"logits/rejected": 1.0536761283874512,
"loss": 1.0365,
"step": 571
},
{
"beta_dpo/beta_used": 0.07529482990503311,
"beta_dpo/beta_used_raw": 0.016804847866296768,
"beta_dpo/gap_mean": 11.208440780639648,
"beta_dpo/gap_std": 18.8841495513916,
"beta_dpo/mask_keep_frac": 0.5625,
"epoch": 0.8647014361300076,
"grad_norm": 22.246524810791016,
"learning_rate": 2.7791137836269158e-08,
"logits/chosen": 1.4899077415466309,
"logits/rejected": 1.3142718076705933,
"loss": 1.0893,
"step": 572
},
{
"beta_dpo/beta_used": 0.10679773986339569,
"beta_dpo/beta_used_raw": 0.10679773986339569,
"beta_dpo/gap_mean": 11.175505638122559,
"beta_dpo/gap_std": 19.131608963012695,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8662131519274376,
"grad_norm": 25.517454147338867,
"learning_rate": 2.718837261761528e-08,
"logits/chosen": 1.6858450174331665,
"logits/rejected": 1.5363482236862183,
"loss": 0.9872,
"step": 573
},
{
"beta_dpo/beta_used": 0.3487897515296936,
"beta_dpo/beta_used_raw": 0.3487897515296936,
"beta_dpo/gap_mean": 11.519730567932129,
"beta_dpo/gap_std": 19.185972213745117,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.8677248677248677,
"grad_norm": 61.2857780456543,
"learning_rate": 2.659183991914696e-08,
"logits/chosen": 1.2357861995697021,
"logits/rejected": 1.3313536643981934,
"loss": 0.7915,
"step": 574
},
{
"beta_dpo/beta_used": 0.12381540983915329,
"beta_dpo/beta_used_raw": -0.13067613542079926,
"beta_dpo/gap_mean": 11.860994338989258,
"beta_dpo/gap_std": 19.543277740478516,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8692365835222978,
"grad_norm": 35.53083038330078,
"learning_rate": 2.600155642716606e-08,
"logits/chosen": 1.5488557815551758,
"logits/rejected": 1.1586174964904785,
"loss": 1.3028,
"step": 575
},
{
"beta_dpo/beta_used": 0.3507198095321655,
"beta_dpo/beta_used_raw": 0.3507198095321655,
"beta_dpo/gap_mean": 11.940589904785156,
"beta_dpo/gap_std": 19.488418579101562,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8707482993197279,
"grad_norm": 90.53150177001953,
"learning_rate": 2.5417538653170754e-08,
"logits/chosen": 1.8404762744903564,
"logits/rejected": 1.5911169052124023,
"loss": 1.0244,
"step": 576
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.2546420693397522,
"beta_dpo/gap_mean": 11.400903701782227,
"beta_dpo/gap_std": 19.448040008544922,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.872260015117158,
"grad_norm": 0.31355878710746765,
"learning_rate": 2.4839802933393607e-08,
"logits/chosen": 2.165390968322754,
"logits/rejected": 1.9945690631866455,
"loss": 1.3799,
"step": 577
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.14779648184776306,
"beta_dpo/gap_mean": 10.787927627563477,
"beta_dpo/gap_std": 19.160215377807617,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.873771730914588,
"grad_norm": 0.3057783842086792,
"learning_rate": 2.4268365428344733e-08,
"logits/chosen": 1.7347502708435059,
"logits/rejected": 1.4005095958709717,
"loss": 1.38,
"step": 578
},
{
"beta_dpo/beta_used": 0.04637397825717926,
"beta_dpo/beta_used_raw": 0.03942590579390526,
"beta_dpo/gap_mean": 10.95724105834961,
"beta_dpo/gap_std": 18.829822540283203,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8752834467120182,
"grad_norm": 12.994908332824707,
"learning_rate": 2.3703242122359357e-08,
"logits/chosen": 1.5472080707550049,
"logits/rejected": 1.3895456790924072,
"loss": 1.1182,
"step": 579
},
{
"beta_dpo/beta_used": 0.0774238333106041,
"beta_dpo/beta_used_raw": 0.02635762467980385,
"beta_dpo/gap_mean": 10.798433303833008,
"beta_dpo/gap_std": 19.00153350830078,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8767951625094482,
"grad_norm": 18.991615295410156,
"learning_rate": 2.3144448823151392e-08,
"logits/chosen": 1.3897788524627686,
"logits/rejected": 1.2619503736495972,
"loss": 1.0359,
"step": 580
},
{
"beta_dpo/beta_used": 0.20503893494606018,
"beta_dpo/beta_used_raw": 0.20503893494606018,
"beta_dpo/gap_mean": 10.974782943725586,
"beta_dpo/gap_std": 18.892879486083984,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8783068783068783,
"grad_norm": 46.59199142456055,
"learning_rate": 2.259200116137039e-08,
"logits/chosen": 1.6775740385055542,
"logits/rejected": 1.5639266967773438,
"loss": 1.0779,
"step": 581
},
{
"beta_dpo/beta_used": 0.08953151851892471,
"beta_dpo/beta_used_raw": 0.08953151851892471,
"beta_dpo/gap_mean": 10.719525337219238,
"beta_dpo/gap_std": 18.856456756591797,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8798185941043084,
"grad_norm": 28.192134857177734,
"learning_rate": 2.204591459016525e-08,
"logits/chosen": 1.0942572355270386,
"logits/rejected": 0.8079568147659302,
"loss": 1.3575,
"step": 582
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.047244954854249954,
"beta_dpo/gap_mean": 10.52918529510498,
"beta_dpo/gap_std": 19.494892120361328,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8813303099017384,
"grad_norm": 0.4001644253730774,
"learning_rate": 2.1506204384751064e-08,
"logits/chosen": 1.7894879579544067,
"logits/rejected": 1.7086610794067383,
"loss": 1.3775,
"step": 583
},
{
"beta_dpo/beta_used": 0.226608008146286,
"beta_dpo/beta_used_raw": 0.226608008146286,
"beta_dpo/gap_mean": 10.642799377441406,
"beta_dpo/gap_std": 19.788650512695312,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.8828420256991686,
"grad_norm": 68.65081787109375,
"learning_rate": 2.09728856419826e-08,
"logits/chosen": 1.6787614822387695,
"logits/rejected": 1.416142225265503,
"loss": 1.4486,
"step": 584
},
{
"beta_dpo/beta_used": 0.147002711892128,
"beta_dpo/beta_used_raw": 0.09757278859615326,
"beta_dpo/gap_mean": 10.093536376953125,
"beta_dpo/gap_std": 19.318706512451172,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.8843537414965986,
"grad_norm": 46.095481872558594,
"learning_rate": 2.044597327993153e-08,
"logits/chosen": 1.482553243637085,
"logits/rejected": 1.1475740671157837,
"loss": 1.0405,
"step": 585
},
{
"beta_dpo/beta_used": 0.23156246542930603,
"beta_dpo/beta_used_raw": 0.23156246542930603,
"beta_dpo/gap_mean": 10.390697479248047,
"beta_dpo/gap_std": 18.974735260009766,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.8858654572940288,
"grad_norm": 73.39409637451172,
"learning_rate": 1.9925482037469187e-08,
"logits/chosen": 1.423257827758789,
"logits/rejected": 1.4906511306762695,
"loss": 1.2219,
"step": 586
},
{
"beta_dpo/beta_used": 0.12944234907627106,
"beta_dpo/beta_used_raw": 0.08352088928222656,
"beta_dpo/gap_mean": 11.080392837524414,
"beta_dpo/gap_std": 19.111534118652344,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.8873771730914588,
"grad_norm": 83.24421691894531,
"learning_rate": 1.9411426473854687e-08,
"logits/chosen": 1.5286226272583008,
"logits/rejected": 1.362818956375122,
"loss": 1.2369,
"step": 587
},
{
"beta_dpo/beta_used": 0.20540329813957214,
"beta_dpo/beta_used_raw": 0.20540329813957214,
"beta_dpo/gap_mean": 11.472732543945312,
"beta_dpo/gap_std": 18.702571868896484,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8888888888888888,
"grad_norm": 45.81542205810547,
"learning_rate": 1.890382096832699e-08,
"logits/chosen": 1.0323173999786377,
"logits/rejected": 0.911257266998291,
"loss": 0.8494,
"step": 588
},
{
"beta_dpo/beta_used": 0.2612083852291107,
"beta_dpo/beta_used_raw": 0.2612083852291107,
"beta_dpo/gap_mean": 11.75611686706543,
"beta_dpo/gap_std": 18.558521270751953,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.890400604686319,
"grad_norm": 55.26761245727539,
"learning_rate": 1.840267971970344e-08,
"logits/chosen": 1.2208616733551025,
"logits/rejected": 1.25350022315979,
"loss": 0.6308,
"step": 589
},
{
"beta_dpo/beta_used": 0.06067529320716858,
"beta_dpo/beta_used_raw": 0.06067529320716858,
"beta_dpo/gap_mean": 12.026023864746094,
"beta_dpo/gap_std": 18.19622802734375,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.891912320483749,
"grad_norm": 14.301695823669434,
"learning_rate": 1.7908016745981856e-08,
"logits/chosen": 1.392610788345337,
"logits/rejected": 1.33760404586792,
"loss": 0.9256,
"step": 590
},
{
"beta_dpo/beta_used": 0.1715593785047531,
"beta_dpo/beta_used_raw": 0.06642448157072067,
"beta_dpo/gap_mean": 12.463174819946289,
"beta_dpo/gap_std": 18.169431686401367,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8934240362811792,
"grad_norm": 47.96595001220703,
"learning_rate": 1.7419845883949098e-08,
"logits/chosen": 1.46830153465271,
"logits/rejected": 1.4073936939239502,
"loss": 1.2754,
"step": 591
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.3008629381656647,
"beta_dpo/gap_mean": 12.13792610168457,
"beta_dpo/gap_std": 18.62552833557129,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8949357520786092,
"grad_norm": 0.33993443846702576,
"learning_rate": 1.6938180788793556e-08,
"logits/chosen": 1.4184048175811768,
"logits/rejected": 1.281282663345337,
"loss": 1.3805,
"step": 592
},
{
"beta_dpo/beta_used": 0.01834931969642639,
"beta_dpo/beta_used_raw": 0.01834931969642639,
"beta_dpo/gap_mean": 12.079296112060547,
"beta_dpo/gap_std": 18.619632720947266,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.8964474678760394,
"grad_norm": 9.732373237609863,
"learning_rate": 1.6463034933723336e-08,
"logits/chosen": 1.4756786823272705,
"logits/rejected": 1.439645528793335,
"loss": 1.2391,
"step": 593
},
{
"beta_dpo/beta_used": 0.03766559436917305,
"beta_dpo/beta_used_raw": -0.07916873693466187,
"beta_dpo/gap_mean": 11.784311294555664,
"beta_dpo/gap_std": 18.663883209228516,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.8979591836734694,
"grad_norm": 13.725388526916504,
"learning_rate": 1.5994421609589385e-08,
"logits/chosen": 1.5952414274215698,
"logits/rejected": 1.5322446823120117,
"loss": 1.1311,
"step": 594
},
{
"beta_dpo/beta_used": 0.29605233669281006,
"beta_dpo/beta_used_raw": 0.29605233669281006,
"beta_dpo/gap_mean": 11.810873031616211,
"beta_dpo/gap_std": 19.1055850982666,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.8994708994708994,
"grad_norm": 77.941650390625,
"learning_rate": 1.553235392451377e-08,
"logits/chosen": 1.9404058456420898,
"logits/rejected": 1.6339752674102783,
"loss": 1.0055,
"step": 595
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.19658523797988892,
"beta_dpo/gap_mean": 11.020740509033203,
"beta_dpo/gap_std": 19.33443832397461,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9009826152683296,
"grad_norm": 0.28924560546875,
"learning_rate": 1.507684480352292e-08,
"logits/chosen": 1.3958216905593872,
"logits/rejected": 1.2835626602172852,
"loss": 1.3809,
"step": 596
},
{
"beta_dpo/beta_used": 0.11626582592725754,
"beta_dpo/beta_used_raw": 0.09599099308252335,
"beta_dpo/gap_mean": 10.842538833618164,
"beta_dpo/gap_std": 19.066997528076172,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9024943310657596,
"grad_norm": 29.83753776550293,
"learning_rate": 1.4627906988186111e-08,
"logits/chosen": 1.3235628604888916,
"logits/rejected": 1.3475373983383179,
"loss": 1.0947,
"step": 597
},
{
"beta_dpo/beta_used": 0.00658452557399869,
"beta_dpo/beta_used_raw": -0.0820087194442749,
"beta_dpo/gap_mean": 10.465547561645508,
"beta_dpo/gap_std": 18.86764907836914,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9040060468631897,
"grad_norm": 2.736207962036133,
"learning_rate": 1.4185553036259095e-08,
"logits/chosen": 0.8764083385467529,
"logits/rejected": 0.8989740610122681,
"loss": 1.3336,
"step": 598
},
{
"beta_dpo/beta_used": 0.01312843058258295,
"beta_dpo/beta_used_raw": -0.017385948449373245,
"beta_dpo/gap_mean": 10.36957836151123,
"beta_dpo/gap_std": 18.702917098999023,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9055177626606198,
"grad_norm": 5.408941268920898,
"learning_rate": 1.3749795321332885e-08,
"logits/chosen": 1.087230920791626,
"logits/rejected": 1.0524613857269287,
"loss": 1.2818,
"step": 599
},
{
"beta_dpo/beta_used": 0.10121805220842361,
"beta_dpo/beta_used_raw": 0.10121805220842361,
"beta_dpo/gap_mean": 9.870223045349121,
"beta_dpo/gap_std": 18.549135208129883,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.9070294784580499,
"grad_norm": 29.112049102783203,
"learning_rate": 1.3320646032487393e-08,
"logits/chosen": 1.4981776475906372,
"logits/rejected": 1.579132080078125,
"loss": 1.0636,
"step": 600
},
{
"epoch": 0.9070294784580499,
"eval_beta_dpo/beta_used": 0.19951747357845306,
"eval_beta_dpo/beta_used_raw": 0.18094317615032196,
"eval_beta_dpo/gap_mean": 9.920242309570312,
"eval_beta_dpo/gap_std": 18.34697914123535,
"eval_beta_dpo/mask_keep_frac": 1.0,
"eval_logits/chosen": 1.5448524951934814,
"eval_logits/rejected": 1.4136770963668823,
"eval_loss": 0.725577175617218,
"eval_runtime": 42.6427,
"eval_samples_per_second": 54.007,
"eval_steps_per_second": 1.688,
"step": 600
},
{
"beta_dpo/beta_used": 0.30988115072250366,
"beta_dpo/beta_used_raw": 0.30988115072250366,
"beta_dpo/gap_mean": 10.584239959716797,
"beta_dpo/gap_std": 18.33379364013672,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.90854119425548,
"grad_norm": 72.30387878417969,
"learning_rate": 1.2898117173950868e-08,
"logits/chosen": 1.4453303813934326,
"logits/rejected": 1.3735606670379639,
"loss": 1.0263,
"step": 601
},
{
"beta_dpo/beta_used": 0.22164717316627502,
"beta_dpo/beta_used_raw": 0.22164717316627502,
"beta_dpo/gap_mean": 10.797483444213867,
"beta_dpo/gap_std": 17.95302963256836,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.91005291005291,
"grad_norm": 64.73311614990234,
"learning_rate": 1.2482220564763667e-08,
"logits/chosen": 1.5381059646606445,
"logits/rejected": 1.3643220663070679,
"loss": 0.875,
"step": 602
},
{
"beta_dpo/beta_used": 0.046099040657281876,
"beta_dpo/beta_used_raw": 0.046099040657281876,
"beta_dpo/gap_mean": 11.128530502319336,
"beta_dpo/gap_std": 17.995418548583984,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9115646258503401,
"grad_norm": 12.024964332580566,
"learning_rate": 1.2072967838448051e-08,
"logits/chosen": 1.3041894435882568,
"logits/rejected": 0.8876796960830688,
"loss": 1.0542,
"step": 603
},
{
"beta_dpo/beta_used": 0.10406889021396637,
"beta_dpo/beta_used_raw": 0.10406889021396637,
"beta_dpo/gap_mean": 10.929356575012207,
"beta_dpo/gap_std": 18.385833740234375,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9130763416477702,
"grad_norm": 30.560834884643555,
"learning_rate": 1.1670370442682459e-08,
"logits/chosen": 1.3623418807983398,
"logits/rejected": 1.2177022695541382,
"loss": 1.1108,
"step": 604
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.1141221672296524,
"beta_dpo/gap_mean": 10.722838401794434,
"beta_dpo/gap_std": 18.810199737548828,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9145880574452003,
"grad_norm": 0.3671630620956421,
"learning_rate": 1.1274439638981532e-08,
"logits/chosen": 1.7869694232940674,
"logits/rejected": 1.4546797275543213,
"loss": 1.3779,
"step": 605
},
{
"beta_dpo/beta_used": 0.06800004839897156,
"beta_dpo/beta_used_raw": -0.004727482795715332,
"beta_dpo/gap_mean": 10.593547821044922,
"beta_dpo/gap_std": 18.50115394592285,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9160997732426304,
"grad_norm": 18.27583885192871,
"learning_rate": 1.0885186502381016e-08,
"logits/chosen": 1.588325023651123,
"logits/rejected": 1.3476459980010986,
"loss": 1.0867,
"step": 606
},
{
"beta_dpo/beta_used": 0.08275961130857468,
"beta_dpo/beta_used_raw": 0.021806050091981888,
"beta_dpo/gap_mean": 11.124649047851562,
"beta_dpo/gap_std": 18.29534912109375,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9176114890400605,
"grad_norm": 28.003469467163086,
"learning_rate": 1.0502621921127774e-08,
"logits/chosen": 0.8255990743637085,
"logits/rejected": 1.3248374462127686,
"loss": 1.0785,
"step": 607
},
{
"beta_dpo/beta_used": 0.05329656973481178,
"beta_dpo/beta_used_raw": 0.00821135938167572,
"beta_dpo/gap_mean": 10.249990463256836,
"beta_dpo/gap_std": 18.143394470214844,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9191232048374905,
"grad_norm": 18.02731704711914,
"learning_rate": 1.0126756596375685e-08,
"logits/chosen": 1.4278472661972046,
"logits/rejected": 1.3889408111572266,
"loss": 1.1801,
"step": 608
},
{
"beta_dpo/beta_used": 0.20283672213554382,
"beta_dpo/beta_used_raw": 0.20283672213554382,
"beta_dpo/gap_mean": 10.51877212524414,
"beta_dpo/gap_std": 17.66318702697754,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9206349206349206,
"grad_norm": 51.436180114746094,
"learning_rate": 9.757601041885694e-09,
"logits/chosen": 1.7704598903656006,
"logits/rejected": 1.4652361869812012,
"loss": 0.9272,
"step": 609
},
{
"beta_dpo/beta_used": 0.06131238117814064,
"beta_dpo/beta_used_raw": 0.020395085215568542,
"beta_dpo/gap_mean": 10.665861129760742,
"beta_dpo/gap_std": 17.976917266845703,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9221466364323507,
"grad_norm": 21.912073135375977,
"learning_rate": 9.395165583732379e-09,
"logits/chosen": 1.5744285583496094,
"logits/rejected": 1.679150104522705,
"loss": 1.1984,
"step": 610
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.09681444615125656,
"beta_dpo/gap_mean": 10.473505973815918,
"beta_dpo/gap_std": 18.203094482421875,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9236583522297808,
"grad_norm": 0.31549084186553955,
"learning_rate": 9.03946036001449e-09,
"logits/chosen": 1.1862019300460815,
"logits/rejected": 1.1827809810638428,
"loss": 1.3791,
"step": 611
},
{
"beta_dpo/beta_used": 0.1381537914276123,
"beta_dpo/beta_used_raw": 0.13283728063106537,
"beta_dpo/gap_mean": 10.559354782104492,
"beta_dpo/gap_std": 18.345184326171875,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.9251700680272109,
"grad_norm": 45.5057258605957,
"learning_rate": 8.690495320571839e-09,
"logits/chosen": 0.7239348292350769,
"logits/rejected": 0.553448498249054,
"loss": 1.0332,
"step": 612
},
{
"beta_dpo/beta_used": 0.44979095458984375,
"beta_dpo/beta_used_raw": 0.44979095458984375,
"beta_dpo/gap_mean": 11.126469612121582,
"beta_dpo/gap_std": 18.617183685302734,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.926681783824641,
"grad_norm": 74.70843505859375,
"learning_rate": 8.348280226706722e-09,
"logits/chosen": 0.8377181887626648,
"logits/rejected": 0.8131814002990723,
"loss": 0.5414,
"step": 613
},
{
"beta_dpo/beta_used": 0.15282967686653137,
"beta_dpo/beta_used_raw": 0.15282967686653137,
"beta_dpo/gap_mean": 11.11674690246582,
"beta_dpo/gap_std": 18.060022354125977,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.9281934996220711,
"grad_norm": 37.34785842895508,
"learning_rate": 8.012824650910937e-09,
"logits/chosen": 2.0333778858184814,
"logits/rejected": 1.645104169845581,
"loss": 0.8374,
"step": 614
},
{
"beta_dpo/beta_used": 0.09008196741342545,
"beta_dpo/beta_used_raw": 0.08506174385547638,
"beta_dpo/gap_mean": 11.391464233398438,
"beta_dpo/gap_std": 18.11256980895996,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9297052154195011,
"grad_norm": 30.425193786621094,
"learning_rate": 7.684137976598088e-09,
"logits/chosen": 1.6812443733215332,
"logits/rejected": 1.6477621793746948,
"loss": 1.2128,
"step": 615
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.07090554386377335,
"beta_dpo/gap_mean": 11.225221633911133,
"beta_dpo/gap_std": 18.369062423706055,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9312169312169312,
"grad_norm": 0.36726945638656616,
"learning_rate": 7.36222939784098e-09,
"logits/chosen": 1.408949851989746,
"logits/rejected": 1.2780832052230835,
"loss": 1.3783,
"step": 616
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.047676198184490204,
"beta_dpo/gap_mean": 10.978787422180176,
"beta_dpo/gap_std": 18.129138946533203,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9327286470143613,
"grad_norm": 0.35759875178337097,
"learning_rate": 7.047107919114586e-09,
"logits/chosen": 1.7381243705749512,
"logits/rejected": 1.6514561176300049,
"loss": 1.3779,
"step": 617
},
{
"beta_dpo/beta_used": 0.048610154539346695,
"beta_dpo/beta_used_raw": 0.02857622131705284,
"beta_dpo/gap_mean": 10.871879577636719,
"beta_dpo/gap_std": 18.12830352783203,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9342403628117913,
"grad_norm": 19.95886993408203,
"learning_rate": 6.738782355044048e-09,
"logits/chosen": 1.4191548824310303,
"logits/rejected": 1.0452890396118164,
"loss": 1.1654,
"step": 618
},
{
"beta_dpo/beta_used": 0.24798354506492615,
"beta_dpo/beta_used_raw": 0.24798354506492615,
"beta_dpo/gap_mean": 11.172018051147461,
"beta_dpo/gap_std": 18.40379524230957,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.9357520786092215,
"grad_norm": 59.49594497680664,
"learning_rate": 6.437261330158206e-09,
"logits/chosen": 1.8096110820770264,
"logits/rejected": 1.7222764492034912,
"loss": 0.9241,
"step": 619
},
{
"beta_dpo/beta_used": 0.10477973520755768,
"beta_dpo/beta_used_raw": 0.04265592247247696,
"beta_dpo/gap_mean": 10.613101959228516,
"beta_dpo/gap_std": 18.306976318359375,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9372637944066515,
"grad_norm": 33.00358581542969,
"learning_rate": 6.142553278648238e-09,
"logits/chosen": 1.4938249588012695,
"logits/rejected": 1.4289090633392334,
"loss": 1.0332,
"step": 620
},
{
"beta_dpo/beta_used": 0.02596830204129219,
"beta_dpo/beta_used_raw": 0.004421204328536987,
"beta_dpo/gap_mean": 10.533326148986816,
"beta_dpo/gap_std": 18.14520263671875,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.9387755102040817,
"grad_norm": 8.95301342010498,
"learning_rate": 5.854666444131934e-09,
"logits/chosen": 1.3044579029083252,
"logits/rejected": 1.5364813804626465,
"loss": 1.2235,
"step": 621
},
{
"beta_dpo/beta_used": 0.303781121969223,
"beta_dpo/beta_used_raw": 0.303781121969223,
"beta_dpo/gap_mean": 10.607638359069824,
"beta_dpo/gap_std": 17.87635040283203,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9402872260015117,
"grad_norm": 52.88166046142578,
"learning_rate": 5.573608879422875e-09,
"logits/chosen": 1.4276492595672607,
"logits/rejected": 1.4481043815612793,
"loss": 0.6664,
"step": 622
},
{
"beta_dpo/beta_used": 0.08703246712684631,
"beta_dpo/beta_used_raw": 0.06431964039802551,
"beta_dpo/gap_mean": 10.673839569091797,
"beta_dpo/gap_std": 17.585182189941406,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9417989417989417,
"grad_norm": 20.131181716918945,
"learning_rate": 5.299388446305342e-09,
"logits/chosen": 1.686722993850708,
"logits/rejected": 1.3298940658569336,
"loss": 0.9207,
"step": 623
},
{
"beta_dpo/beta_used": 0.2075013518333435,
"beta_dpo/beta_used_raw": 0.2075013518333435,
"beta_dpo/gap_mean": 11.12516975402832,
"beta_dpo/gap_std": 17.868022918701172,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9433106575963719,
"grad_norm": 44.760963439941406,
"learning_rate": 5.03201281531429e-09,
"logits/chosen": 1.1061675548553467,
"logits/rejected": 1.0347222089767456,
"loss": 0.9306,
"step": 624
},
{
"beta_dpo/beta_used": 0.0016511206049472094,
"beta_dpo/beta_used_raw": -0.07338549196720123,
"beta_dpo/gap_mean": 10.489020347595215,
"beta_dpo/gap_std": 17.979766845703125,
"beta_dpo/mask_keep_frac": 1.0,
"epoch": 0.9448223733938019,
"grad_norm": 0.6475630402565002,
"learning_rate": 4.7714894655209174e-09,
"logits/chosen": 1.7902281284332275,
"logits/rejected": 1.5662107467651367,
"loss": 1.3732,
"step": 625
},
{
"beta_dpo/beta_used": 0.11615270376205444,
"beta_dpo/beta_used_raw": 0.046706706285476685,
"beta_dpo/gap_mean": 10.757328987121582,
"beta_dpo/gap_std": 18.39080238342285,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9463340891912321,
"grad_norm": 60.5959587097168,
"learning_rate": 4.517825684323323e-09,
"logits/chosen": 2.184417486190796,
"logits/rejected": 1.984039545059204,
"loss": 1.4281,
"step": 626
},
{
"beta_dpo/beta_used": 0.07429970800876617,
"beta_dpo/beta_used_raw": 0.005273900926113129,
"beta_dpo/gap_mean": 11.423591613769531,
"beta_dpo/gap_std": 18.419429779052734,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9478458049886621,
"grad_norm": 16.982189178466797,
"learning_rate": 4.271028567242818e-09,
"logits/chosen": 1.4129197597503662,
"logits/rejected": 1.0711102485656738,
"loss": 0.9599,
"step": 627
},
{
"beta_dpo/beta_used": 0.342271089553833,
"beta_dpo/beta_used_raw": 0.342271089553833,
"beta_dpo/gap_mean": 11.820939064025879,
"beta_dpo/gap_std": 19.3472957611084,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.9493575207860923,
"grad_norm": 105.21951293945312,
"learning_rate": 4.0311050177251895e-09,
"logits/chosen": 1.5985822677612305,
"logits/rejected": 1.7508151531219482,
"loss": 1.1523,
"step": 628
},
{
"beta_dpo/beta_used": 0.012321592308580875,
"beta_dpo/beta_used_raw": -0.017206501215696335,
"beta_dpo/gap_mean": 12.163355827331543,
"beta_dpo/gap_std": 18.93172836303711,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.9508692365835223,
"grad_norm": 4.553313255310059,
"learning_rate": 3.798061746947995e-09,
"logits/chosen": 1.690114974975586,
"logits/rejected": 1.825777530670166,
"loss": 1.2731,
"step": 629
},
{
"beta_dpo/beta_used": 0.11094523966312408,
"beta_dpo/beta_used_raw": 0.11094523966312408,
"beta_dpo/gap_mean": 12.257745742797852,
"beta_dpo/gap_std": 18.752323150634766,
"beta_dpo/mask_keep_frac": 0.4375,
"epoch": 0.9523809523809523,
"grad_norm": 21.555618286132812,
"learning_rate": 3.5719052736323806e-09,
"logits/chosen": 1.6641654968261719,
"logits/rejected": 1.4665703773498535,
"loss": 0.8095,
"step": 630
},
{
"beta_dpo/beta_used": 0.12940296530723572,
"beta_dpo/beta_used_raw": 0.12940296530723572,
"beta_dpo/gap_mean": 12.77896499633789,
"beta_dpo/gap_std": 18.95907974243164,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9538926681783825,
"grad_norm": 33.42890548706055,
"learning_rate": 3.352641923861144e-09,
"logits/chosen": 2.0447068214416504,
"logits/rejected": 1.8289787769317627,
"loss": 0.9098,
"step": 631
},
{
"beta_dpo/beta_used": 0.24121464788913727,
"beta_dpo/beta_used_raw": 0.24121464788913727,
"beta_dpo/gap_mean": 12.694576263427734,
"beta_dpo/gap_std": 19.064842224121094,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.9554043839758125,
"grad_norm": 85.89191436767578,
"learning_rate": 3.140277830901428e-09,
"logits/chosen": 2.1072683334350586,
"logits/rejected": 1.948132038116455,
"loss": 1.1471,
"step": 632
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.03531520441174507,
"beta_dpo/gap_mean": 12.826020240783691,
"beta_dpo/gap_std": 19.142948150634766,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9569160997732427,
"grad_norm": 0.4302322268486023,
"learning_rate": 2.9348189350335007e-09,
"logits/chosen": 1.1234136819839478,
"logits/rejected": 0.9836254715919495,
"loss": 1.3753,
"step": 633
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.29016977548599243,
"beta_dpo/gap_mean": 11.909151077270508,
"beta_dpo/gap_std": 19.207143783569336,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9584278155706727,
"grad_norm": 0.31137824058532715,
"learning_rate": 2.736270983384276e-09,
"logits/chosen": 1.444591999053955,
"logits/rejected": 1.4112733602523804,
"loss": 1.3815,
"step": 634
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.12229815125465393,
"beta_dpo/gap_mean": 11.218611717224121,
"beta_dpo/gap_std": 18.949787139892578,
"beta_dpo/mask_keep_frac": 0.5625,
"epoch": 0.9599395313681028,
"grad_norm": 0.361990749835968,
"learning_rate": 2.5446395297668287e-09,
"logits/chosen": 1.7448804378509521,
"logits/rejected": 1.665621280670166,
"loss": 1.3786,
"step": 635
},
{
"beta_dpo/beta_used": 0.21137313544750214,
"beta_dpo/beta_used_raw": 0.21137313544750214,
"beta_dpo/gap_mean": 11.505717277526855,
"beta_dpo/gap_std": 18.636005401611328,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9614512471655329,
"grad_norm": 66.32644653320312,
"learning_rate": 2.359929934524829e-09,
"logits/chosen": 1.6981236934661865,
"logits/rejected": 1.3314778804779053,
"loss": 0.9321,
"step": 636
},
{
"beta_dpo/beta_used": 0.017410093918442726,
"beta_dpo/beta_used_raw": -0.047742683440446854,
"beta_dpo/gap_mean": 11.64794921875,
"beta_dpo/gap_std": 18.616622924804688,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9629629629629629,
"grad_norm": 5.9650092124938965,
"learning_rate": 2.1821473643827137e-09,
"logits/chosen": 1.6464264392852783,
"logits/rejected": 1.3672943115234375,
"loss": 1.2547,
"step": 637
},
{
"beta_dpo/beta_used": 0.11326654255390167,
"beta_dpo/beta_used_raw": 0.11326654255390167,
"beta_dpo/gap_mean": 11.702375411987305,
"beta_dpo/gap_std": 18.63088607788086,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9644746787603931,
"grad_norm": 34.071075439453125,
"learning_rate": 2.0112967923011646e-09,
"logits/chosen": 0.9265055656433105,
"logits/rejected": 0.8722898364067078,
"loss": 0.888,
"step": 638
},
{
"beta_dpo/beta_used": 0.09828314930200577,
"beta_dpo/beta_used_raw": 0.07606379687786102,
"beta_dpo/gap_mean": 11.771888732910156,
"beta_dpo/gap_std": 18.478546142578125,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9659863945578231,
"grad_norm": 26.12000274658203,
"learning_rate": 1.847382997337943e-09,
"logits/chosen": 1.293177604675293,
"logits/rejected": 0.9671316146850586,
"loss": 1.0725,
"step": 639
},
{
"beta_dpo/beta_used": 0.08824127167463303,
"beta_dpo/beta_used_raw": 0.05229192599654198,
"beta_dpo/gap_mean": 11.435354232788086,
"beta_dpo/gap_std": 18.631763458251953,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9674981103552532,
"grad_norm": 26.251401901245117,
"learning_rate": 1.690410564514244e-09,
"logits/chosen": 1.726629376411438,
"logits/rejected": 1.3301911354064941,
"loss": 1.0781,
"step": 640
},
{
"beta_dpo/beta_used": 0.09193282574415207,
"beta_dpo/beta_used_raw": 0.03977712616324425,
"beta_dpo/gap_mean": 11.201648712158203,
"beta_dpo/gap_std": 18.50255584716797,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9690098261526833,
"grad_norm": 32.0823860168457,
"learning_rate": 1.5403838846864692e-09,
"logits/chosen": 1.4812743663787842,
"logits/rejected": 1.6599016189575195,
"loss": 1.0413,
"step": 641
},
{
"beta_dpo/beta_used": 0.040066223591566086,
"beta_dpo/beta_used_raw": -0.04489295557141304,
"beta_dpo/gap_mean": 11.188066482543945,
"beta_dpo/gap_std": 18.518863677978516,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9705215419501134,
"grad_norm": 13.173298835754395,
"learning_rate": 1.3973071544233218e-09,
"logits/chosen": 1.4061027765274048,
"logits/rejected": 1.4609074592590332,
"loss": 1.1875,
"step": 642
},
{
"beta_dpo/beta_used": 0.10887844115495682,
"beta_dpo/beta_used_raw": 0.10887844115495682,
"beta_dpo/gap_mean": 11.167619705200195,
"beta_dpo/gap_std": 18.79291534423828,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9720332577475435,
"grad_norm": 31.67186164855957,
"learning_rate": 1.261184375888541e-09,
"logits/chosen": 1.539642572402954,
"logits/rejected": 1.2710895538330078,
"loss": 1.042,
"step": 643
},
{
"beta_dpo/beta_used": 0.20992937684059143,
"beta_dpo/beta_used_raw": 0.20992937684059143,
"beta_dpo/gap_mean": 10.766650199890137,
"beta_dpo/gap_std": 19.097431182861328,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9735449735449735,
"grad_norm": 53.034210205078125,
"learning_rate": 1.1320193567288527e-09,
"logits/chosen": 1.723166584968567,
"logits/rejected": 1.595113754272461,
"loss": 1.1572,
"step": 644
},
{
"beta_dpo/beta_used": 0.2538291811943054,
"beta_dpo/beta_used_raw": 0.2538291811943054,
"beta_dpo/gap_mean": 11.259082794189453,
"beta_dpo/gap_std": 18.895366668701172,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.9750566893424036,
"grad_norm": 50.3277702331543,
"learning_rate": 1.0098157099674987e-09,
"logits/chosen": 1.4872803688049316,
"logits/rejected": 1.5161464214324951,
"loss": 0.6968,
"step": 645
},
{
"beta_dpo/beta_used": 0.02309798076748848,
"beta_dpo/beta_used_raw": -0.013568395748734474,
"beta_dpo/gap_mean": 11.416717529296875,
"beta_dpo/gap_std": 18.746578216552734,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.9765684051398337,
"grad_norm": 8.568411827087402,
"learning_rate": 8.945768539031783e-10,
"logits/chosen": 1.5456962585449219,
"logits/rejected": 1.2485579252243042,
"loss": 1.1993,
"step": 646
},
{
"beta_dpo/beta_used": 0.26212525367736816,
"beta_dpo/beta_used_raw": 0.26212525367736816,
"beta_dpo/gap_mean": 12.01045036315918,
"beta_dpo/gap_std": 18.542598724365234,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.9780801209372638,
"grad_norm": 39.779212951660156,
"learning_rate": 7.863060120144316e-10,
"logits/chosen": 1.6833382844924927,
"logits/rejected": 1.4121618270874023,
"loss": 0.4523,
"step": 647
},
{
"beta_dpo/beta_used": 0.12411724776029587,
"beta_dpo/beta_used_raw": 0.1003761738538742,
"beta_dpo/gap_mean": 12.072593688964844,
"beta_dpo/gap_std": 18.08414077758789,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.9795918367346939,
"grad_norm": 34.08300018310547,
"learning_rate": 6.850062128694045e-10,
"logits/chosen": 1.20827054977417,
"logits/rejected": 1.0931397676467896,
"loss": 0.9287,
"step": 648
},
{
"beta_dpo/beta_used": 0.2505728006362915,
"beta_dpo/beta_used_raw": 0.2505728006362915,
"beta_dpo/gap_mean": 12.009123802185059,
"beta_dpo/gap_std": 18.238069534301758,
"beta_dpo/mask_keep_frac": 0.9375,
"epoch": 0.981103552532124,
"grad_norm": 70.92848205566406,
"learning_rate": 5.906802900412788e-10,
"logits/chosen": 1.342724084854126,
"logits/rejected": 1.1755287647247314,
"loss": 0.704,
"step": 649
},
{
"beta_dpo/beta_used": 0.23393958806991577,
"beta_dpo/beta_used_raw": 0.23393958806991577,
"beta_dpo/gap_mean": 11.982925415039062,
"beta_dpo/gap_std": 18.271129608154297,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.982615268329554,
"grad_norm": 65.76042938232422,
"learning_rate": 5.033308820289184e-10,
"logits/chosen": 1.873583197593689,
"logits/rejected": 1.529442310333252,
"loss": 1.1006,
"step": 650
},
{
"beta_dpo/beta_used": 0.011995721608400345,
"beta_dpo/beta_used_raw": -0.07203161716461182,
"beta_dpo/gap_mean": 11.841169357299805,
"beta_dpo/gap_std": 17.955848693847656,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9841269841269841,
"grad_norm": 6.071944713592529,
"learning_rate": 4.2296043218295606e-10,
"logits/chosen": 1.3133518695831299,
"logits/rejected": 1.2405388355255127,
"loss": 1.2771,
"step": 651
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.07919080555438995,
"beta_dpo/gap_mean": 11.827400207519531,
"beta_dpo/gap_std": 18.362974166870117,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.9856386999244142,
"grad_norm": 0.3452511429786682,
"learning_rate": 3.4957118863768176e-10,
"logits/chosen": 1.775701642036438,
"logits/rejected": 1.7335072755813599,
"loss": 1.3775,
"step": 652
},
{
"beta_dpo/beta_used": 0.11402089893817902,
"beta_dpo/beta_used_raw": 0.11402089893817902,
"beta_dpo/gap_mean": 11.791513442993164,
"beta_dpo/gap_std": 18.576650619506836,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9871504157218443,
"grad_norm": 22.66193962097168,
"learning_rate": 2.831652042480093e-10,
"logits/chosen": 1.2139712572097778,
"logits/rejected": 1.2871928215026855,
"loss": 0.8905,
"step": 653
},
{
"beta_dpo/beta_used": 0.09395039081573486,
"beta_dpo/beta_used_raw": -0.025495566427707672,
"beta_dpo/gap_mean": 11.91246223449707,
"beta_dpo/gap_std": 18.69017791748047,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9886621315192744,
"grad_norm": 33.95069885253906,
"learning_rate": 2.2374433653205016e-10,
"logits/chosen": 1.3610148429870605,
"logits/rejected": 1.0752047300338745,
"loss": 1.0777,
"step": 654
},
{
"beta_dpo/beta_used": 0.01329093612730503,
"beta_dpo/beta_used_raw": -0.07511409372091293,
"beta_dpo/gap_mean": 11.890439987182617,
"beta_dpo/gap_std": 18.40912628173828,
"beta_dpo/mask_keep_frac": 1.0,
"epoch": 0.9901738473167044,
"grad_norm": 4.47454309463501,
"learning_rate": 1.7131024761923852e-10,
"logits/chosen": 1.2066938877105713,
"logits/rejected": 1.2044000625610352,
"loss": 1.2717,
"step": 655
},
{
"beta_dpo/beta_used": 0.13312599062919617,
"beta_dpo/beta_used_raw": 0.13312599062919617,
"beta_dpo/gap_mean": 11.72813892364502,
"beta_dpo/gap_std": 18.383424758911133,
"beta_dpo/mask_keep_frac": 0.625,
"epoch": 0.9916855631141346,
"grad_norm": 26.90215301513672,
"learning_rate": 1.2586440420372934e-10,
"logits/chosen": 1.0210485458374023,
"logits/rejected": 1.2208218574523926,
"loss": 0.7741,
"step": 656
},
{
"beta_dpo/beta_used": 0.2219039648771286,
"beta_dpo/beta_used_raw": 0.2219039648771286,
"beta_dpo/gap_mean": 12.457931518554688,
"beta_dpo/gap_std": 18.564159393310547,
"beta_dpo/mask_keep_frac": 0.6875,
"epoch": 0.9931972789115646,
"grad_norm": 55.28135681152344,
"learning_rate": 8.740807750345913e-11,
"logits/chosen": 1.7017799615859985,
"logits/rejected": 1.3097267150878906,
"loss": 0.6586,
"step": 657
},
{
"beta_dpo/beta_used": 0.10924780368804932,
"beta_dpo/beta_used_raw": 0.10924780368804932,
"beta_dpo/gap_mean": 12.133644104003906,
"beta_dpo/gap_std": 18.873802185058594,
"beta_dpo/mask_keep_frac": 0.75,
"epoch": 0.9947089947089947,
"grad_norm": 25.044599533081055,
"learning_rate": 5.594234322453539e-11,
"logits/chosen": 1.8211889266967773,
"logits/rejected": 1.6856117248535156,
"loss": 0.8992,
"step": 658
},
{
"beta_dpo/beta_used": 0.03363973647356033,
"beta_dpo/beta_used_raw": -0.06083240360021591,
"beta_dpo/gap_mean": 11.746139526367188,
"beta_dpo/gap_std": 19.16648292541504,
"beta_dpo/mask_keep_frac": 0.875,
"epoch": 0.9962207105064248,
"grad_norm": 10.613242149353027,
"learning_rate": 3.146808153123293e-11,
"logits/chosen": 1.6447190046310425,
"logits/rejected": 1.2653954029083252,
"loss": 1.1776,
"step": 659
},
{
"beta_dpo/beta_used": 0.3293432593345642,
"beta_dpo/beta_used_raw": 0.3293432593345642,
"beta_dpo/gap_mean": 11.700296401977539,
"beta_dpo/gap_std": 18.658584594726562,
"beta_dpo/mask_keep_frac": 0.8125,
"epoch": 0.9977324263038548,
"grad_norm": 89.82917022705078,
"learning_rate": 1.3985977021235829e-11,
"logits/chosen": 1.4930264949798584,
"logits/rejected": 1.5581485033035278,
"loss": 1.0381,
"step": 660
},
{
"beta_dpo/beta_used": 0.0010000000474974513,
"beta_dpo/beta_used_raw": -0.07245530188083649,
"beta_dpo/gap_mean": 11.2869873046875,
"beta_dpo/gap_std": 18.77579689025879,
"beta_dpo/mask_keep_frac": 0.5625,
"epoch": 0.999244142101285,
"grad_norm": 0.3315927982330322,
"learning_rate": 3.4965187065971735e-12,
"logits/chosen": 1.457210898399353,
"logits/rejected": 1.4589345455169678,
"loss": 1.3771,
"step": 661
},
{
"epoch": 0.999244142101285,
"step": 661,
"total_flos": 0.0,
"train_loss": 1.1663504292943294,
"train_runtime": 3087.1314,
"train_samples_per_second": 13.714,
"train_steps_per_second": 0.214
}
],
"logging_steps": 1,
"max_steps": 661,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}