Files
qwen3-8b-base-epsilon-dpo-u…/trainer_state.json
ModelHub XC 163929230f 初始化项目,由ModelHub XC社区提供模型
Model: jackf857/qwen3-8b-base-epsilon-dpo-ultrafeedback-4xh200-batch-128
Source: Original Platform
2026-06-12 14:32:41 +08:00

1392 lines
57 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9989528795811519,
"eval_steps": 200,
"global_step": 477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0020942408376963353,
"epsilon_dpo/beta": 0.009997084736824036,
"epsilon_dpo/beta_margin_grad_mean": -0.499971479177475,
"epsilon_dpo/beta_margin_grad_std": 0.001938261673785746,
"epsilon_dpo/beta_margin_mean": 0.0001140289386967197,
"epsilon_dpo/beta_margin_std": 0.007753193378448486,
"epsilon_dpo/loss_margin_mean": 0.01704716682434082,
"grad_norm": 14.606449127197266,
"kl/avg_steps": 0.0390625,
"kl/beta": 0.009999999776482582,
"kl/n_epsilon_steps": 0.4765625,
"kl/p_epsilon_steps": 0.515625,
"learning_rate": 0.0,
"logits/chosen": 2.6271941661834717,
"logits/rejected": 2.237529993057251,
"logps/chosen": -267.3031921386719,
"logps/ref_chosen": -267.2525634765625,
"logps/ref_rejected": -219.97085571289062,
"logps/rejected": -220.0385284423828,
"loss": 5.5448,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -0.0005317605682648718,
"rewards/margins": 0.00011403978714952245,
"rewards/rejected": -0.0006458003772422671,
"step": 1
},
{
"epoch": 0.020942408376963352,
"epsilon_dpo/beta": 0.010005515068769455,
"epsilon_dpo/beta_margin_grad_mean": -0.5000517964363098,
"epsilon_dpo/beta_margin_grad_std": 0.0022904376965016127,
"epsilon_dpo/beta_margin_mean": -0.0002073091600323096,
"epsilon_dpo/beta_margin_std": 0.009162054397165775,
"epsilon_dpo/loss_margin_mean": -0.014141757972538471,
"grad_norm": 13.806034088134766,
"kl/avg_steps": -0.013888888992369175,
"kl/beta": 0.010003137402236462,
"kl/n_epsilon_steps": 0.5017361044883728,
"kl/p_epsilon_steps": 0.4878472089767456,
"learning_rate": 9.375e-08,
"logits/chosen": 2.67746639251709,
"logits/rejected": 2.7837536334991455,
"logps/chosen": -282.07965087890625,
"logps/ref_chosen": -282.07989501953125,
"logps/ref_rejected": -261.4595642089844,
"logps/rejected": -261.4451904296875,
"loss": 5.5461,
"rewards/accuracies": 0.4913194477558136,
"rewards/chosen": -3.148229734506458e-05,
"rewards/margins": -0.00020731209951918572,
"rewards/rejected": 0.0001758297876222059,
"step": 10
},
{
"epoch": 0.041884816753926704,
"epsilon_dpo/beta": 0.010010017082095146,
"epsilon_dpo/beta_margin_grad_mean": -0.5000718235969543,
"epsilon_dpo/beta_margin_grad_std": 0.0022132620215415955,
"epsilon_dpo/beta_margin_mean": -0.0002871893811970949,
"epsilon_dpo/beta_margin_std": 0.008853326551616192,
"epsilon_dpo/loss_margin_mean": -0.02227994240820408,
"grad_norm": 15.510866165161133,
"kl/avg_steps": -0.02421874925494194,
"kl/beta": 0.010006600990891457,
"kl/n_epsilon_steps": 0.5078125,
"kl/p_epsilon_steps": 0.48359376192092896,
"learning_rate": 1.9791666666666664e-07,
"logits/chosen": 2.541713237762451,
"logits/rejected": 2.75179123878479,
"logps/chosen": -278.8614196777344,
"logps/ref_chosen": -278.8597106933594,
"logps/ref_rejected": -257.1719055175781,
"logps/rejected": -257.1513671875,
"loss": 5.5464,
"rewards/accuracies": 0.48515623807907104,
"rewards/chosen": -5.0874834414571524e-05,
"rewards/margins": -0.0002871867036446929,
"rewards/rejected": 0.00023631185467820615,
"step": 20
},
{
"epoch": 0.06282722513089005,
"epsilon_dpo/beta": 0.01001377496868372,
"epsilon_dpo/beta_margin_grad_mean": -0.4998772144317627,
"epsilon_dpo/beta_margin_grad_std": 0.0022012609988451004,
"epsilon_dpo/beta_margin_mean": 0.000491045939270407,
"epsilon_dpo/beta_margin_std": 0.008805298246443272,
"epsilon_dpo/loss_margin_mean": 0.055501788854599,
"grad_norm": 14.358946800231934,
"kl/avg_steps": 0.07187499850988388,
"kl/beta": 0.010019981302320957,
"kl/n_epsilon_steps": 0.45703125,
"kl/p_epsilon_steps": 0.5289062261581421,
"learning_rate": 3.020833333333333e-07,
"logits/chosen": 2.639504909515381,
"logits/rejected": 2.8058505058288574,
"logps/chosen": -273.9162902832031,
"logps/ref_chosen": -273.97674560546875,
"logps/ref_rejected": -257.2232360839844,
"logps/rejected": -257.2182922363281,
"loss": 5.5433,
"rewards/accuracies": 0.54296875,
"rewards/chosen": 0.0005733909783884883,
"rewards/margins": 0.0004910477437078953,
"rewards/rejected": 8.234316919697449e-05,
"step": 30
},
{
"epoch": 0.08376963350785341,
"epsilon_dpo/beta": 0.009926706552505493,
"epsilon_dpo/beta_margin_grad_mean": -0.49948254227638245,
"epsilon_dpo/beta_margin_grad_std": 0.0024200372863560915,
"epsilon_dpo/beta_margin_mean": 0.0020698602311313152,
"epsilon_dpo/beta_margin_std": 0.009680529125034809,
"epsilon_dpo/loss_margin_mean": 0.21598558127880096,
"grad_norm": 14.699762344360352,
"kl/avg_steps": 0.11953125149011612,
"kl/beta": 0.009937574155628681,
"kl/n_epsilon_steps": 0.4351562559604645,
"kl/p_epsilon_steps": 0.5546875,
"learning_rate": 4.0625e-07,
"logits/chosen": 2.59186053276062,
"logits/rejected": 2.7942440509796143,
"logps/chosen": -280.52899169921875,
"logps/ref_chosen": -280.8274841308594,
"logps/ref_rejected": -258.9448547363281,
"logps/rejected": -258.8622741699219,
"loss": 5.537,
"rewards/accuracies": 0.5726562738418579,
"rewards/chosen": 0.0029196988325566053,
"rewards/margins": 0.002069863025099039,
"rewards/rejected": 0.0008498359238728881,
"step": 40
},
{
"epoch": 0.10471204188481675,
"epsilon_dpo/beta": 0.009684694930911064,
"epsilon_dpo/beta_margin_grad_mean": -0.4989333748817444,
"epsilon_dpo/beta_margin_grad_std": 0.0033105709590017796,
"epsilon_dpo/beta_margin_mean": 0.004266691394150257,
"epsilon_dpo/beta_margin_std": 0.013243382796645164,
"epsilon_dpo/loss_margin_mean": 0.4500531256198883,
"grad_norm": 14.027534484863281,
"kl/avg_steps": 0.30390626192092896,
"kl/beta": 0.009713245555758476,
"kl/n_epsilon_steps": 0.34453123807907104,
"kl/p_epsilon_steps": 0.6484375,
"learning_rate": 4.999932966293553e-07,
"logits/chosen": 2.47767972946167,
"logits/rejected": 2.8026018142700195,
"logps/chosen": -277.54425048828125,
"logps/ref_chosen": -278.20208740234375,
"logps/ref_rejected": -265.7288818359375,
"logps/rejected": -265.5211181640625,
"loss": 5.5283,
"rewards/accuracies": 0.649218738079071,
"rewards/chosen": 0.006310028024017811,
"rewards/margins": 0.00426669092848897,
"rewards/rejected": 0.0020433368626981974,
"step": 50
},
{
"epoch": 0.1256544502617801,
"epsilon_dpo/beta": 0.009375964291393757,
"epsilon_dpo/beta_margin_grad_mean": -0.4979146420955658,
"epsilon_dpo/beta_margin_grad_std": 0.0050841751508414745,
"epsilon_dpo/beta_margin_mean": 0.008342581801116467,
"epsilon_dpo/beta_margin_std": 0.02034146524965763,
"epsilon_dpo/loss_margin_mean": 0.9050939679145813,
"grad_norm": 13.532852172851562,
"kl/avg_steps": 0.35546875,
"kl/beta": 0.009408445097506046,
"kl/n_epsilon_steps": 0.3187499940395355,
"kl/p_epsilon_steps": 0.6742187738418579,
"learning_rate": 4.991893270335525e-07,
"logits/chosen": 2.488196849822998,
"logits/rejected": 2.7562973499298096,
"logps/chosen": -267.5882263183594,
"logps/ref_chosen": -268.90765380859375,
"logps/ref_rejected": -259.67926025390625,
"logps/rejected": -259.2649230957031,
"loss": 5.5123,
"rewards/accuracies": 0.676562488079071,
"rewards/chosen": 0.012289796955883503,
"rewards/margins": 0.008342583663761616,
"rewards/rejected": 0.003947213292121887,
"step": 60
},
{
"epoch": 0.14659685863874344,
"epsilon_dpo/beta": 0.009031310677528381,
"epsilon_dpo/beta_margin_grad_mean": -0.4967042803764343,
"epsilon_dpo/beta_margin_grad_std": 0.00740186357870698,
"epsilon_dpo/beta_margin_mean": 0.013186539523303509,
"epsilon_dpo/beta_margin_std": 0.029618557542562485,
"epsilon_dpo/loss_margin_mean": 1.483746886253357,
"grad_norm": 13.820236206054688,
"kl/avg_steps": 0.3890624940395355,
"kl/beta": 0.009065655060112476,
"kl/n_epsilon_steps": 0.30078125,
"kl/p_epsilon_steps": 0.6898437738418579,
"learning_rate": 4.970496218214204e-07,
"logits/chosen": 2.474260091781616,
"logits/rejected": 2.7694077491760254,
"logps/chosen": -267.3814392089844,
"logps/ref_chosen": -269.73370361328125,
"logps/ref_rejected": -258.15594482421875,
"logps/rejected": -257.28741455078125,
"loss": 5.4935,
"rewards/accuracies": 0.702343761920929,
"rewards/chosen": 0.02110612951219082,
"rewards/margins": 0.013186539523303509,
"rewards/rejected": 0.00791959185153246,
"step": 70
},
{
"epoch": 0.16753926701570682,
"epsilon_dpo/beta": 0.008663726039230824,
"epsilon_dpo/beta_margin_grad_mean": -0.49476176500320435,
"epsilon_dpo/beta_margin_grad_std": 0.01098305732011795,
"epsilon_dpo/beta_margin_mean": 0.020962897688150406,
"epsilon_dpo/beta_margin_std": 0.04398656636476517,
"epsilon_dpo/loss_margin_mean": 2.4553990364074707,
"grad_norm": 13.310928344726562,
"kl/avg_steps": 0.4117187559604645,
"kl/beta": 0.008698700927197933,
"kl/n_epsilon_steps": 0.28984373807907104,
"kl/p_epsilon_steps": 0.7015625238418579,
"learning_rate": 4.935856505068998e-07,
"logits/chosen": 2.4028592109680176,
"logits/rejected": 2.7112083435058594,
"logps/chosen": -268.78997802734375,
"logps/ref_chosen": -273.09210205078125,
"logps/ref_rejected": -259.3874816894531,
"logps/rejected": -257.54071044921875,
"loss": 5.4638,
"rewards/accuracies": 0.703906238079071,
"rewards/chosen": 0.03706257790327072,
"rewards/margins": 0.020962897688150406,
"rewards/rejected": 0.016099678352475166,
"step": 80
},
{
"epoch": 0.18848167539267016,
"epsilon_dpo/beta": 0.008329156786203384,
"epsilon_dpo/beta_margin_grad_mean": -0.49337729811668396,
"epsilon_dpo/beta_margin_grad_std": 0.013919507153332233,
"epsilon_dpo/beta_margin_mean": 0.026513313874602318,
"epsilon_dpo/beta_margin_std": 0.05574870854616165,
"epsilon_dpo/loss_margin_mean": 3.229220151901245,
"grad_norm": 12.768597602844238,
"kl/avg_steps": 0.40625,
"kl/beta": 0.008362272754311562,
"kl/n_epsilon_steps": 0.29374998807907104,
"kl/p_epsilon_steps": 0.699999988079071,
"learning_rate": 4.8881598109976e-07,
"logits/chosen": 2.430711030960083,
"logits/rejected": 2.644582748413086,
"logps/chosen": -263.22772216796875,
"logps/ref_chosen": -270.48480224609375,
"logps/ref_rejected": -259.2120361328125,
"logps/rejected": -255.18417358398438,
"loss": 5.443,
"rewards/accuracies": 0.702343761920929,
"rewards/chosen": 0.0601632222533226,
"rewards/margins": 0.02651331201195717,
"rewards/rejected": 0.03364991024136543,
"step": 90
},
{
"epoch": 0.2094240837696335,
"epsilon_dpo/beta": 0.008008182048797607,
"epsilon_dpo/beta_margin_grad_mean": -0.4916536211967468,
"epsilon_dpo/beta_margin_grad_std": 0.01792542263865471,
"epsilon_dpo/beta_margin_mean": 0.03343886882066727,
"epsilon_dpo/beta_margin_std": 0.07184432446956635,
"epsilon_dpo/loss_margin_mean": 4.237745761871338,
"grad_norm": 12.262528419494629,
"kl/avg_steps": 0.3812499940395355,
"kl/beta": 0.00803801417350769,
"kl/n_epsilon_steps": 0.3031249940395355,
"kl/p_epsilon_steps": 0.684374988079071,
"learning_rate": 4.827661805750437e-07,
"logits/chosen": 2.3381965160369873,
"logits/rejected": 2.474226236343384,
"logps/chosen": -262.87408447265625,
"logps/ref_chosen": -272.49383544921875,
"logps/ref_rejected": -255.8369598388672,
"logps/rejected": -250.4550018310547,
"loss": 5.4178,
"rewards/accuracies": 0.6953125,
"rewards/chosen": 0.0767994076013565,
"rewards/margins": 0.033438872545957565,
"rewards/rejected": 0.04336053133010864,
"step": 100
},
{
"epoch": 0.23036649214659685,
"epsilon_dpo/beta": 0.007680200040340424,
"epsilon_dpo/beta_margin_grad_mean": -0.4877113699913025,
"epsilon_dpo/beta_margin_grad_std": 0.02195078134536743,
"epsilon_dpo/beta_margin_mean": 0.04926630109548569,
"epsilon_dpo/beta_margin_std": 0.08810068666934967,
"epsilon_dpo/loss_margin_mean": 6.498995780944824,
"grad_norm": 12.287609100341797,
"kl/avg_steps": 0.44140625,
"kl/beta": 0.007713483180850744,
"kl/n_epsilon_steps": 0.2718749940395355,
"kl/p_epsilon_steps": 0.7132812738418579,
"learning_rate": 4.75468677825789e-07,
"logits/chosen": 2.2321219444274902,
"logits/rejected": 2.585568904876709,
"logps/chosen": -263.58843994140625,
"logps/ref_chosen": -272.6753845214844,
"logps/ref_rejected": -260.817138671875,
"logps/rejected": -258.2291564941406,
"loss": 5.3585,
"rewards/accuracies": 0.7320312261581421,
"rewards/chosen": 0.06958577036857605,
"rewards/margins": 0.04926629737019539,
"rewards/rejected": 0.020319465547800064,
"step": 110
},
{
"epoch": 0.2513089005235602,
"epsilon_dpo/beta": 0.007364341057837009,
"epsilon_dpo/beta_margin_grad_mean": -0.4861171245574951,
"epsilon_dpo/beta_margin_grad_std": 0.027931923046708107,
"epsilon_dpo/beta_margin_mean": 0.05574618652462959,
"epsilon_dpo/beta_margin_std": 0.11227792501449585,
"epsilon_dpo/loss_margin_mean": 7.674368381500244,
"grad_norm": 12.68581485748291,
"kl/avg_steps": 0.3984375,
"kl/beta": 0.007393070962280035,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.6953125,
"learning_rate": 4.669625898336438e-07,
"logits/chosen": 2.292116403579712,
"logits/rejected": 2.474891185760498,
"logps/chosen": -273.1396789550781,
"logps/ref_chosen": -279.50213623046875,
"logps/ref_rejected": -263.6972351074219,
"logps/rejected": -265.0091857910156,
"loss": 5.3381,
"rewards/accuracies": 0.7007812261581421,
"rewards/chosen": 0.046533744782209396,
"rewards/margins": 0.05574618652462959,
"rewards/rejected": -0.009212437085807323,
"step": 120
},
{
"epoch": 0.27225130890052357,
"epsilon_dpo/beta": 0.007093364838510752,
"epsilon_dpo/beta_margin_grad_mean": -0.4820740818977356,
"epsilon_dpo/beta_margin_grad_std": 0.03345402330160141,
"epsilon_dpo/beta_margin_mean": 0.07208652794361115,
"epsilon_dpo/beta_margin_std": 0.13469013571739197,
"epsilon_dpo/loss_margin_mean": 10.307097434997559,
"grad_norm": 15.22977352142334,
"kl/avg_steps": 0.3843750059604645,
"kl/beta": 0.0071199932135641575,
"kl/n_epsilon_steps": 0.3023437559604645,
"kl/p_epsilon_steps": 0.686718761920929,
"learning_rate": 4.5729351198915705e-07,
"logits/chosen": 2.230104923248291,
"logits/rejected": 2.4557857513427734,
"logps/chosen": -272.00311279296875,
"logps/ref_chosen": -278.95745849609375,
"logps/ref_rejected": -262.9747314453125,
"logps/rejected": -266.3275146484375,
"loss": 5.2805,
"rewards/accuracies": 0.70703125,
"rewards/chosen": 0.04882372170686722,
"rewards/margins": 0.07208652794361115,
"rewards/rejected": -0.02326280251145363,
"step": 130
},
{
"epoch": 0.2931937172774869,
"epsilon_dpo/beta": 0.0068093957379460335,
"epsilon_dpo/beta_margin_grad_mean": -0.4802798628807068,
"epsilon_dpo/beta_margin_grad_std": 0.0389549545943737,
"epsilon_dpo/beta_margin_mean": 0.07946081459522247,
"epsilon_dpo/beta_margin_std": 0.1572197675704956,
"epsilon_dpo/loss_margin_mean": 11.81810474395752,
"grad_norm": 11.451045989990234,
"kl/avg_steps": 0.40625,
"kl/beta": 0.006836493965238333,
"kl/n_epsilon_steps": 0.2906250059604645,
"kl/p_epsilon_steps": 0.6968749761581421,
"learning_rate": 4.4651327368569684e-07,
"logits/chosen": 2.035799741744995,
"logits/rejected": 2.3696587085723877,
"logps/chosen": -278.00701904296875,
"logps/ref_chosen": -282.004150390625,
"logps/ref_rejected": -268.6994934082031,
"logps/rejected": -276.5204772949219,
"loss": 5.2585,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.02683289907872677,
"rewards/margins": 0.07946079969406128,
"rewards/rejected": -0.05262790992856026,
"step": 140
},
{
"epoch": 0.31413612565445026,
"epsilon_dpo/beta": 0.0065385727211833,
"epsilon_dpo/beta_margin_grad_mean": -0.47647207975387573,
"epsilon_dpo/beta_margin_grad_std": 0.04337490350008011,
"epsilon_dpo/beta_margin_mean": 0.09494680166244507,
"epsilon_dpo/beta_margin_std": 0.1755046844482422,
"epsilon_dpo/loss_margin_mean": 14.688570976257324,
"grad_norm": 12.580639839172363,
"kl/avg_steps": 0.39140623807907104,
"kl/beta": 0.006563636474311352,
"kl/n_epsilon_steps": 0.2984375059604645,
"kl/p_epsilon_steps": 0.6898437738418579,
"learning_rate": 4.346796604970912e-07,
"logits/chosen": 2.1158509254455566,
"logits/rejected": 2.3138821125030518,
"logps/chosen": -274.89691162109375,
"logps/ref_chosen": -278.5110778808594,
"logps/ref_rejected": -255.59854125976562,
"logps/rejected": -266.67291259765625,
"loss": 5.2052,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.023254716768860817,
"rewards/margins": 0.09494679421186447,
"rewards/rejected": -0.0716920793056488,
"step": 150
},
{
"epoch": 0.33507853403141363,
"epsilon_dpo/beta": 0.006265554577112198,
"epsilon_dpo/beta_margin_grad_mean": -0.4711342453956604,
"epsilon_dpo/beta_margin_grad_std": 0.04951424151659012,
"epsilon_dpo/beta_margin_mean": 0.11672033369541168,
"epsilon_dpo/beta_margin_std": 0.20064322650432587,
"epsilon_dpo/loss_margin_mean": 18.817256927490234,
"grad_norm": 12.49393367767334,
"kl/avg_steps": 0.4453125,
"kl/beta": 0.006292995996773243,
"kl/n_epsilon_steps": 0.27421873807907104,
"kl/p_epsilon_steps": 0.719531238079071,
"learning_rate": 4.218561044282098e-07,
"logits/chosen": 2.0132875442504883,
"logits/rejected": 2.3389055728912354,
"logps/chosen": -276.2854309082031,
"logps/ref_chosen": -276.8100280761719,
"logps/ref_rejected": -264.40625,
"logps/rejected": -282.6988525390625,
"loss": 5.1326,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.002674251329153776,
"rewards/margins": 0.11672033369541168,
"rewards/rejected": -0.11404608190059662,
"step": 160
},
{
"epoch": 0.35602094240837695,
"epsilon_dpo/beta": 0.005999959539622068,
"epsilon_dpo/beta_margin_grad_mean": -0.46788015961647034,
"epsilon_dpo/beta_margin_grad_std": 0.05059142783284187,
"epsilon_dpo/beta_margin_mean": 0.13001370429992676,
"epsilon_dpo/beta_margin_std": 0.2052367627620697,
"epsilon_dpo/loss_margin_mean": 21.894283294677734,
"grad_norm": 15.406351089477539,
"kl/avg_steps": 0.46875,
"kl/beta": 0.006027590483427048,
"kl/n_epsilon_steps": 0.26249998807907104,
"kl/p_epsilon_steps": 0.731249988079071,
"learning_rate": 4.081113438988443e-07,
"logits/chosen": 1.973179578781128,
"logits/rejected": 2.2208034992218018,
"logps/chosen": -282.03741455078125,
"logps/ref_chosen": -281.14337158203125,
"logps/ref_rejected": -250.2654266357422,
"logps/rejected": -273.05377197265625,
"loss": 5.0843,
"rewards/accuracies": 0.7359374761581421,
"rewards/chosen": -0.005938548129051924,
"rewards/margins": 0.13001371920108795,
"rewards/rejected": -0.13595226407051086,
"step": 170
},
{
"epoch": 0.3769633507853403,
"epsilon_dpo/beta": 0.0057226200588047504,
"epsilon_dpo/beta_margin_grad_mean": -0.46952924132347107,
"epsilon_dpo/beta_margin_grad_std": 0.05471862107515335,
"epsilon_dpo/beta_margin_mean": 0.12347264587879181,
"epsilon_dpo/beta_margin_std": 0.2224453240633011,
"epsilon_dpo/loss_margin_mean": 21.816726684570312,
"grad_norm": 24.414875030517578,
"kl/avg_steps": 0.45703125,
"kl/beta": 0.005748326890170574,
"kl/n_epsilon_steps": 0.26875001192092896,
"kl/p_epsilon_steps": 0.725781261920929,
"learning_rate": 3.935190552834828e-07,
"logits/chosen": 1.9551303386688232,
"logits/rejected": 2.1914541721343994,
"logps/chosen": -283.0456237792969,
"logps/ref_chosen": -279.8695068359375,
"logps/ref_rejected": -263.40533447265625,
"logps/rejected": -288.39813232421875,
"loss": 5.1163,
"rewards/accuracies": 0.723437488079071,
"rewards/chosen": -0.018750619143247604,
"rewards/margins": 0.1234726533293724,
"rewards/rejected": -0.1422232687473297,
"step": 180
},
{
"epoch": 0.39790575916230364,
"epsilon_dpo/beta": 0.005460767075419426,
"epsilon_dpo/beta_margin_grad_mean": -0.462840735912323,
"epsilon_dpo/beta_margin_grad_std": 0.05923638492822647,
"epsilon_dpo/beta_margin_mean": 0.15091852843761444,
"epsilon_dpo/beta_margin_std": 0.24113008379936218,
"epsilon_dpo/loss_margin_mean": 27.910152435302734,
"grad_norm": 19.144001007080078,
"kl/avg_steps": 0.47578126192092896,
"kl/beta": 0.005486341658979654,
"kl/n_epsilon_steps": 0.25703126192092896,
"kl/p_epsilon_steps": 0.7328125238418579,
"learning_rate": 3.781574579820464e-07,
"logits/chosen": 1.913297414779663,
"logits/rejected": 2.166954517364502,
"logps/chosen": -288.5598449707031,
"logps/ref_chosen": -278.2532958984375,
"logps/ref_rejected": -257.45025634765625,
"logps/rejected": -295.66693115234375,
"loss": 5.0227,
"rewards/accuracies": 0.741406261920929,
"rewards/chosen": -0.05687868595123291,
"rewards/margins": 0.15091851353645325,
"rewards/rejected": -0.20779721438884735,
"step": 190
},
{
"epoch": 0.418848167539267,
"epsilon_dpo/beta": 0.005235456861555576,
"epsilon_dpo/beta_margin_grad_mean": -0.4651154577732086,
"epsilon_dpo/beta_margin_grad_std": 0.06457895785570145,
"epsilon_dpo/beta_margin_mean": 0.14193181693553925,
"epsilon_dpo/beta_margin_std": 0.26321619749069214,
"epsilon_dpo/loss_margin_mean": 27.4693603515625,
"grad_norm": 20.511478424072266,
"kl/avg_steps": 0.38749998807907104,
"kl/beta": 0.005255300085991621,
"kl/n_epsilon_steps": 0.30390626192092896,
"kl/p_epsilon_steps": 0.69140625,
"learning_rate": 3.621088951385353e-07,
"logits/chosen": 1.876455307006836,
"logits/rejected": 2.166574001312256,
"logps/chosen": -285.0974426269531,
"logps/ref_chosen": -275.12750244140625,
"logps/ref_rejected": -260.0728759765625,
"logps/rejected": -297.5121154785156,
"loss": 5.0674,
"rewards/accuracies": 0.70703125,
"rewards/chosen": -0.053233105689287186,
"rewards/margins": 0.14193184673786163,
"rewards/rejected": -0.19516493380069733,
"step": 200
},
{
"epoch": 0.418848167539267,
"eval_epsilon_dpo/beta": 0.00512322410941124,
"eval_epsilon_dpo/beta_margin_grad_mean": -0.464358389377594,
"eval_epsilon_dpo/beta_margin_grad_std": 0.06305021047592163,
"eval_epsilon_dpo/beta_margin_mean": 0.14517197012901306,
"eval_epsilon_dpo/beta_margin_std": 0.25747936964035034,
"eval_epsilon_dpo/loss_margin_mean": 28.677000045776367,
"eval_kl/n_epsilon_steps": 0.2930000126361847,
"eval_kl/p_epsilon_steps": 0.6990000009536743,
"eval_logits/chosen": 1.8063491582870483,
"eval_logits/rejected": 2.155062198638916,
"eval_logps/chosen": -291.77764892578125,
"eval_logps/ref_chosen": -280.4282531738281,
"eval_logps/ref_rejected": -264.7044677734375,
"eval_logps/rejected": -304.7308654785156,
"eval_loss": 0.6321755647659302,
"eval_rewards/accuracies": 0.7170000076293945,
"eval_rewards/chosen": -0.05901862308382988,
"eval_rewards/margins": 0.14517197012901306,
"eval_rewards/rejected": -0.20419058203697205,
"eval_runtime": 103.5445,
"eval_samples_per_second": 19.315,
"eval_steps_per_second": 1.207,
"step": 200
},
{
"epoch": 0.4397905759162304,
"epsilon_dpo/beta": 0.005026308819651604,
"epsilon_dpo/beta_margin_grad_mean": -0.4626430571079254,
"epsilon_dpo/beta_margin_grad_std": 0.06565666198730469,
"epsilon_dpo/beta_margin_mean": 0.15212179720401764,
"epsilon_dpo/beta_margin_std": 0.2678548991680145,
"epsilon_dpo/loss_margin_mean": 30.614501953125,
"grad_norm": 30.989282608032227,
"kl/avg_steps": 0.4203124940395355,
"kl/beta": 0.005047028884291649,
"kl/n_epsilon_steps": 0.28437501192092896,
"kl/p_epsilon_steps": 0.7046874761581421,
"learning_rate": 3.454593922550693e-07,
"logits/chosen": 1.8265072107315063,
"logits/rejected": 2.06158185005188,
"logps/chosen": -291.03253173828125,
"logps/ref_chosen": -279.7332763671875,
"logps/ref_rejected": -267.92437744140625,
"logps/rejected": -309.8381042480469,
"loss": 5.0314,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.057643067091703415,
"rewards/margins": 0.15212179720401764,
"rewards/rejected": -0.20976486802101135,
"step": 210
},
{
"epoch": 0.4607329842931937,
"epsilon_dpo/beta": 0.004815506748855114,
"epsilon_dpo/beta_margin_grad_mean": -0.46018725633621216,
"epsilon_dpo/beta_margin_grad_std": 0.06686625629663467,
"epsilon_dpo/beta_margin_mean": 0.16237930953502655,
"epsilon_dpo/beta_margin_std": 0.2736971378326416,
"epsilon_dpo/loss_margin_mean": 34.08965301513672,
"grad_norm": 27.191370010375977,
"kl/avg_steps": 0.4453125,
"kl/beta": 0.004836562555283308,
"kl/n_epsilon_steps": 0.27265626192092896,
"kl/p_epsilon_steps": 0.717968761920929,
"learning_rate": 3.2829819606729477e-07,
"logits/chosen": 1.8367538452148438,
"logits/rejected": 2.1368610858917236,
"logps/chosen": -304.51153564453125,
"logps/ref_chosen": -287.2923583984375,
"logps/ref_rejected": -270.8887023925781,
"logps/rejected": -322.1975402832031,
"loss": 4.9966,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.08366179466247559,
"rewards/margins": 0.16237932443618774,
"rewards/rejected": -0.24604110419750214,
"step": 220
},
{
"epoch": 0.4816753926701571,
"epsilon_dpo/beta": 0.004599227569997311,
"epsilon_dpo/beta_margin_grad_mean": -0.45680707693099976,
"epsilon_dpo/beta_margin_grad_std": 0.06870144605636597,
"epsilon_dpo/beta_margin_mean": 0.1762588918209076,
"epsilon_dpo/beta_margin_std": 0.2809893488883972,
"epsilon_dpo/loss_margin_mean": 38.755615234375,
"grad_norm": 22.937519073486328,
"kl/avg_steps": 0.47734373807907104,
"kl/beta": 0.004620816558599472,
"kl/n_epsilon_steps": 0.2593750059604645,
"kl/p_epsilon_steps": 0.7367187738418579,
"learning_rate": 3.1071729615293424e-07,
"logits/chosen": 1.7133830785751343,
"logits/rejected": 2.039473533630371,
"logps/chosen": -293.60247802734375,
"logps/ref_chosen": -272.74945068359375,
"logps/ref_rejected": -258.1266784667969,
"logps/rejected": -317.7353515625,
"loss": 4.9502,
"rewards/accuracies": 0.739062488079071,
"rewards/chosen": -0.09684249013662338,
"rewards/margins": 0.1762588918209076,
"rewards/rejected": -0.2731013596057892,
"step": 230
},
{
"epoch": 0.5026178010471204,
"epsilon_dpo/beta": 0.0043902210891246796,
"epsilon_dpo/beta_margin_grad_mean": -0.457236111164093,
"epsilon_dpo/beta_margin_grad_std": 0.07059483975172043,
"epsilon_dpo/beta_margin_mean": 0.17473134398460388,
"epsilon_dpo/beta_margin_std": 0.2893211245536804,
"epsilon_dpo/loss_margin_mean": 40.25088882446289,
"grad_norm": 22.779020309448242,
"kl/avg_steps": 0.4468750059604645,
"kl/beta": 0.004409492947161198,
"kl/n_epsilon_steps": 0.2718749940395355,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 2.9281093183781403e-07,
"logits/chosen": 1.7209564447402954,
"logits/rejected": 2.0882318019866943,
"logps/chosen": -300.35296630859375,
"logps/ref_chosen": -280.094970703125,
"logps/ref_rejected": -263.1619873046875,
"logps/rejected": -323.6708679199219,
"loss": 4.9599,
"rewards/accuracies": 0.7289062738418579,
"rewards/chosen": -0.09005247056484222,
"rewards/margins": 0.17473134398460388,
"rewards/rejected": -0.2647838294506073,
"step": 240
},
{
"epoch": 0.5235602094240838,
"epsilon_dpo/beta": 0.00419188616797328,
"epsilon_dpo/beta_margin_grad_mean": -0.4556571841239929,
"epsilon_dpo/beta_margin_grad_std": 0.0703204870223999,
"epsilon_dpo/beta_margin_mean": 0.18116165697574615,
"epsilon_dpo/beta_margin_std": 0.2881784737110138,
"epsilon_dpo/loss_margin_mean": 43.624481201171875,
"grad_norm": 39.10613250732422,
"kl/avg_steps": 0.4593749940395355,
"kl/beta": 0.004210834391415119,
"kl/n_epsilon_steps": 0.26640623807907104,
"kl/p_epsilon_steps": 0.725781261920929,
"learning_rate": 2.7467508704251135e-07,
"logits/chosen": 1.741624116897583,
"logits/rejected": 1.9895031452178955,
"logps/chosen": -296.340576171875,
"logps/ref_chosen": -279.10601806640625,
"logps/ref_rejected": -255.9159698486328,
"logps/rejected": -316.7749938964844,
"loss": 4.9365,
"rewards/accuracies": 0.7359374761581421,
"rewards/chosen": -0.07299315184354782,
"rewards/margins": 0.18116167187690735,
"rewards/rejected": -0.25415483117103577,
"step": 250
},
{
"epoch": 0.5445026178010471,
"epsilon_dpo/beta": 0.004008334130048752,
"epsilon_dpo/beta_margin_grad_mean": -0.4575107991695404,
"epsilon_dpo/beta_margin_grad_std": 0.07278217375278473,
"epsilon_dpo/beta_margin_mean": 0.17386779189109802,
"epsilon_dpo/beta_margin_std": 0.2985754609107971,
"epsilon_dpo/loss_margin_mean": 43.82888412475586,
"grad_norm": 32.33043670654297,
"kl/avg_steps": 0.43828123807907104,
"kl/beta": 0.004025599919259548,
"kl/n_epsilon_steps": 0.2789062559604645,
"kl/p_epsilon_steps": 0.7171875238418579,
"learning_rate": 2.5640697577740815e-07,
"logits/chosen": 1.7184337377548218,
"logits/rejected": 1.9476096630096436,
"logps/chosen": -306.7433166503906,
"logps/ref_chosen": -279.7398986816406,
"logps/ref_rejected": -256.90155029296875,
"logps/rejected": -327.7337951660156,
"loss": 4.9692,
"rewards/accuracies": 0.72265625,
"rewards/chosen": -0.10899752378463745,
"rewards/margins": 0.17386779189109802,
"rewards/rejected": -0.28286534547805786,
"step": 260
},
{
"epoch": 0.5654450261780105,
"epsilon_dpo/beta": 0.0038394411094486713,
"epsilon_dpo/beta_margin_grad_mean": -0.45551127195358276,
"epsilon_dpo/beta_margin_grad_std": 0.07340405881404877,
"epsilon_dpo/beta_margin_mean": 0.18199250102043152,
"epsilon_dpo/beta_margin_std": 0.30104658007621765,
"epsilon_dpo/loss_margin_mean": 47.921356201171875,
"grad_norm": 26.059804916381836,
"kl/avg_steps": 0.44843751192092896,
"kl/beta": 0.00385635276325047,
"kl/n_epsilon_steps": 0.27031248807907104,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 2.381045210440644e-07,
"logits/chosen": 1.6957333087921143,
"logits/rejected": 1.981131911277771,
"logps/chosen": -306.7268981933594,
"logps/ref_chosen": -272.6238708496094,
"logps/ref_rejected": -256.24176025390625,
"logps/rejected": -338.26611328125,
"loss": 4.9401,
"rewards/accuracies": 0.7359374761581421,
"rewards/chosen": -0.13194236159324646,
"rewards/margins": 0.18199248611927032,
"rewards/rejected": -0.3139348328113556,
"step": 270
},
{
"epoch": 0.5863874345549738,
"epsilon_dpo/beta": 0.0036588613875210285,
"epsilon_dpo/beta_margin_grad_mean": -0.4539538323879242,
"epsilon_dpo/beta_margin_grad_std": 0.07207532227039337,
"epsilon_dpo/beta_margin_mean": 0.18829122185707092,
"epsilon_dpo/beta_margin_std": 0.2957257628440857,
"epsilon_dpo/loss_margin_mean": 51.929046630859375,
"grad_norm": 21.85626220703125,
"kl/avg_steps": 0.48906248807907104,
"kl/beta": 0.0036765006370842457,
"kl/n_epsilon_steps": 0.25078123807907104,
"kl/p_epsilon_steps": 0.7398437261581421,
"learning_rate": 2.1986582993616925e-07,
"logits/chosen": 1.5749285221099854,
"logits/rejected": 1.9680347442626953,
"logps/chosen": -298.32781982421875,
"logps/ref_chosen": -272.6661682128906,
"logps/ref_rejected": -259.3951721191406,
"logps/rejected": -336.98590087890625,
"loss": 4.9148,
"rewards/accuracies": 0.749218761920929,
"rewards/chosen": -0.09480254352092743,
"rewards/margins": 0.18829122185707092,
"rewards/rejected": -0.28309375047683716,
"step": 280
},
{
"epoch": 0.6073298429319371,
"epsilon_dpo/beta": 0.00350450468249619,
"epsilon_dpo/beta_margin_grad_mean": -0.46083664894104004,
"epsilon_dpo/beta_margin_grad_std": 0.07311917841434479,
"epsilon_dpo/beta_margin_mean": 0.1602335274219513,
"epsilon_dpo/beta_margin_std": 0.2994373142719269,
"epsilon_dpo/loss_margin_mean": 46.23841094970703,
"grad_norm": 34.233943939208984,
"kl/avg_steps": 0.3851562440395355,
"kl/beta": 0.003517721313983202,
"kl/n_epsilon_steps": 0.3031249940395355,
"kl/p_epsilon_steps": 0.688281238079071,
"learning_rate": 2.0178866775369774e-07,
"logits/chosen": 1.578467845916748,
"logits/rejected": 1.903235673904419,
"logps/chosen": -323.2730407714844,
"logps/ref_chosen": -287.4728698730469,
"logps/ref_rejected": -268.4922790527344,
"logps/rejected": -350.5308532714844,
"loss": 5.0191,
"rewards/accuracies": 0.7015625238418579,
"rewards/chosen": -0.1263677179813385,
"rewards/margins": 0.1602335274219513,
"rewards/rejected": -0.2866012454032898,
"step": 290
},
{
"epoch": 0.6282722513089005,
"epsilon_dpo/beta": 0.003364184172824025,
"epsilon_dpo/beta_margin_grad_mean": -0.4569614827632904,
"epsilon_dpo/beta_margin_grad_std": 0.07025741040706635,
"epsilon_dpo/beta_margin_mean": 0.17601335048675537,
"epsilon_dpo/beta_margin_std": 0.2878516614437103,
"epsilon_dpo/loss_margin_mean": 52.840850830078125,
"grad_norm": 19.78177833557129,
"kl/avg_steps": 0.4351562559604645,
"kl/beta": 0.003378564026206732,
"kl/n_epsilon_steps": 0.2789062559604645,
"kl/p_epsilon_steps": 0.714062511920929,
"learning_rate": 1.839699339491937e-07,
"logits/chosen": 1.6086456775665283,
"logits/rejected": 1.9709374904632568,
"logps/chosen": -301.5176696777344,
"logps/ref_chosen": -273.06646728515625,
"logps/ref_rejected": -266.1439208984375,
"logps/rejected": -347.4358825683594,
"loss": 4.9542,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.09642257541418076,
"rewards/margins": 0.17601335048675537,
"rewards/rejected": -0.2724359333515167,
"step": 300
},
{
"epoch": 0.6492146596858639,
"epsilon_dpo/beta": 0.0032132375054061413,
"epsilon_dpo/beta_margin_grad_mean": -0.45566052198410034,
"epsilon_dpo/beta_margin_grad_std": 0.06932147592306137,
"epsilon_dpo/beta_margin_mean": 0.1810220181941986,
"epsilon_dpo/beta_margin_std": 0.28379470109939575,
"epsilon_dpo/loss_margin_mean": 56.88977813720703,
"grad_norm": 20.059579849243164,
"kl/avg_steps": 0.4867187440395355,
"kl/beta": 0.003228639718145132,
"kl/n_epsilon_steps": 0.25468748807907104,
"kl/p_epsilon_steps": 0.741406261920929,
"learning_rate": 1.6650514271527465e-07,
"logits/chosen": 1.593857765197754,
"logits/rejected": 1.952932596206665,
"logps/chosen": -313.94219970703125,
"logps/ref_chosen": -276.8886413574219,
"logps/ref_rejected": -256.80865478515625,
"logps/rejected": -350.75201416015625,
"loss": 4.9339,
"rewards/accuracies": 0.7367187738418579,
"rewards/chosen": -0.11971668899059296,
"rewards/margins": 0.1810220181941986,
"rewards/rejected": -0.30073872208595276,
"step": 310
},
{
"epoch": 0.6701570680628273,
"epsilon_dpo/beta": 0.0030656014569103718,
"epsilon_dpo/beta_margin_grad_mean": -0.45544466376304626,
"epsilon_dpo/beta_margin_grad_std": 0.06911682337522507,
"epsilon_dpo/beta_margin_mean": 0.18175189197063446,
"epsilon_dpo/beta_margin_std": 0.2825908660888672,
"epsilon_dpo/loss_margin_mean": 59.900352478027344,
"grad_norm": 24.982254028320312,
"kl/avg_steps": 0.47343748807907104,
"kl/beta": 0.003079873975366354,
"kl/n_epsilon_steps": 0.2593750059604645,
"kl/p_epsilon_steps": 0.7328125238418579,
"learning_rate": 1.4948791099758052e-07,
"logits/chosen": 1.6970676183700562,
"logits/rejected": 2.0628037452697754,
"logps/chosen": -321.9020080566406,
"logps/ref_chosen": -282.2432556152344,
"logps/ref_rejected": -256.89776611328125,
"logps/rejected": -356.45684814453125,
"loss": 4.9303,
"rewards/accuracies": 0.73828125,
"rewards/chosen": -0.12259833514690399,
"rewards/margins": 0.18175189197063446,
"rewards/rejected": -0.30435022711753845,
"step": 320
},
{
"epoch": 0.6910994764397905,
"epsilon_dpo/beta": 0.002925318432971835,
"epsilon_dpo/beta_margin_grad_mean": -0.45976167917251587,
"epsilon_dpo/beta_margin_grad_std": 0.06790686398744583,
"epsilon_dpo/beta_margin_mean": 0.16398653388023376,
"epsilon_dpo/beta_margin_std": 0.27741676568984985,
"epsilon_dpo/loss_margin_mean": 56.63254928588867,
"grad_norm": 35.780921936035156,
"kl/avg_steps": 0.46406251192092896,
"kl/beta": 0.0029386640526354313,
"kl/n_epsilon_steps": 0.2632812559604645,
"kl/p_epsilon_steps": 0.727343738079071,
"learning_rate": 1.3300945667758012e-07,
"logits/chosen": 1.6550931930541992,
"logits/rejected": 1.8850772380828857,
"logps/chosen": -316.6177062988281,
"logps/ref_chosen": -275.7609558105469,
"logps/ref_rejected": -263.5372619628906,
"logps/rejected": -361.02655029296875,
"loss": 4.9933,
"rewards/accuracies": 0.719531238079071,
"rewards/chosen": -0.12028974294662476,
"rewards/margins": 0.16398653388023376,
"rewards/rejected": -0.28427624702453613,
"step": 330
},
{
"epoch": 0.7120418848167539,
"epsilon_dpo/beta": 0.0027930724900215864,
"epsilon_dpo/beta_margin_grad_mean": -0.4600375294685364,
"epsilon_dpo/beta_margin_grad_std": 0.06828001886606216,
"epsilon_dpo/beta_margin_mean": 0.16278859972953796,
"epsilon_dpo/beta_margin_std": 0.2784718871116638,
"epsilon_dpo/loss_margin_mean": 58.940940856933594,
"grad_norm": 19.590518951416016,
"kl/avg_steps": 0.4609375,
"kl/beta": 0.0028057279996573925,
"kl/n_epsilon_steps": 0.26484376192092896,
"kl/p_epsilon_steps": 0.725781261920929,
"learning_rate": 1.1715810961514072e-07,
"logits/chosen": 1.6267999410629272,
"logits/rejected": 1.9399261474609375,
"logps/chosen": -319.0074157714844,
"logps/ref_chosen": -269.4908447265625,
"logps/ref_rejected": -253.1649627685547,
"logps/rejected": -361.62249755859375,
"loss": 4.9976,
"rewards/accuracies": 0.725781261920929,
"rewards/chosen": -0.13907715678215027,
"rewards/margins": 0.16278859972953796,
"rewards/rejected": -0.30186575651168823,
"step": 340
},
{
"epoch": 0.7329842931937173,
"epsilon_dpo/beta": 0.0026765193324536085,
"epsilon_dpo/beta_margin_grad_mean": -0.4628540575504303,
"epsilon_dpo/beta_margin_grad_std": 0.06378835439682007,
"epsilon_dpo/beta_margin_mean": 0.15105712413787842,
"epsilon_dpo/beta_margin_std": 0.25988245010375977,
"epsilon_dpo/loss_margin_mean": 57.061004638671875,
"grad_norm": 20.615802764892578,
"kl/avg_steps": 0.4242187440395355,
"kl/beta": 0.0026876390911638737,
"kl/n_epsilon_steps": 0.28515625,
"kl/p_epsilon_steps": 0.7093750238418579,
"learning_rate": 1.0201883817182949e-07,
"logits/chosen": 1.6629711389541626,
"logits/rejected": 2.020021915435791,
"logps/chosen": -344.3343811035156,
"logps/ref_chosen": -284.06365966796875,
"logps/ref_rejected": -260.7166442871094,
"logps/rejected": -378.0483703613281,
"loss": 5.0309,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": -0.16222040355205536,
"rewards/margins": 0.15105712413787842,
"rewards/rejected": -0.31327754259109497,
"step": 350
},
{
"epoch": 0.7539267015706806,
"epsilon_dpo/beta": 0.002562676090747118,
"epsilon_dpo/beta_margin_grad_mean": -0.4644971787929535,
"epsilon_dpo/beta_margin_grad_std": 0.06188509613275528,
"epsilon_dpo/beta_margin_mean": 0.14429207146167755,
"epsilon_dpo/beta_margin_std": 0.2519903779029846,
"epsilon_dpo/loss_margin_mean": 56.94682693481445,
"grad_norm": 28.58539581298828,
"kl/avg_steps": 0.4359374940395355,
"kl/beta": 0.0025736321695148945,
"kl/n_epsilon_steps": 0.27656251192092896,
"kl/p_epsilon_steps": 0.7124999761581421,
"learning_rate": 8.76727937529367e-08,
"logits/chosen": 1.558531403541565,
"logits/rejected": 1.9686288833618164,
"logps/chosen": -326.70318603515625,
"logps/ref_chosen": -269.2133483886719,
"logps/ref_rejected": -251.10647583007812,
"logps/rejected": -365.5430908203125,
"loss": 5.0524,
"rewards/accuracies": 0.7132812738418579,
"rewards/chosen": -0.14818084239959717,
"rewards/margins": 0.14429204165935516,
"rewards/rejected": -0.2924729287624359,
"step": 360
},
{
"epoch": 0.774869109947644,
"epsilon_dpo/beta": 0.0024432847276329994,
"epsilon_dpo/beta_margin_grad_mean": -0.45972761511802673,
"epsilon_dpo/beta_margin_grad_std": 0.06026551127433777,
"epsilon_dpo/beta_margin_mean": 0.16353142261505127,
"epsilon_dpo/beta_margin_std": 0.2452823668718338,
"epsilon_dpo/loss_margin_mean": 67.50531005859375,
"grad_norm": 18.816442489624023,
"kl/avg_steps": 0.500781238079071,
"kl/beta": 0.0024553355760872364,
"kl/n_epsilon_steps": 0.24609375,
"kl/p_epsilon_steps": 0.746874988079071,
"learning_rate": 7.419687580962222e-08,
"logits/chosen": 1.6747153997421265,
"logits/rejected": 1.9603767395019531,
"logps/chosen": -331.12542724609375,
"logps/ref_chosen": -276.8400573730469,
"logps/ref_rejected": -257.84912109375,
"logps/rejected": -379.6397705078125,
"loss": 4.9777,
"rewards/accuracies": 0.746874988079071,
"rewards/chosen": -0.13340650498867035,
"rewards/margins": 0.16353140771389008,
"rewards/rejected": -0.2969379425048828,
"step": 370
},
{
"epoch": 0.7958115183246073,
"epsilon_dpo/beta": 0.0023312487173825502,
"epsilon_dpo/beta_margin_grad_mean": -0.46638360619544983,
"epsilon_dpo/beta_margin_grad_std": 0.05908365920186043,
"epsilon_dpo/beta_margin_mean": 0.13641974329948425,
"epsilon_dpo/beta_margin_std": 0.23996075987815857,
"epsilon_dpo/loss_margin_mean": 59.121360778808594,
"grad_norm": 33.467586517333984,
"kl/avg_steps": 0.4359374940395355,
"kl/beta": 0.0023412262089550495,
"kl/n_epsilon_steps": 0.2789062559604645,
"kl/p_epsilon_steps": 0.71484375,
"learning_rate": 6.166331963291519e-08,
"logits/chosen": 1.7089202404022217,
"logits/rejected": 1.9208694696426392,
"logps/chosen": -356.5716857910156,
"logps/ref_chosen": -294.3582458496094,
"logps/ref_rejected": -266.00933837890625,
"logps/rejected": -387.34417724609375,
"loss": 5.0756,
"rewards/accuracies": 0.723437488079071,
"rewards/chosen": -0.14578744769096375,
"rewards/margins": 0.13641975820064545,
"rewards/rejected": -0.2822072207927704,
"step": 380
},
{
"epoch": 0.8167539267015707,
"epsilon_dpo/beta": 0.0022311562206596136,
"epsilon_dpo/beta_margin_grad_mean": -0.4652669429779053,
"epsilon_dpo/beta_margin_grad_std": 0.05686299130320549,
"epsilon_dpo/beta_margin_mean": 0.14073483645915985,
"epsilon_dpo/beta_margin_std": 0.23052707314491272,
"epsilon_dpo/loss_margin_mean": 63.751487731933594,
"grad_norm": 20.419815063476562,
"kl/avg_steps": 0.4375,
"kl/beta": 0.0022407451178878546,
"kl/n_epsilon_steps": 0.2750000059604645,
"kl/p_epsilon_steps": 0.7124999761581421,
"learning_rate": 5.013930914912476e-08,
"logits/chosen": 1.5366142988204956,
"logits/rejected": 1.9008190631866455,
"logps/chosen": -333.5438537597656,
"logps/ref_chosen": -271.92047119140625,
"logps/ref_rejected": -263.865478515625,
"logps/rejected": -389.2403259277344,
"loss": 5.0554,
"rewards/accuracies": 0.717968761920929,
"rewards/chosen": -0.13834409415721893,
"rewards/margins": 0.14073482155799866,
"rewards/rejected": -0.2790789306163788,
"step": 390
},
{
"epoch": 0.837696335078534,
"epsilon_dpo/beta": 0.0021363936830312014,
"epsilon_dpo/beta_margin_grad_mean": -0.4690118730068207,
"epsilon_dpo/beta_margin_grad_std": 0.05419831722974777,
"epsilon_dpo/beta_margin_mean": 0.12548907101154327,
"epsilon_dpo/beta_margin_std": 0.2197370082139969,
"epsilon_dpo/loss_margin_mean": 59.32947540283203,
"grad_norm": 16.475208282470703,
"kl/avg_steps": 0.42500001192092896,
"kl/beta": 0.0021453090012073517,
"kl/n_epsilon_steps": 0.2835937440395355,
"kl/p_epsilon_steps": 0.7085937261581421,
"learning_rate": 3.968661679220467e-08,
"logits/chosen": 1.5702852010726929,
"logits/rejected": 1.895922064781189,
"logps/chosen": -350.1571960449219,
"logps/ref_chosen": -284.8265075683594,
"logps/ref_rejected": -265.3280944824219,
"logps/rejected": -389.98828125,
"loss": 5.1073,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.14023001492023468,
"rewards/margins": 0.12548907101154327,
"rewards/rejected": -0.26571911573410034,
"step": 400
},
{
"epoch": 0.837696335078534,
"eval_epsilon_dpo/beta": 0.002089055487886071,
"eval_epsilon_dpo/beta_margin_grad_mean": -0.4698907434940338,
"eval_epsilon_dpo/beta_margin_grad_std": 0.05313246697187424,
"eval_epsilon_dpo/beta_margin_mean": 0.12187241017818451,
"eval_epsilon_dpo/beta_margin_std": 0.2152228057384491,
"eval_epsilon_dpo/loss_margin_mean": 59.03139877319336,
"eval_kl/n_epsilon_steps": 0.2854999899864197,
"eval_kl/p_epsilon_steps": 0.7085000276565552,
"eval_logits/chosen": 1.5736112594604492,
"eval_logits/rejected": 1.9568898677825928,
"eval_logps/chosen": -346.2501220703125,
"eval_logps/ref_chosen": -280.4282531738281,
"eval_logps/ref_rejected": -264.7044677734375,
"eval_logps/rejected": -389.5577392578125,
"eval_loss": 0.6402832269668579,
"eval_rewards/accuracies": 0.7164999842643738,
"eval_rewards/chosen": -0.13826368749141693,
"eval_rewards/margins": 0.12187241017818451,
"eval_rewards/rejected": -0.26013606786727905,
"eval_runtime": 103.0031,
"eval_samples_per_second": 19.417,
"eval_steps_per_second": 1.214,
"step": 400
},
{
"epoch": 0.8586387434554974,
"epsilon_dpo/beta": 0.0020442053209990263,
"epsilon_dpo/beta_margin_grad_mean": -0.46692174673080444,
"epsilon_dpo/beta_margin_grad_std": 0.05178702622652054,
"epsilon_dpo/beta_margin_mean": 0.13379183411598206,
"epsilon_dpo/beta_margin_std": 0.20962686836719513,
"epsilon_dpo/loss_margin_mean": 66.03794860839844,
"grad_norm": 41.441593170166016,
"kl/avg_steps": 0.45390623807907104,
"kl/beta": 0.0020533339120447636,
"kl/n_epsilon_steps": 0.26875001192092896,
"kl/p_epsilon_steps": 0.72265625,
"learning_rate": 3.036127238347164e-08,
"logits/chosen": 1.612749695777893,
"logits/rejected": 1.9225709438323975,
"logps/chosen": -344.31646728515625,
"logps/ref_chosen": -282.58233642578125,
"logps/ref_rejected": -266.00897216796875,
"logps/rejected": -393.7810363769531,
"loss": 5.0719,
"rewards/accuracies": 0.7398437261581421,
"rewards/chosen": -0.12682631611824036,
"rewards/margins": 0.13379183411598206,
"rewards/rejected": -0.2606181502342224,
"step": 410
},
{
"epoch": 0.8795811518324608,
"epsilon_dpo/beta": 0.001955785322934389,
"epsilon_dpo/beta_margin_grad_mean": -0.4684430658817291,
"epsilon_dpo/beta_margin_grad_std": 0.05116555094718933,
"epsilon_dpo/beta_margin_mean": 0.12757208943367004,
"epsilon_dpo/beta_margin_std": 0.207074373960495,
"epsilon_dpo/loss_margin_mean": 65.90140533447266,
"grad_norm": 19.453214645385742,
"kl/avg_steps": 0.46406251192092896,
"kl/beta": 0.001964703667908907,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.729687511920929,
"learning_rate": 2.2213262793589482e-08,
"logits/chosen": 1.5862172842025757,
"logits/rejected": 1.9309051036834717,
"logps/chosen": -341.8611755371094,
"logps/ref_chosen": -281.11688232421875,
"logps/ref_rejected": -263.7762145996094,
"logps/rejected": -390.4219665527344,
"loss": 5.094,
"rewards/accuracies": 0.73046875,
"rewards/chosen": -0.11936762183904648,
"rewards/margins": 0.12757208943367004,
"rewards/rejected": -0.24693970382213593,
"step": 420
},
{
"epoch": 0.900523560209424,
"epsilon_dpo/beta": 0.001865379512310028,
"epsilon_dpo/beta_margin_grad_mean": -0.46811485290527344,
"epsilon_dpo/beta_margin_grad_std": 0.0480102077126503,
"epsilon_dpo/beta_margin_mean": 0.1287469118833542,
"epsilon_dpo/beta_margin_std": 0.19402021169662476,
"epsilon_dpo/loss_margin_mean": 69.65689849853516,
"grad_norm": 17.445083618164062,
"kl/avg_steps": 0.47343748807907104,
"kl/beta": 0.0018740678206086159,
"kl/n_epsilon_steps": 0.25859373807907104,
"kl/p_epsilon_steps": 0.7320312261581421,
"learning_rate": 1.5286263996730026e-08,
"logits/chosen": 1.5173814296722412,
"logits/rejected": 1.9054569005966187,
"logps/chosen": -337.60888671875,
"logps/ref_chosen": -282.20098876953125,
"logps/ref_rejected": -257.6202392578125,
"logps/rejected": -382.68505859375,
"loss": 5.0847,
"rewards/accuracies": 0.741406261920929,
"rewards/chosen": -0.10385727882385254,
"rewards/margins": 0.1287469118833542,
"rewards/rejected": -0.23260419070720673,
"step": 430
},
{
"epoch": 0.9214659685863874,
"epsilon_dpo/beta": 0.0017827233532443643,
"epsilon_dpo/beta_margin_grad_mean": -0.4748317301273346,
"epsilon_dpo/beta_margin_grad_std": 0.0455574207007885,
"epsilon_dpo/beta_margin_mean": 0.1015293225646019,
"epsilon_dpo/beta_margin_std": 0.18387706577777863,
"epsilon_dpo/loss_margin_mean": 57.55500030517578,
"grad_norm": 15.522335052490234,
"kl/avg_steps": 0.42109376192092896,
"kl/beta": 0.0017900926759466529,
"kl/n_epsilon_steps": 0.2835937440395355,
"kl/p_epsilon_steps": 0.7046874761581421,
"learning_rate": 9.617406953185136e-09,
"logits/chosen": 1.6178176403045654,
"logits/rejected": 1.9510142803192139,
"logps/chosen": -333.5023498535156,
"logps/ref_chosen": -272.00103759765625,
"logps/ref_rejected": -258.02813720703125,
"logps/rejected": -377.08441162109375,
"loss": 5.1835,
"rewards/accuracies": 0.703906238079071,
"rewards/chosen": -0.11019601672887802,
"rewards/margins": 0.1015293151140213,
"rewards/rejected": -0.2117253541946411,
"step": 440
},
{
"epoch": 0.9424083769633508,
"epsilon_dpo/beta": 0.001706903101876378,
"epsilon_dpo/beta_margin_grad_mean": -0.4734385013580322,
"epsilon_dpo/beta_margin_grad_std": 0.044034797698259354,
"epsilon_dpo/beta_margin_mean": 0.10707694292068481,
"epsilon_dpo/beta_margin_std": 0.1776462197303772,
"epsilon_dpo/loss_margin_mean": 63.39220428466797,
"grad_norm": 16.360170364379883,
"kl/avg_steps": 0.4546875059604645,
"kl/beta": 0.0017145348247140646,
"kl/n_epsilon_steps": 0.2671875059604645,
"kl/p_epsilon_steps": 0.721875011920929,
"learning_rate": 5.2370785753763356e-09,
"logits/chosen": 1.5754592418670654,
"logits/rejected": 1.9332977533340454,
"logps/chosen": -337.49688720703125,
"logps/ref_chosen": -278.8232421875,
"logps/ref_rejected": -256.79656982421875,
"logps/rejected": -378.8623962402344,
"loss": 5.16,
"rewards/accuracies": 0.72265625,
"rewards/chosen": -0.10077029466629028,
"rewards/margins": 0.10707694292068481,
"rewards/rejected": -0.2078472375869751,
"step": 450
},
{
"epoch": 0.9633507853403142,
"epsilon_dpo/beta": 0.0016306890174746513,
"epsilon_dpo/beta_margin_grad_mean": -0.47516068816185,
"epsilon_dpo/beta_margin_grad_std": 0.04221952706575394,
"epsilon_dpo/beta_margin_mean": 0.10008412599563599,
"epsilon_dpo/beta_margin_std": 0.17021533846855164,
"epsilon_dpo/loss_margin_mean": 61.97832107543945,
"grad_norm": 14.846392631530762,
"kl/avg_steps": 0.4546875059604645,
"kl/beta": 0.0016379815060645342,
"kl/n_epsilon_steps": 0.26953125,
"kl/p_epsilon_steps": 0.7242187261581421,
"learning_rate": 2.168758844148272e-09,
"logits/chosen": 1.6337049007415771,
"logits/rejected": 1.9634275436401367,
"logps/chosen": -353.42510986328125,
"logps/ref_chosen": -294.84185791015625,
"logps/ref_rejected": -276.9571533203125,
"logps/rejected": -397.5187072753906,
"loss": 5.184,
"rewards/accuracies": 0.7281249761581421,
"rewards/chosen": -0.09599287807941437,
"rewards/margins": 0.10008411109447479,
"rewards/rejected": -0.19607700407505035,
"step": 460
},
{
"epoch": 0.9842931937172775,
"epsilon_dpo/beta": 0.0015589601825922728,
"epsilon_dpo/beta_margin_grad_mean": -0.47446101903915405,
"epsilon_dpo/beta_margin_grad_std": 0.04050491005182266,
"epsilon_dpo/beta_margin_mean": 0.10283418744802475,
"epsilon_dpo/beta_margin_std": 0.16317032277584076,
"epsilon_dpo/loss_margin_mean": 66.61624145507812,
"grad_norm": 14.901313781738281,
"kl/avg_steps": 0.46562498807907104,
"kl/beta": 0.0015660974895581603,
"kl/n_epsilon_steps": 0.2632812559604645,
"kl/p_epsilon_steps": 0.7289062738418579,
"learning_rate": 4.288949484559934e-10,
"logits/chosen": 1.5405309200286865,
"logits/rejected": 1.751405119895935,
"logps/chosen": -339.19415283203125,
"logps/ref_chosen": -285.2023620605469,
"logps/ref_rejected": -255.1339569091797,
"logps/rejected": -375.7419738769531,
"loss": 5.1712,
"rewards/accuracies": 0.733593761920929,
"rewards/chosen": -0.08475174009799957,
"rewards/margins": 0.10283420234918594,
"rewards/rejected": -0.18758592009544373,
"step": 470
},
{
"epoch": 0.9989528795811519,
"step": 477,
"total_flos": 0.0,
"train_loss": 5.1642030939865915,
"train_runtime": 8287.5392,
"train_samples_per_second": 7.377,
"train_steps_per_second": 0.058
}
],
"logging_steps": 10,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}