{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989528795811519, "eval_steps": 200, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020942408376963353, "epsilon_dpo/beta": 0.009997084736824036, "epsilon_dpo/beta_margin_grad_mean": -0.499971479177475, "epsilon_dpo/beta_margin_grad_std": 0.001938261673785746, "epsilon_dpo/beta_margin_mean": 0.0001140289386967197, "epsilon_dpo/beta_margin_std": 0.007753193378448486, "epsilon_dpo/loss_margin_mean": 0.01704716682434082, "grad_norm": 14.606449127197266, "kl/avg_steps": 0.0390625, "kl/beta": 0.009999999776482582, "kl/n_epsilon_steps": 0.4765625, "kl/p_epsilon_steps": 0.515625, "learning_rate": 0.0, "logits/chosen": 2.6271941661834717, "logits/rejected": 2.237529993057251, "logps/chosen": -267.3031921386719, "logps/ref_chosen": -267.2525634765625, "logps/ref_rejected": -219.97085571289062, "logps/rejected": -220.0385284423828, "loss": 5.5448, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.0005317605682648718, "rewards/margins": 0.00011403978714952245, "rewards/rejected": -0.0006458003772422671, "step": 1 }, { "epoch": 0.020942408376963352, "epsilon_dpo/beta": 0.010005515068769455, "epsilon_dpo/beta_margin_grad_mean": -0.5000517964363098, "epsilon_dpo/beta_margin_grad_std": 0.0022904376965016127, "epsilon_dpo/beta_margin_mean": -0.0002073091600323096, "epsilon_dpo/beta_margin_std": 0.009162054397165775, "epsilon_dpo/loss_margin_mean": -0.014141757972538471, "grad_norm": 13.806034088134766, "kl/avg_steps": -0.013888888992369175, "kl/beta": 0.010003137402236462, "kl/n_epsilon_steps": 0.5017361044883728, "kl/p_epsilon_steps": 0.4878472089767456, "learning_rate": 9.375e-08, "logits/chosen": 2.67746639251709, "logits/rejected": 2.7837536334991455, "logps/chosen": -282.07965087890625, "logps/ref_chosen": -282.07989501953125, "logps/ref_rejected": -261.4595642089844, "logps/rejected": -261.4451904296875, "loss": 5.5461, "rewards/accuracies": 0.4913194477558136, "rewards/chosen": -3.148229734506458e-05, "rewards/margins": -0.00020731209951918572, "rewards/rejected": 0.0001758297876222059, "step": 10 }, { "epoch": 0.041884816753926704, "epsilon_dpo/beta": 0.010010017082095146, "epsilon_dpo/beta_margin_grad_mean": -0.5000718235969543, "epsilon_dpo/beta_margin_grad_std": 0.0022132620215415955, "epsilon_dpo/beta_margin_mean": -0.0002871893811970949, "epsilon_dpo/beta_margin_std": 0.008853326551616192, "epsilon_dpo/loss_margin_mean": -0.02227994240820408, "grad_norm": 15.510866165161133, "kl/avg_steps": -0.02421874925494194, "kl/beta": 0.010006600990891457, "kl/n_epsilon_steps": 0.5078125, "kl/p_epsilon_steps": 0.48359376192092896, "learning_rate": 1.9791666666666664e-07, "logits/chosen": 2.541713237762451, "logits/rejected": 2.75179123878479, "logps/chosen": -278.8614196777344, "logps/ref_chosen": -278.8597106933594, "logps/ref_rejected": -257.1719055175781, "logps/rejected": -257.1513671875, "loss": 5.5464, "rewards/accuracies": 0.48515623807907104, "rewards/chosen": -5.0874834414571524e-05, "rewards/margins": -0.0002871867036446929, "rewards/rejected": 0.00023631185467820615, "step": 20 }, { "epoch": 0.06282722513089005, "epsilon_dpo/beta": 0.01001377496868372, "epsilon_dpo/beta_margin_grad_mean": -0.4998772144317627, "epsilon_dpo/beta_margin_grad_std": 0.0022012609988451004, "epsilon_dpo/beta_margin_mean": 0.000491045939270407, "epsilon_dpo/beta_margin_std": 0.008805298246443272, "epsilon_dpo/loss_margin_mean": 0.055501788854599, "grad_norm": 14.358946800231934, "kl/avg_steps": 0.07187499850988388, "kl/beta": 0.010019981302320957, "kl/n_epsilon_steps": 0.45703125, "kl/p_epsilon_steps": 0.5289062261581421, "learning_rate": 3.020833333333333e-07, "logits/chosen": 2.639504909515381, "logits/rejected": 2.8058505058288574, "logps/chosen": -273.9162902832031, "logps/ref_chosen": -273.97674560546875, "logps/ref_rejected": -257.2232360839844, "logps/rejected": -257.2182922363281, "loss": 5.5433, "rewards/accuracies": 0.54296875, "rewards/chosen": 0.0005733909783884883, "rewards/margins": 0.0004910477437078953, "rewards/rejected": 8.234316919697449e-05, "step": 30 }, { "epoch": 0.08376963350785341, "epsilon_dpo/beta": 0.009926706552505493, "epsilon_dpo/beta_margin_grad_mean": -0.49948254227638245, "epsilon_dpo/beta_margin_grad_std": 0.0024200372863560915, "epsilon_dpo/beta_margin_mean": 0.0020698602311313152, "epsilon_dpo/beta_margin_std": 0.009680529125034809, "epsilon_dpo/loss_margin_mean": 0.21598558127880096, "grad_norm": 14.699762344360352, "kl/avg_steps": 0.11953125149011612, "kl/beta": 0.009937574155628681, "kl/n_epsilon_steps": 0.4351562559604645, "kl/p_epsilon_steps": 0.5546875, "learning_rate": 4.0625e-07, "logits/chosen": 2.59186053276062, "logits/rejected": 2.7942440509796143, "logps/chosen": -280.52899169921875, "logps/ref_chosen": -280.8274841308594, "logps/ref_rejected": -258.9448547363281, "logps/rejected": -258.8622741699219, "loss": 5.537, "rewards/accuracies": 0.5726562738418579, "rewards/chosen": 0.0029196988325566053, "rewards/margins": 0.002069863025099039, "rewards/rejected": 0.0008498359238728881, "step": 40 }, { "epoch": 0.10471204188481675, "epsilon_dpo/beta": 0.009684694930911064, "epsilon_dpo/beta_margin_grad_mean": -0.4989333748817444, "epsilon_dpo/beta_margin_grad_std": 0.0033105709590017796, "epsilon_dpo/beta_margin_mean": 0.004266691394150257, "epsilon_dpo/beta_margin_std": 0.013243382796645164, "epsilon_dpo/loss_margin_mean": 0.4500531256198883, "grad_norm": 14.027534484863281, "kl/avg_steps": 0.30390626192092896, "kl/beta": 0.009713245555758476, "kl/n_epsilon_steps": 0.34453123807907104, "kl/p_epsilon_steps": 0.6484375, "learning_rate": 4.999932966293553e-07, "logits/chosen": 2.47767972946167, "logits/rejected": 2.8026018142700195, "logps/chosen": -277.54425048828125, "logps/ref_chosen": -278.20208740234375, "logps/ref_rejected": -265.7288818359375, "logps/rejected": -265.5211181640625, "loss": 5.5283, "rewards/accuracies": 0.649218738079071, "rewards/chosen": 0.006310028024017811, "rewards/margins": 0.00426669092848897, "rewards/rejected": 0.0020433368626981974, "step": 50 }, { "epoch": 0.1256544502617801, "epsilon_dpo/beta": 0.009375964291393757, "epsilon_dpo/beta_margin_grad_mean": -0.4979146420955658, "epsilon_dpo/beta_margin_grad_std": 0.0050841751508414745, "epsilon_dpo/beta_margin_mean": 0.008342581801116467, "epsilon_dpo/beta_margin_std": 0.02034146524965763, "epsilon_dpo/loss_margin_mean": 0.9050939679145813, "grad_norm": 13.532852172851562, "kl/avg_steps": 0.35546875, "kl/beta": 0.009408445097506046, "kl/n_epsilon_steps": 0.3187499940395355, "kl/p_epsilon_steps": 0.6742187738418579, "learning_rate": 4.991893270335525e-07, "logits/chosen": 2.488196849822998, "logits/rejected": 2.7562973499298096, "logps/chosen": -267.5882263183594, "logps/ref_chosen": -268.90765380859375, "logps/ref_rejected": -259.67926025390625, "logps/rejected": -259.2649230957031, "loss": 5.5123, "rewards/accuracies": 0.676562488079071, "rewards/chosen": 0.012289796955883503, "rewards/margins": 0.008342583663761616, "rewards/rejected": 0.003947213292121887, "step": 60 }, { "epoch": 0.14659685863874344, "epsilon_dpo/beta": 0.009031310677528381, "epsilon_dpo/beta_margin_grad_mean": -0.4967042803764343, "epsilon_dpo/beta_margin_grad_std": 0.00740186357870698, "epsilon_dpo/beta_margin_mean": 0.013186539523303509, "epsilon_dpo/beta_margin_std": 0.029618557542562485, "epsilon_dpo/loss_margin_mean": 1.483746886253357, "grad_norm": 13.820236206054688, "kl/avg_steps": 0.3890624940395355, "kl/beta": 0.009065655060112476, "kl/n_epsilon_steps": 0.30078125, "kl/p_epsilon_steps": 0.6898437738418579, "learning_rate": 4.970496218214204e-07, "logits/chosen": 2.474260091781616, "logits/rejected": 2.7694077491760254, "logps/chosen": -267.3814392089844, "logps/ref_chosen": -269.73370361328125, "logps/ref_rejected": -258.15594482421875, "logps/rejected": -257.28741455078125, "loss": 5.4935, "rewards/accuracies": 0.702343761920929, "rewards/chosen": 0.02110612951219082, "rewards/margins": 0.013186539523303509, "rewards/rejected": 0.00791959185153246, "step": 70 }, { "epoch": 0.16753926701570682, "epsilon_dpo/beta": 0.008663726039230824, "epsilon_dpo/beta_margin_grad_mean": -0.49476176500320435, "epsilon_dpo/beta_margin_grad_std": 0.01098305732011795, "epsilon_dpo/beta_margin_mean": 0.020962897688150406, "epsilon_dpo/beta_margin_std": 0.04398656636476517, "epsilon_dpo/loss_margin_mean": 2.4553990364074707, "grad_norm": 13.310928344726562, "kl/avg_steps": 0.4117187559604645, "kl/beta": 0.008698700927197933, "kl/n_epsilon_steps": 0.28984373807907104, "kl/p_epsilon_steps": 0.7015625238418579, "learning_rate": 4.935856505068998e-07, "logits/chosen": 2.4028592109680176, "logits/rejected": 2.7112083435058594, "logps/chosen": -268.78997802734375, "logps/ref_chosen": -273.09210205078125, "logps/ref_rejected": -259.3874816894531, "logps/rejected": -257.54071044921875, "loss": 5.4638, "rewards/accuracies": 0.703906238079071, "rewards/chosen": 0.03706257790327072, "rewards/margins": 0.020962897688150406, "rewards/rejected": 0.016099678352475166, "step": 80 }, { "epoch": 0.18848167539267016, "epsilon_dpo/beta": 0.008329156786203384, "epsilon_dpo/beta_margin_grad_mean": -0.49337729811668396, "epsilon_dpo/beta_margin_grad_std": 0.013919507153332233, "epsilon_dpo/beta_margin_mean": 0.026513313874602318, "epsilon_dpo/beta_margin_std": 0.05574870854616165, "epsilon_dpo/loss_margin_mean": 3.229220151901245, "grad_norm": 12.768597602844238, "kl/avg_steps": 0.40625, "kl/beta": 0.008362272754311562, "kl/n_epsilon_steps": 0.29374998807907104, "kl/p_epsilon_steps": 0.699999988079071, "learning_rate": 4.8881598109976e-07, "logits/chosen": 2.430711030960083, "logits/rejected": 2.644582748413086, "logps/chosen": -263.22772216796875, "logps/ref_chosen": -270.48480224609375, "logps/ref_rejected": -259.2120361328125, "logps/rejected": -255.18417358398438, "loss": 5.443, "rewards/accuracies": 0.702343761920929, "rewards/chosen": 0.0601632222533226, "rewards/margins": 0.02651331201195717, "rewards/rejected": 0.03364991024136543, "step": 90 }, { "epoch": 0.2094240837696335, "epsilon_dpo/beta": 0.008008182048797607, "epsilon_dpo/beta_margin_grad_mean": -0.4916536211967468, "epsilon_dpo/beta_margin_grad_std": 0.01792542263865471, "epsilon_dpo/beta_margin_mean": 0.03343886882066727, "epsilon_dpo/beta_margin_std": 0.07184432446956635, "epsilon_dpo/loss_margin_mean": 4.237745761871338, "grad_norm": 12.262528419494629, "kl/avg_steps": 0.3812499940395355, "kl/beta": 0.00803801417350769, "kl/n_epsilon_steps": 0.3031249940395355, "kl/p_epsilon_steps": 0.684374988079071, "learning_rate": 4.827661805750437e-07, "logits/chosen": 2.3381965160369873, "logits/rejected": 2.474226236343384, "logps/chosen": -262.87408447265625, "logps/ref_chosen": -272.49383544921875, "logps/ref_rejected": -255.8369598388672, "logps/rejected": -250.4550018310547, "loss": 5.4178, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.0767994076013565, "rewards/margins": 0.033438872545957565, "rewards/rejected": 0.04336053133010864, "step": 100 }, { "epoch": 0.23036649214659685, "epsilon_dpo/beta": 0.007680200040340424, "epsilon_dpo/beta_margin_grad_mean": -0.4877113699913025, "epsilon_dpo/beta_margin_grad_std": 0.02195078134536743, "epsilon_dpo/beta_margin_mean": 0.04926630109548569, "epsilon_dpo/beta_margin_std": 0.08810068666934967, "epsilon_dpo/loss_margin_mean": 6.498995780944824, "grad_norm": 12.287609100341797, "kl/avg_steps": 0.44140625, "kl/beta": 0.007713483180850744, "kl/n_epsilon_steps": 0.2718749940395355, "kl/p_epsilon_steps": 0.7132812738418579, "learning_rate": 4.75468677825789e-07, "logits/chosen": 2.2321219444274902, "logits/rejected": 2.585568904876709, "logps/chosen": -263.58843994140625, "logps/ref_chosen": -272.6753845214844, "logps/ref_rejected": -260.817138671875, "logps/rejected": -258.2291564941406, "loss": 5.3585, "rewards/accuracies": 0.7320312261581421, "rewards/chosen": 0.06958577036857605, "rewards/margins": 0.04926629737019539, "rewards/rejected": 0.020319465547800064, "step": 110 }, { "epoch": 0.2513089005235602, "epsilon_dpo/beta": 0.007364341057837009, "epsilon_dpo/beta_margin_grad_mean": -0.4861171245574951, "epsilon_dpo/beta_margin_grad_std": 0.027931923046708107, "epsilon_dpo/beta_margin_mean": 0.05574618652462959, "epsilon_dpo/beta_margin_std": 0.11227792501449585, "epsilon_dpo/loss_margin_mean": 7.674368381500244, "grad_norm": 12.68581485748291, "kl/avg_steps": 0.3984375, "kl/beta": 0.007393070962280035, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6953125, "learning_rate": 4.669625898336438e-07, "logits/chosen": 2.292116403579712, "logits/rejected": 2.474891185760498, "logps/chosen": -273.1396789550781, "logps/ref_chosen": -279.50213623046875, "logps/ref_rejected": -263.6972351074219, "logps/rejected": -265.0091857910156, "loss": 5.3381, "rewards/accuracies": 0.7007812261581421, "rewards/chosen": 0.046533744782209396, "rewards/margins": 0.05574618652462959, "rewards/rejected": -0.009212437085807323, "step": 120 }, { "epoch": 0.27225130890052357, "epsilon_dpo/beta": 0.007093364838510752, "epsilon_dpo/beta_margin_grad_mean": -0.4820740818977356, "epsilon_dpo/beta_margin_grad_std": 0.03345402330160141, "epsilon_dpo/beta_margin_mean": 0.07208652794361115, "epsilon_dpo/beta_margin_std": 0.13469013571739197, "epsilon_dpo/loss_margin_mean": 10.307097434997559, "grad_norm": 15.22977352142334, "kl/avg_steps": 0.3843750059604645, "kl/beta": 0.0071199932135641575, "kl/n_epsilon_steps": 0.3023437559604645, "kl/p_epsilon_steps": 0.686718761920929, "learning_rate": 4.5729351198915705e-07, "logits/chosen": 2.230104923248291, "logits/rejected": 2.4557857513427734, "logps/chosen": -272.00311279296875, "logps/ref_chosen": -278.95745849609375, "logps/ref_rejected": -262.9747314453125, "logps/rejected": -266.3275146484375, "loss": 5.2805, "rewards/accuracies": 0.70703125, "rewards/chosen": 0.04882372170686722, "rewards/margins": 0.07208652794361115, "rewards/rejected": -0.02326280251145363, "step": 130 }, { "epoch": 0.2931937172774869, "epsilon_dpo/beta": 0.0068093957379460335, "epsilon_dpo/beta_margin_grad_mean": -0.4802798628807068, "epsilon_dpo/beta_margin_grad_std": 0.0389549545943737, "epsilon_dpo/beta_margin_mean": 0.07946081459522247, "epsilon_dpo/beta_margin_std": 0.1572197675704956, "epsilon_dpo/loss_margin_mean": 11.81810474395752, "grad_norm": 11.451045989990234, "kl/avg_steps": 0.40625, "kl/beta": 0.006836493965238333, "kl/n_epsilon_steps": 0.2906250059604645, "kl/p_epsilon_steps": 0.6968749761581421, "learning_rate": 4.4651327368569684e-07, "logits/chosen": 2.035799741744995, "logits/rejected": 2.3696587085723877, "logps/chosen": -278.00701904296875, "logps/ref_chosen": -282.004150390625, "logps/ref_rejected": -268.6994934082031, "logps/rejected": -276.5204772949219, "loss": 5.2585, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.02683289907872677, "rewards/margins": 0.07946079969406128, "rewards/rejected": -0.05262790992856026, "step": 140 }, { "epoch": 0.31413612565445026, "epsilon_dpo/beta": 0.0065385727211833, "epsilon_dpo/beta_margin_grad_mean": -0.47647207975387573, "epsilon_dpo/beta_margin_grad_std": 0.04337490350008011, "epsilon_dpo/beta_margin_mean": 0.09494680166244507, "epsilon_dpo/beta_margin_std": 0.1755046844482422, "epsilon_dpo/loss_margin_mean": 14.688570976257324, "grad_norm": 12.580639839172363, "kl/avg_steps": 0.39140623807907104, "kl/beta": 0.006563636474311352, "kl/n_epsilon_steps": 0.2984375059604645, "kl/p_epsilon_steps": 0.6898437738418579, "learning_rate": 4.346796604970912e-07, "logits/chosen": 2.1158509254455566, "logits/rejected": 2.3138821125030518, "logps/chosen": -274.89691162109375, "logps/ref_chosen": -278.5110778808594, "logps/ref_rejected": -255.59854125976562, "logps/rejected": -266.67291259765625, "loss": 5.2052, "rewards/accuracies": 0.71875, "rewards/chosen": 0.023254716768860817, "rewards/margins": 0.09494679421186447, "rewards/rejected": -0.0716920793056488, "step": 150 }, { "epoch": 0.33507853403141363, "epsilon_dpo/beta": 0.006265554577112198, "epsilon_dpo/beta_margin_grad_mean": -0.4711342453956604, "epsilon_dpo/beta_margin_grad_std": 0.04951424151659012, "epsilon_dpo/beta_margin_mean": 0.11672033369541168, "epsilon_dpo/beta_margin_std": 0.20064322650432587, "epsilon_dpo/loss_margin_mean": 18.817256927490234, "grad_norm": 12.49393367767334, "kl/avg_steps": 0.4453125, "kl/beta": 0.006292995996773243, "kl/n_epsilon_steps": 0.27421873807907104, "kl/p_epsilon_steps": 0.719531238079071, "learning_rate": 4.218561044282098e-07, "logits/chosen": 2.0132875442504883, "logits/rejected": 2.3389055728912354, "logps/chosen": -276.2854309082031, "logps/ref_chosen": -276.8100280761719, "logps/ref_rejected": -264.40625, "logps/rejected": -282.6988525390625, "loss": 5.1326, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.002674251329153776, "rewards/margins": 0.11672033369541168, "rewards/rejected": -0.11404608190059662, "step": 160 }, { "epoch": 0.35602094240837695, "epsilon_dpo/beta": 0.005999959539622068, "epsilon_dpo/beta_margin_grad_mean": -0.46788015961647034, "epsilon_dpo/beta_margin_grad_std": 0.05059142783284187, "epsilon_dpo/beta_margin_mean": 0.13001370429992676, "epsilon_dpo/beta_margin_std": 0.2052367627620697, "epsilon_dpo/loss_margin_mean": 21.894283294677734, "grad_norm": 15.406351089477539, "kl/avg_steps": 0.46875, "kl/beta": 0.006027590483427048, "kl/n_epsilon_steps": 0.26249998807907104, "kl/p_epsilon_steps": 0.731249988079071, "learning_rate": 4.081113438988443e-07, "logits/chosen": 1.973179578781128, "logits/rejected": 2.2208034992218018, "logps/chosen": -282.03741455078125, "logps/ref_chosen": -281.14337158203125, "logps/ref_rejected": -250.2654266357422, "logps/rejected": -273.05377197265625, "loss": 5.0843, "rewards/accuracies": 0.7359374761581421, "rewards/chosen": -0.005938548129051924, "rewards/margins": 0.13001371920108795, "rewards/rejected": -0.13595226407051086, "step": 170 }, { "epoch": 0.3769633507853403, "epsilon_dpo/beta": 0.0057226200588047504, "epsilon_dpo/beta_margin_grad_mean": -0.46952924132347107, "epsilon_dpo/beta_margin_grad_std": 0.05471862107515335, "epsilon_dpo/beta_margin_mean": 0.12347264587879181, "epsilon_dpo/beta_margin_std": 0.2224453240633011, "epsilon_dpo/loss_margin_mean": 21.816726684570312, "grad_norm": 24.414875030517578, "kl/avg_steps": 0.45703125, "kl/beta": 0.005748326890170574, "kl/n_epsilon_steps": 0.26875001192092896, "kl/p_epsilon_steps": 0.725781261920929, "learning_rate": 3.935190552834828e-07, "logits/chosen": 1.9551303386688232, "logits/rejected": 2.1914541721343994, "logps/chosen": -283.0456237792969, "logps/ref_chosen": -279.8695068359375, "logps/ref_rejected": -263.40533447265625, "logps/rejected": -288.39813232421875, "loss": 5.1163, "rewards/accuracies": 0.723437488079071, "rewards/chosen": -0.018750619143247604, "rewards/margins": 0.1234726533293724, "rewards/rejected": -0.1422232687473297, "step": 180 }, { "epoch": 0.39790575916230364, "epsilon_dpo/beta": 0.005460767075419426, "epsilon_dpo/beta_margin_grad_mean": -0.462840735912323, "epsilon_dpo/beta_margin_grad_std": 0.05923638492822647, "epsilon_dpo/beta_margin_mean": 0.15091852843761444, "epsilon_dpo/beta_margin_std": 0.24113008379936218, "epsilon_dpo/loss_margin_mean": 27.910152435302734, "grad_norm": 19.144001007080078, "kl/avg_steps": 0.47578126192092896, "kl/beta": 0.005486341658979654, "kl/n_epsilon_steps": 0.25703126192092896, "kl/p_epsilon_steps": 0.7328125238418579, "learning_rate": 3.781574579820464e-07, "logits/chosen": 1.913297414779663, "logits/rejected": 2.166954517364502, "logps/chosen": -288.5598449707031, "logps/ref_chosen": -278.2532958984375, "logps/ref_rejected": -257.45025634765625, "logps/rejected": -295.66693115234375, "loss": 5.0227, "rewards/accuracies": 0.741406261920929, "rewards/chosen": -0.05687868595123291, "rewards/margins": 0.15091851353645325, "rewards/rejected": -0.20779721438884735, "step": 190 }, { "epoch": 0.418848167539267, "epsilon_dpo/beta": 0.005235456861555576, "epsilon_dpo/beta_margin_grad_mean": -0.4651154577732086, "epsilon_dpo/beta_margin_grad_std": 0.06457895785570145, "epsilon_dpo/beta_margin_mean": 0.14193181693553925, "epsilon_dpo/beta_margin_std": 0.26321619749069214, "epsilon_dpo/loss_margin_mean": 27.4693603515625, "grad_norm": 20.511478424072266, "kl/avg_steps": 0.38749998807907104, "kl/beta": 0.005255300085991621, "kl/n_epsilon_steps": 0.30390626192092896, "kl/p_epsilon_steps": 0.69140625, "learning_rate": 3.621088951385353e-07, "logits/chosen": 1.876455307006836, "logits/rejected": 2.166574001312256, "logps/chosen": -285.0974426269531, "logps/ref_chosen": -275.12750244140625, "logps/ref_rejected": -260.0728759765625, "logps/rejected": -297.5121154785156, "loss": 5.0674, "rewards/accuracies": 0.70703125, "rewards/chosen": -0.053233105689287186, "rewards/margins": 0.14193184673786163, "rewards/rejected": -0.19516493380069733, "step": 200 }, { "epoch": 0.418848167539267, "eval_epsilon_dpo/beta": 0.00512322410941124, "eval_epsilon_dpo/beta_margin_grad_mean": -0.464358389377594, "eval_epsilon_dpo/beta_margin_grad_std": 0.06305021047592163, "eval_epsilon_dpo/beta_margin_mean": 0.14517197012901306, "eval_epsilon_dpo/beta_margin_std": 0.25747936964035034, "eval_epsilon_dpo/loss_margin_mean": 28.677000045776367, "eval_kl/n_epsilon_steps": 0.2930000126361847, "eval_kl/p_epsilon_steps": 0.6990000009536743, "eval_logits/chosen": 1.8063491582870483, "eval_logits/rejected": 2.155062198638916, "eval_logps/chosen": -291.77764892578125, "eval_logps/ref_chosen": -280.4282531738281, "eval_logps/ref_rejected": -264.7044677734375, "eval_logps/rejected": -304.7308654785156, "eval_loss": 0.6321755647659302, "eval_rewards/accuracies": 0.7170000076293945, "eval_rewards/chosen": -0.05901862308382988, "eval_rewards/margins": 0.14517197012901306, "eval_rewards/rejected": -0.20419058203697205, "eval_runtime": 103.5445, "eval_samples_per_second": 19.315, "eval_steps_per_second": 1.207, "step": 200 }, { "epoch": 0.4397905759162304, "epsilon_dpo/beta": 0.005026308819651604, "epsilon_dpo/beta_margin_grad_mean": -0.4626430571079254, "epsilon_dpo/beta_margin_grad_std": 0.06565666198730469, "epsilon_dpo/beta_margin_mean": 0.15212179720401764, "epsilon_dpo/beta_margin_std": 0.2678548991680145, "epsilon_dpo/loss_margin_mean": 30.614501953125, "grad_norm": 30.989282608032227, "kl/avg_steps": 0.4203124940395355, "kl/beta": 0.005047028884291649, "kl/n_epsilon_steps": 0.28437501192092896, "kl/p_epsilon_steps": 0.7046874761581421, "learning_rate": 3.454593922550693e-07, "logits/chosen": 1.8265072107315063, "logits/rejected": 2.06158185005188, "logps/chosen": -291.03253173828125, "logps/ref_chosen": -279.7332763671875, "logps/ref_rejected": -267.92437744140625, "logps/rejected": -309.8381042480469, "loss": 5.0314, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.057643067091703415, "rewards/margins": 0.15212179720401764, "rewards/rejected": -0.20976486802101135, "step": 210 }, { "epoch": 0.4607329842931937, "epsilon_dpo/beta": 0.004815506748855114, "epsilon_dpo/beta_margin_grad_mean": -0.46018725633621216, "epsilon_dpo/beta_margin_grad_std": 0.06686625629663467, "epsilon_dpo/beta_margin_mean": 0.16237930953502655, "epsilon_dpo/beta_margin_std": 0.2736971378326416, "epsilon_dpo/loss_margin_mean": 34.08965301513672, "grad_norm": 27.191370010375977, "kl/avg_steps": 0.4453125, "kl/beta": 0.004836562555283308, "kl/n_epsilon_steps": 0.27265626192092896, "kl/p_epsilon_steps": 0.717968761920929, "learning_rate": 3.2829819606729477e-07, "logits/chosen": 1.8367538452148438, "logits/rejected": 2.1368610858917236, "logps/chosen": -304.51153564453125, "logps/ref_chosen": -287.2923583984375, "logps/ref_rejected": -270.8887023925781, "logps/rejected": -322.1975402832031, "loss": 4.9966, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.08366179466247559, "rewards/margins": 0.16237932443618774, "rewards/rejected": -0.24604110419750214, "step": 220 }, { "epoch": 0.4816753926701571, "epsilon_dpo/beta": 0.004599227569997311, "epsilon_dpo/beta_margin_grad_mean": -0.45680707693099976, "epsilon_dpo/beta_margin_grad_std": 0.06870144605636597, "epsilon_dpo/beta_margin_mean": 0.1762588918209076, "epsilon_dpo/beta_margin_std": 0.2809893488883972, "epsilon_dpo/loss_margin_mean": 38.755615234375, "grad_norm": 22.937519073486328, "kl/avg_steps": 0.47734373807907104, "kl/beta": 0.004620816558599472, "kl/n_epsilon_steps": 0.2593750059604645, "kl/p_epsilon_steps": 0.7367187738418579, "learning_rate": 3.1071729615293424e-07, "logits/chosen": 1.7133830785751343, "logits/rejected": 2.039473533630371, "logps/chosen": -293.60247802734375, "logps/ref_chosen": -272.74945068359375, "logps/ref_rejected": -258.1266784667969, "logps/rejected": -317.7353515625, "loss": 4.9502, "rewards/accuracies": 0.739062488079071, "rewards/chosen": -0.09684249013662338, "rewards/margins": 0.1762588918209076, "rewards/rejected": -0.2731013596057892, "step": 230 }, { "epoch": 0.5026178010471204, "epsilon_dpo/beta": 0.0043902210891246796, "epsilon_dpo/beta_margin_grad_mean": -0.457236111164093, "epsilon_dpo/beta_margin_grad_std": 0.07059483975172043, "epsilon_dpo/beta_margin_mean": 0.17473134398460388, "epsilon_dpo/beta_margin_std": 0.2893211245536804, "epsilon_dpo/loss_margin_mean": 40.25088882446289, "grad_norm": 22.779020309448242, "kl/avg_steps": 0.4468750059604645, "kl/beta": 0.004409492947161198, "kl/n_epsilon_steps": 0.2718749940395355, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.9281093183781403e-07, "logits/chosen": 1.7209564447402954, "logits/rejected": 2.0882318019866943, "logps/chosen": -300.35296630859375, "logps/ref_chosen": -280.094970703125, "logps/ref_rejected": -263.1619873046875, "logps/rejected": -323.6708679199219, "loss": 4.9599, "rewards/accuracies": 0.7289062738418579, "rewards/chosen": -0.09005247056484222, "rewards/margins": 0.17473134398460388, "rewards/rejected": -0.2647838294506073, "step": 240 }, { "epoch": 0.5235602094240838, "epsilon_dpo/beta": 0.00419188616797328, "epsilon_dpo/beta_margin_grad_mean": -0.4556571841239929, "epsilon_dpo/beta_margin_grad_std": 0.0703204870223999, "epsilon_dpo/beta_margin_mean": 0.18116165697574615, "epsilon_dpo/beta_margin_std": 0.2881784737110138, "epsilon_dpo/loss_margin_mean": 43.624481201171875, "grad_norm": 39.10613250732422, "kl/avg_steps": 0.4593749940395355, "kl/beta": 0.004210834391415119, "kl/n_epsilon_steps": 0.26640623807907104, "kl/p_epsilon_steps": 0.725781261920929, "learning_rate": 2.7467508704251135e-07, "logits/chosen": 1.741624116897583, "logits/rejected": 1.9895031452178955, "logps/chosen": -296.340576171875, "logps/ref_chosen": -279.10601806640625, "logps/ref_rejected": -255.9159698486328, "logps/rejected": -316.7749938964844, "loss": 4.9365, "rewards/accuracies": 0.7359374761581421, "rewards/chosen": -0.07299315184354782, "rewards/margins": 0.18116167187690735, "rewards/rejected": -0.25415483117103577, "step": 250 }, { "epoch": 0.5445026178010471, "epsilon_dpo/beta": 0.004008334130048752, "epsilon_dpo/beta_margin_grad_mean": -0.4575107991695404, "epsilon_dpo/beta_margin_grad_std": 0.07278217375278473, "epsilon_dpo/beta_margin_mean": 0.17386779189109802, "epsilon_dpo/beta_margin_std": 0.2985754609107971, "epsilon_dpo/loss_margin_mean": 43.82888412475586, "grad_norm": 32.33043670654297, "kl/avg_steps": 0.43828123807907104, "kl/beta": 0.004025599919259548, "kl/n_epsilon_steps": 0.2789062559604645, "kl/p_epsilon_steps": 0.7171875238418579, "learning_rate": 2.5640697577740815e-07, "logits/chosen": 1.7184337377548218, "logits/rejected": 1.9476096630096436, "logps/chosen": -306.7433166503906, "logps/ref_chosen": -279.7398986816406, "logps/ref_rejected": -256.90155029296875, "logps/rejected": -327.7337951660156, "loss": 4.9692, "rewards/accuracies": 0.72265625, "rewards/chosen": -0.10899752378463745, "rewards/margins": 0.17386779189109802, "rewards/rejected": -0.28286534547805786, "step": 260 }, { "epoch": 0.5654450261780105, "epsilon_dpo/beta": 0.0038394411094486713, "epsilon_dpo/beta_margin_grad_mean": -0.45551127195358276, "epsilon_dpo/beta_margin_grad_std": 0.07340405881404877, "epsilon_dpo/beta_margin_mean": 0.18199250102043152, "epsilon_dpo/beta_margin_std": 0.30104658007621765, "epsilon_dpo/loss_margin_mean": 47.921356201171875, "grad_norm": 26.059804916381836, "kl/avg_steps": 0.44843751192092896, "kl/beta": 0.00385635276325047, "kl/n_epsilon_steps": 0.27031248807907104, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.381045210440644e-07, "logits/chosen": 1.6957333087921143, "logits/rejected": 1.981131911277771, "logps/chosen": -306.7268981933594, "logps/ref_chosen": -272.6238708496094, "logps/ref_rejected": -256.24176025390625, "logps/rejected": -338.26611328125, "loss": 4.9401, "rewards/accuracies": 0.7359374761581421, "rewards/chosen": -0.13194236159324646, "rewards/margins": 0.18199248611927032, "rewards/rejected": -0.3139348328113556, "step": 270 }, { "epoch": 0.5863874345549738, "epsilon_dpo/beta": 0.0036588613875210285, "epsilon_dpo/beta_margin_grad_mean": -0.4539538323879242, "epsilon_dpo/beta_margin_grad_std": 0.07207532227039337, "epsilon_dpo/beta_margin_mean": 0.18829122185707092, "epsilon_dpo/beta_margin_std": 0.2957257628440857, "epsilon_dpo/loss_margin_mean": 51.929046630859375, "grad_norm": 21.85626220703125, "kl/avg_steps": 0.48906248807907104, "kl/beta": 0.0036765006370842457, "kl/n_epsilon_steps": 0.25078123807907104, "kl/p_epsilon_steps": 0.7398437261581421, "learning_rate": 2.1986582993616925e-07, "logits/chosen": 1.5749285221099854, "logits/rejected": 1.9680347442626953, "logps/chosen": -298.32781982421875, "logps/ref_chosen": -272.6661682128906, "logps/ref_rejected": -259.3951721191406, "logps/rejected": -336.98590087890625, "loss": 4.9148, "rewards/accuracies": 0.749218761920929, "rewards/chosen": -0.09480254352092743, "rewards/margins": 0.18829122185707092, "rewards/rejected": -0.28309375047683716, "step": 280 }, { "epoch": 0.6073298429319371, "epsilon_dpo/beta": 0.00350450468249619, "epsilon_dpo/beta_margin_grad_mean": -0.46083664894104004, "epsilon_dpo/beta_margin_grad_std": 0.07311917841434479, "epsilon_dpo/beta_margin_mean": 0.1602335274219513, "epsilon_dpo/beta_margin_std": 0.2994373142719269, "epsilon_dpo/loss_margin_mean": 46.23841094970703, "grad_norm": 34.233943939208984, "kl/avg_steps": 0.3851562440395355, "kl/beta": 0.003517721313983202, "kl/n_epsilon_steps": 0.3031249940395355, "kl/p_epsilon_steps": 0.688281238079071, "learning_rate": 2.0178866775369774e-07, "logits/chosen": 1.578467845916748, "logits/rejected": 1.903235673904419, "logps/chosen": -323.2730407714844, "logps/ref_chosen": -287.4728698730469, "logps/ref_rejected": -268.4922790527344, "logps/rejected": -350.5308532714844, "loss": 5.0191, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": -0.1263677179813385, "rewards/margins": 0.1602335274219513, "rewards/rejected": -0.2866012454032898, "step": 290 }, { "epoch": 0.6282722513089005, "epsilon_dpo/beta": 0.003364184172824025, "epsilon_dpo/beta_margin_grad_mean": -0.4569614827632904, "epsilon_dpo/beta_margin_grad_std": 0.07025741040706635, "epsilon_dpo/beta_margin_mean": 0.17601335048675537, "epsilon_dpo/beta_margin_std": 0.2878516614437103, "epsilon_dpo/loss_margin_mean": 52.840850830078125, "grad_norm": 19.78177833557129, "kl/avg_steps": 0.4351562559604645, "kl/beta": 0.003378564026206732, "kl/n_epsilon_steps": 0.2789062559604645, "kl/p_epsilon_steps": 0.714062511920929, "learning_rate": 1.839699339491937e-07, "logits/chosen": 1.6086456775665283, "logits/rejected": 1.9709374904632568, "logps/chosen": -301.5176696777344, "logps/ref_chosen": -273.06646728515625, "logps/ref_rejected": -266.1439208984375, "logps/rejected": -347.4358825683594, "loss": 4.9542, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.09642257541418076, "rewards/margins": 0.17601335048675537, "rewards/rejected": -0.2724359333515167, "step": 300 }, { "epoch": 0.6492146596858639, "epsilon_dpo/beta": 0.0032132375054061413, "epsilon_dpo/beta_margin_grad_mean": -0.45566052198410034, "epsilon_dpo/beta_margin_grad_std": 0.06932147592306137, "epsilon_dpo/beta_margin_mean": 0.1810220181941986, "epsilon_dpo/beta_margin_std": 0.28379470109939575, "epsilon_dpo/loss_margin_mean": 56.88977813720703, "grad_norm": 20.059579849243164, "kl/avg_steps": 0.4867187440395355, "kl/beta": 0.003228639718145132, "kl/n_epsilon_steps": 0.25468748807907104, "kl/p_epsilon_steps": 0.741406261920929, "learning_rate": 1.6650514271527465e-07, "logits/chosen": 1.593857765197754, "logits/rejected": 1.952932596206665, "logps/chosen": -313.94219970703125, "logps/ref_chosen": -276.8886413574219, "logps/ref_rejected": -256.80865478515625, "logps/rejected": -350.75201416015625, "loss": 4.9339, "rewards/accuracies": 0.7367187738418579, "rewards/chosen": -0.11971668899059296, "rewards/margins": 0.1810220181941986, "rewards/rejected": -0.30073872208595276, "step": 310 }, { "epoch": 0.6701570680628273, "epsilon_dpo/beta": 0.0030656014569103718, "epsilon_dpo/beta_margin_grad_mean": -0.45544466376304626, "epsilon_dpo/beta_margin_grad_std": 0.06911682337522507, "epsilon_dpo/beta_margin_mean": 0.18175189197063446, "epsilon_dpo/beta_margin_std": 0.2825908660888672, "epsilon_dpo/loss_margin_mean": 59.900352478027344, "grad_norm": 24.982254028320312, "kl/avg_steps": 0.47343748807907104, "kl/beta": 0.003079873975366354, "kl/n_epsilon_steps": 0.2593750059604645, "kl/p_epsilon_steps": 0.7328125238418579, "learning_rate": 1.4948791099758052e-07, "logits/chosen": 1.6970676183700562, "logits/rejected": 2.0628037452697754, "logps/chosen": -321.9020080566406, "logps/ref_chosen": -282.2432556152344, "logps/ref_rejected": -256.89776611328125, "logps/rejected": -356.45684814453125, "loss": 4.9303, "rewards/accuracies": 0.73828125, "rewards/chosen": -0.12259833514690399, "rewards/margins": 0.18175189197063446, "rewards/rejected": -0.30435022711753845, "step": 320 }, { "epoch": 0.6910994764397905, "epsilon_dpo/beta": 0.002925318432971835, "epsilon_dpo/beta_margin_grad_mean": -0.45976167917251587, "epsilon_dpo/beta_margin_grad_std": 0.06790686398744583, "epsilon_dpo/beta_margin_mean": 0.16398653388023376, "epsilon_dpo/beta_margin_std": 0.27741676568984985, "epsilon_dpo/loss_margin_mean": 56.63254928588867, "grad_norm": 35.780921936035156, "kl/avg_steps": 0.46406251192092896, "kl/beta": 0.0029386640526354313, "kl/n_epsilon_steps": 0.2632812559604645, "kl/p_epsilon_steps": 0.727343738079071, "learning_rate": 1.3300945667758012e-07, "logits/chosen": 1.6550931930541992, "logits/rejected": 1.8850772380828857, "logps/chosen": -316.6177062988281, "logps/ref_chosen": -275.7609558105469, "logps/ref_rejected": -263.5372619628906, "logps/rejected": -361.02655029296875, "loss": 4.9933, "rewards/accuracies": 0.719531238079071, "rewards/chosen": -0.12028974294662476, "rewards/margins": 0.16398653388023376, "rewards/rejected": -0.28427624702453613, "step": 330 }, { "epoch": 0.7120418848167539, "epsilon_dpo/beta": 0.0027930724900215864, "epsilon_dpo/beta_margin_grad_mean": -0.4600375294685364, "epsilon_dpo/beta_margin_grad_std": 0.06828001886606216, "epsilon_dpo/beta_margin_mean": 0.16278859972953796, "epsilon_dpo/beta_margin_std": 0.2784718871116638, "epsilon_dpo/loss_margin_mean": 58.940940856933594, "grad_norm": 19.590518951416016, "kl/avg_steps": 0.4609375, "kl/beta": 0.0028057279996573925, "kl/n_epsilon_steps": 0.26484376192092896, "kl/p_epsilon_steps": 0.725781261920929, "learning_rate": 1.1715810961514072e-07, "logits/chosen": 1.6267999410629272, "logits/rejected": 1.9399261474609375, "logps/chosen": -319.0074157714844, "logps/ref_chosen": -269.4908447265625, "logps/ref_rejected": -253.1649627685547, "logps/rejected": -361.62249755859375, "loss": 4.9976, "rewards/accuracies": 0.725781261920929, "rewards/chosen": -0.13907715678215027, "rewards/margins": 0.16278859972953796, "rewards/rejected": -0.30186575651168823, "step": 340 }, { "epoch": 0.7329842931937173, "epsilon_dpo/beta": 0.0026765193324536085, "epsilon_dpo/beta_margin_grad_mean": -0.4628540575504303, "epsilon_dpo/beta_margin_grad_std": 0.06378835439682007, "epsilon_dpo/beta_margin_mean": 0.15105712413787842, "epsilon_dpo/beta_margin_std": 0.25988245010375977, "epsilon_dpo/loss_margin_mean": 57.061004638671875, "grad_norm": 20.615802764892578, "kl/avg_steps": 0.4242187440395355, "kl/beta": 0.0026876390911638737, "kl/n_epsilon_steps": 0.28515625, "kl/p_epsilon_steps": 0.7093750238418579, "learning_rate": 1.0201883817182949e-07, "logits/chosen": 1.6629711389541626, "logits/rejected": 2.020021915435791, "logps/chosen": -344.3343811035156, "logps/ref_chosen": -284.06365966796875, "logps/ref_rejected": -260.7166442871094, "logps/rejected": -378.0483703613281, "loss": 5.0309, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.16222040355205536, "rewards/margins": 0.15105712413787842, "rewards/rejected": -0.31327754259109497, "step": 350 }, { "epoch": 0.7539267015706806, "epsilon_dpo/beta": 0.002562676090747118, "epsilon_dpo/beta_margin_grad_mean": -0.4644971787929535, "epsilon_dpo/beta_margin_grad_std": 0.06188509613275528, "epsilon_dpo/beta_margin_mean": 0.14429207146167755, "epsilon_dpo/beta_margin_std": 0.2519903779029846, "epsilon_dpo/loss_margin_mean": 56.94682693481445, "grad_norm": 28.58539581298828, "kl/avg_steps": 0.4359374940395355, "kl/beta": 0.0025736321695148945, "kl/n_epsilon_steps": 0.27656251192092896, "kl/p_epsilon_steps": 0.7124999761581421, "learning_rate": 8.76727937529367e-08, "logits/chosen": 1.558531403541565, "logits/rejected": 1.9686288833618164, "logps/chosen": -326.70318603515625, "logps/ref_chosen": -269.2133483886719, "logps/ref_rejected": -251.10647583007812, "logps/rejected": -365.5430908203125, "loss": 5.0524, "rewards/accuracies": 0.7132812738418579, "rewards/chosen": -0.14818084239959717, "rewards/margins": 0.14429204165935516, "rewards/rejected": -0.2924729287624359, "step": 360 }, { "epoch": 0.774869109947644, "epsilon_dpo/beta": 0.0024432847276329994, "epsilon_dpo/beta_margin_grad_mean": -0.45972761511802673, "epsilon_dpo/beta_margin_grad_std": 0.06026551127433777, "epsilon_dpo/beta_margin_mean": 0.16353142261505127, "epsilon_dpo/beta_margin_std": 0.2452823668718338, "epsilon_dpo/loss_margin_mean": 67.50531005859375, "grad_norm": 18.816442489624023, "kl/avg_steps": 0.500781238079071, "kl/beta": 0.0024553355760872364, "kl/n_epsilon_steps": 0.24609375, "kl/p_epsilon_steps": 0.746874988079071, "learning_rate": 7.419687580962222e-08, "logits/chosen": 1.6747153997421265, "logits/rejected": 1.9603767395019531, "logps/chosen": -331.12542724609375, "logps/ref_chosen": -276.8400573730469, "logps/ref_rejected": -257.84912109375, "logps/rejected": -379.6397705078125, "loss": 4.9777, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.13340650498867035, "rewards/margins": 0.16353140771389008, "rewards/rejected": -0.2969379425048828, "step": 370 }, { "epoch": 0.7958115183246073, "epsilon_dpo/beta": 0.0023312487173825502, "epsilon_dpo/beta_margin_grad_mean": -0.46638360619544983, "epsilon_dpo/beta_margin_grad_std": 0.05908365920186043, "epsilon_dpo/beta_margin_mean": 0.13641974329948425, "epsilon_dpo/beta_margin_std": 0.23996075987815857, "epsilon_dpo/loss_margin_mean": 59.121360778808594, "grad_norm": 33.467586517333984, "kl/avg_steps": 0.4359374940395355, "kl/beta": 0.0023412262089550495, "kl/n_epsilon_steps": 0.2789062559604645, "kl/p_epsilon_steps": 0.71484375, "learning_rate": 6.166331963291519e-08, "logits/chosen": 1.7089202404022217, "logits/rejected": 1.9208694696426392, "logps/chosen": -356.5716857910156, "logps/ref_chosen": -294.3582458496094, "logps/ref_rejected": -266.00933837890625, "logps/rejected": -387.34417724609375, "loss": 5.0756, "rewards/accuracies": 0.723437488079071, "rewards/chosen": -0.14578744769096375, "rewards/margins": 0.13641975820064545, "rewards/rejected": -0.2822072207927704, "step": 380 }, { "epoch": 0.8167539267015707, "epsilon_dpo/beta": 0.0022311562206596136, "epsilon_dpo/beta_margin_grad_mean": -0.4652669429779053, "epsilon_dpo/beta_margin_grad_std": 0.05686299130320549, "epsilon_dpo/beta_margin_mean": 0.14073483645915985, "epsilon_dpo/beta_margin_std": 0.23052707314491272, "epsilon_dpo/loss_margin_mean": 63.751487731933594, "grad_norm": 20.419815063476562, "kl/avg_steps": 0.4375, "kl/beta": 0.0022407451178878546, "kl/n_epsilon_steps": 0.2750000059604645, "kl/p_epsilon_steps": 0.7124999761581421, "learning_rate": 5.013930914912476e-08, "logits/chosen": 1.5366142988204956, "logits/rejected": 1.9008190631866455, "logps/chosen": -333.5438537597656, "logps/ref_chosen": -271.92047119140625, "logps/ref_rejected": -263.865478515625, "logps/rejected": -389.2403259277344, "loss": 5.0554, "rewards/accuracies": 0.717968761920929, "rewards/chosen": -0.13834409415721893, "rewards/margins": 0.14073482155799866, "rewards/rejected": -0.2790789306163788, "step": 390 }, { "epoch": 0.837696335078534, "epsilon_dpo/beta": 0.0021363936830312014, "epsilon_dpo/beta_margin_grad_mean": -0.4690118730068207, "epsilon_dpo/beta_margin_grad_std": 0.05419831722974777, "epsilon_dpo/beta_margin_mean": 0.12548907101154327, "epsilon_dpo/beta_margin_std": 0.2197370082139969, "epsilon_dpo/loss_margin_mean": 59.32947540283203, "grad_norm": 16.475208282470703, "kl/avg_steps": 0.42500001192092896, "kl/beta": 0.0021453090012073517, "kl/n_epsilon_steps": 0.2835937440395355, "kl/p_epsilon_steps": 0.7085937261581421, "learning_rate": 3.968661679220467e-08, "logits/chosen": 1.5702852010726929, "logits/rejected": 1.895922064781189, "logps/chosen": -350.1571960449219, "logps/ref_chosen": -284.8265075683594, "logps/ref_rejected": -265.3280944824219, "logps/rejected": -389.98828125, "loss": 5.1073, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.14023001492023468, "rewards/margins": 0.12548907101154327, "rewards/rejected": -0.26571911573410034, "step": 400 }, { "epoch": 0.837696335078534, "eval_epsilon_dpo/beta": 0.002089055487886071, "eval_epsilon_dpo/beta_margin_grad_mean": -0.4698907434940338, "eval_epsilon_dpo/beta_margin_grad_std": 0.05313246697187424, "eval_epsilon_dpo/beta_margin_mean": 0.12187241017818451, "eval_epsilon_dpo/beta_margin_std": 0.2152228057384491, "eval_epsilon_dpo/loss_margin_mean": 59.03139877319336, "eval_kl/n_epsilon_steps": 0.2854999899864197, "eval_kl/p_epsilon_steps": 0.7085000276565552, "eval_logits/chosen": 1.5736112594604492, "eval_logits/rejected": 1.9568898677825928, "eval_logps/chosen": -346.2501220703125, "eval_logps/ref_chosen": -280.4282531738281, "eval_logps/ref_rejected": -264.7044677734375, "eval_logps/rejected": -389.5577392578125, "eval_loss": 0.6402832269668579, "eval_rewards/accuracies": 0.7164999842643738, "eval_rewards/chosen": -0.13826368749141693, "eval_rewards/margins": 0.12187241017818451, "eval_rewards/rejected": -0.26013606786727905, "eval_runtime": 103.0031, "eval_samples_per_second": 19.417, "eval_steps_per_second": 1.214, "step": 400 }, { "epoch": 0.8586387434554974, "epsilon_dpo/beta": 0.0020442053209990263, "epsilon_dpo/beta_margin_grad_mean": -0.46692174673080444, "epsilon_dpo/beta_margin_grad_std": 0.05178702622652054, "epsilon_dpo/beta_margin_mean": 0.13379183411598206, "epsilon_dpo/beta_margin_std": 0.20962686836719513, "epsilon_dpo/loss_margin_mean": 66.03794860839844, "grad_norm": 41.441593170166016, "kl/avg_steps": 0.45390623807907104, "kl/beta": 0.0020533339120447636, "kl/n_epsilon_steps": 0.26875001192092896, "kl/p_epsilon_steps": 0.72265625, "learning_rate": 3.036127238347164e-08, "logits/chosen": 1.612749695777893, "logits/rejected": 1.9225709438323975, "logps/chosen": -344.31646728515625, "logps/ref_chosen": -282.58233642578125, "logps/ref_rejected": -266.00897216796875, "logps/rejected": -393.7810363769531, "loss": 5.0719, "rewards/accuracies": 0.7398437261581421, "rewards/chosen": -0.12682631611824036, "rewards/margins": 0.13379183411598206, "rewards/rejected": -0.2606181502342224, "step": 410 }, { "epoch": 0.8795811518324608, "epsilon_dpo/beta": 0.001955785322934389, "epsilon_dpo/beta_margin_grad_mean": -0.4684430658817291, "epsilon_dpo/beta_margin_grad_std": 0.05116555094718933, "epsilon_dpo/beta_margin_mean": 0.12757208943367004, "epsilon_dpo/beta_margin_std": 0.207074373960495, "epsilon_dpo/loss_margin_mean": 65.90140533447266, "grad_norm": 19.453214645385742, "kl/avg_steps": 0.46406251192092896, "kl/beta": 0.001964703667908907, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.729687511920929, "learning_rate": 2.2213262793589482e-08, "logits/chosen": 1.5862172842025757, "logits/rejected": 1.9309051036834717, "logps/chosen": -341.8611755371094, "logps/ref_chosen": -281.11688232421875, "logps/ref_rejected": -263.7762145996094, "logps/rejected": -390.4219665527344, "loss": 5.094, "rewards/accuracies": 0.73046875, "rewards/chosen": -0.11936762183904648, "rewards/margins": 0.12757208943367004, "rewards/rejected": -0.24693970382213593, "step": 420 }, { "epoch": 0.900523560209424, "epsilon_dpo/beta": 0.001865379512310028, "epsilon_dpo/beta_margin_grad_mean": -0.46811485290527344, "epsilon_dpo/beta_margin_grad_std": 0.0480102077126503, "epsilon_dpo/beta_margin_mean": 0.1287469118833542, "epsilon_dpo/beta_margin_std": 0.19402021169662476, "epsilon_dpo/loss_margin_mean": 69.65689849853516, "grad_norm": 17.445083618164062, "kl/avg_steps": 0.47343748807907104, "kl/beta": 0.0018740678206086159, "kl/n_epsilon_steps": 0.25859373807907104, "kl/p_epsilon_steps": 0.7320312261581421, "learning_rate": 1.5286263996730026e-08, "logits/chosen": 1.5173814296722412, "logits/rejected": 1.9054569005966187, "logps/chosen": -337.60888671875, "logps/ref_chosen": -282.20098876953125, "logps/ref_rejected": -257.6202392578125, "logps/rejected": -382.68505859375, "loss": 5.0847, "rewards/accuracies": 0.741406261920929, "rewards/chosen": -0.10385727882385254, "rewards/margins": 0.1287469118833542, "rewards/rejected": -0.23260419070720673, "step": 430 }, { "epoch": 0.9214659685863874, "epsilon_dpo/beta": 0.0017827233532443643, "epsilon_dpo/beta_margin_grad_mean": -0.4748317301273346, "epsilon_dpo/beta_margin_grad_std": 0.0455574207007885, "epsilon_dpo/beta_margin_mean": 0.1015293225646019, "epsilon_dpo/beta_margin_std": 0.18387706577777863, "epsilon_dpo/loss_margin_mean": 57.55500030517578, "grad_norm": 15.522335052490234, "kl/avg_steps": 0.42109376192092896, "kl/beta": 0.0017900926759466529, "kl/n_epsilon_steps": 0.2835937440395355, "kl/p_epsilon_steps": 0.7046874761581421, "learning_rate": 9.617406953185136e-09, "logits/chosen": 1.6178176403045654, "logits/rejected": 1.9510142803192139, "logps/chosen": -333.5023498535156, "logps/ref_chosen": -272.00103759765625, "logps/ref_rejected": -258.02813720703125, "logps/rejected": -377.08441162109375, "loss": 5.1835, "rewards/accuracies": 0.703906238079071, "rewards/chosen": -0.11019601672887802, "rewards/margins": 0.1015293151140213, "rewards/rejected": -0.2117253541946411, "step": 440 }, { "epoch": 0.9424083769633508, "epsilon_dpo/beta": 0.001706903101876378, "epsilon_dpo/beta_margin_grad_mean": -0.4734385013580322, "epsilon_dpo/beta_margin_grad_std": 0.044034797698259354, "epsilon_dpo/beta_margin_mean": 0.10707694292068481, "epsilon_dpo/beta_margin_std": 0.1776462197303772, "epsilon_dpo/loss_margin_mean": 63.39220428466797, "grad_norm": 16.360170364379883, "kl/avg_steps": 0.4546875059604645, "kl/beta": 0.0017145348247140646, "kl/n_epsilon_steps": 0.2671875059604645, "kl/p_epsilon_steps": 0.721875011920929, "learning_rate": 5.2370785753763356e-09, "logits/chosen": 1.5754592418670654, "logits/rejected": 1.9332977533340454, "logps/chosen": -337.49688720703125, "logps/ref_chosen": -278.8232421875, "logps/ref_rejected": -256.79656982421875, "logps/rejected": -378.8623962402344, "loss": 5.16, "rewards/accuracies": 0.72265625, "rewards/chosen": -0.10077029466629028, "rewards/margins": 0.10707694292068481, "rewards/rejected": -0.2078472375869751, "step": 450 }, { "epoch": 0.9633507853403142, "epsilon_dpo/beta": 0.0016306890174746513, "epsilon_dpo/beta_margin_grad_mean": -0.47516068816185, "epsilon_dpo/beta_margin_grad_std": 0.04221952706575394, "epsilon_dpo/beta_margin_mean": 0.10008412599563599, "epsilon_dpo/beta_margin_std": 0.17021533846855164, "epsilon_dpo/loss_margin_mean": 61.97832107543945, "grad_norm": 14.846392631530762, "kl/avg_steps": 0.4546875059604645, "kl/beta": 0.0016379815060645342, "kl/n_epsilon_steps": 0.26953125, "kl/p_epsilon_steps": 0.7242187261581421, "learning_rate": 2.168758844148272e-09, "logits/chosen": 1.6337049007415771, "logits/rejected": 1.9634275436401367, "logps/chosen": -353.42510986328125, "logps/ref_chosen": -294.84185791015625, "logps/ref_rejected": -276.9571533203125, "logps/rejected": -397.5187072753906, "loss": 5.184, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.09599287807941437, "rewards/margins": 0.10008411109447479, "rewards/rejected": -0.19607700407505035, "step": 460 }, { "epoch": 0.9842931937172775, "epsilon_dpo/beta": 0.0015589601825922728, "epsilon_dpo/beta_margin_grad_mean": -0.47446101903915405, "epsilon_dpo/beta_margin_grad_std": 0.04050491005182266, "epsilon_dpo/beta_margin_mean": 0.10283418744802475, "epsilon_dpo/beta_margin_std": 0.16317032277584076, "epsilon_dpo/loss_margin_mean": 66.61624145507812, "grad_norm": 14.901313781738281, "kl/avg_steps": 0.46562498807907104, "kl/beta": 0.0015660974895581603, "kl/n_epsilon_steps": 0.2632812559604645, "kl/p_epsilon_steps": 0.7289062738418579, "learning_rate": 4.288949484559934e-10, "logits/chosen": 1.5405309200286865, "logits/rejected": 1.751405119895935, "logps/chosen": -339.19415283203125, "logps/ref_chosen": -285.2023620605469, "logps/ref_rejected": -255.1339569091797, "logps/rejected": -375.7419738769531, "loss": 5.1712, "rewards/accuracies": 0.733593761920929, "rewards/chosen": -0.08475174009799957, "rewards/margins": 0.10283420234918594, "rewards/rejected": -0.18758592009544373, "step": 470 }, { "epoch": 0.9989528795811519, "step": 477, "total_flos": 0.0, "train_loss": 5.1642030939865915, "train_runtime": 8287.5392, "train_samples_per_second": 7.377, "train_steps_per_second": 0.058 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }