{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988221436984688, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020939667582777124, "grad_norm": 4.153600215911865, "learning_rate": 2.083333333333333e-08, "logits/chosen": -0.385508269071579, "logits/rejected": -0.3815978169441223, "logps/chosen": -313.31878662109375, "logps/ref_chosen": -313.1247253417969, "logps/ref_rejected": -257.44195556640625, "logps/rejected": -257.5481262207031, "loss": 0.6932, "margin_dpo/margin_mean": -0.08785581588745117, "margin_dpo/margin_std": 0.37070298194885254, "step": 1 }, { "epoch": 0.020939667582777124, "grad_norm": 3.972473382949829, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.4696977436542511, "logits/rejected": -0.4675690829753876, "logps/chosen": -301.630859375, "logps/ref_chosen": -301.46142578125, "logps/ref_rejected": -265.9523620605469, "logps/rejected": -266.04180908203125, "loss": 0.6933, "margin_dpo/margin_mean": -0.08001349866390228, "margin_dpo/margin_std": 0.32228612899780273, "step": 10 }, { "epoch": 0.04187933516555425, "grad_norm": 5.9276628494262695, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.46030598878860474, "logits/rejected": -0.49390584230422974, "logps/chosen": -287.4084777832031, "logps/ref_chosen": -287.79296875, "logps/ref_rejected": -241.2596435546875, "logps/rejected": -241.0130157470703, "loss": 0.6924, "margin_dpo/margin_mean": 0.13785475492477417, "margin_dpo/margin_std": 0.439021497964859, "step": 20 }, { "epoch": 0.06281900274833137, "grad_norm": 3.928417444229126, "learning_rate": 4.997836020254328e-07, "logits/chosen": -0.4774892330169678, "logits/rejected": -0.48092085123062134, "logps/chosen": -264.83526611328125, "logps/ref_chosen": -267.33026123046875, "logps/ref_rejected": -244.6634063720703, "logps/rejected": -242.9745635986328, "loss": 0.6888, "margin_dpo/margin_mean": 0.8061596751213074, "margin_dpo/margin_std": 1.0586752891540527, "step": 30 }, { "epoch": 0.0837586703311085, "grad_norm": 4.087418079376221, "learning_rate": 4.984625263805177e-07, "logits/chosen": -0.47675904631614685, "logits/rejected": -0.49461251497268677, "logps/chosen": -283.3618469238281, "logps/ref_chosen": -290.2306823730469, "logps/ref_rejected": -270.0351257324219, "logps/rejected": -265.93634033203125, "loss": 0.6806, "margin_dpo/margin_mean": 2.7700228691101074, "margin_dpo/margin_std": 3.1351680755615234, "step": 40 }, { "epoch": 0.10469833791388562, "grad_norm": 5.013513088226318, "learning_rate": 4.9594693969485e-07, "logits/chosen": -0.46262645721435547, "logits/rejected": -0.4923989176750183, "logps/chosen": -260.5640869140625, "logps/ref_chosen": -266.95281982421875, "logps/ref_rejected": -271.7529602050781, "logps/rejected": -269.3885498046875, "loss": 0.6734, "margin_dpo/margin_mean": 4.024331569671631, "margin_dpo/margin_std": 5.195467472076416, "step": 50 }, { "epoch": 0.12563800549666274, "grad_norm": 4.223069667816162, "learning_rate": 4.922489359292927e-07, "logits/chosen": -0.47976773977279663, "logits/rejected": -0.4877360463142395, "logps/chosen": -289.54461669921875, "logps/ref_chosen": -288.6689758300781, "logps/ref_rejected": -254.53158569335938, "logps/rejected": -261.9698486328125, "loss": 0.6622, "margin_dpo/margin_mean": 6.56261682510376, "margin_dpo/margin_std": 8.391728401184082, "step": 60 }, { "epoch": 0.14657767307943986, "grad_norm": 19.633056640625, "learning_rate": 4.873862936454303e-07, "logits/chosen": -0.47945109009742737, "logits/rejected": -0.4913012981414795, "logps/chosen": -302.0457763671875, "logps/ref_chosen": -299.79888916015625, "logps/ref_rejected": -278.92218017578125, "logps/rejected": -291.8330078125, "loss": 0.6488, "margin_dpo/margin_mean": 10.66395092010498, "margin_dpo/margin_std": 12.481794357299805, "step": 70 }, { "epoch": 0.167517340662217, "grad_norm": 10.23404598236084, "learning_rate": 4.813823905331703e-07, "logits/chosen": -0.37753456830978394, "logits/rejected": -0.33191266655921936, "logps/chosen": -276.65997314453125, "logps/ref_chosen": -273.9964294433594, "logps/ref_rejected": -261.94219970703125, "logps/rejected": -278.61053466796875, "loss": 0.6335, "margin_dpo/margin_mean": 14.00488567352295, "margin_dpo/margin_std": 14.380203247070312, "step": 80 }, { "epoch": 0.18845700824499412, "grad_norm": 16.92243766784668, "learning_rate": 4.74266091019916e-07, "logits/chosen": -0.34219780564308167, "logits/rejected": -0.2503359913825989, "logps/chosen": -305.6651916503906, "logps/ref_chosen": -290.7003479003906, "logps/ref_rejected": -248.63436889648438, "logps/rejected": -281.1633605957031, "loss": 0.6279, "margin_dpo/margin_mean": 17.564119338989258, "margin_dpo/margin_std": 19.235614776611328, "step": 90 }, { "epoch": 0.20939667582777124, "grad_norm": 19.076921463012695, "learning_rate": 4.660716075016441e-07, "logits/chosen": -0.18392817676067352, "logits/rejected": -0.14755138754844666, "logps/chosen": -284.04473876953125, "logps/ref_chosen": -273.87774658203125, "logps/ref_rejected": -261.7587890625, "logps/rejected": -292.94903564453125, "loss": 0.617, "margin_dpo/margin_mean": 21.023273468017578, "margin_dpo/margin_std": 23.208118438720703, "step": 100 }, { "epoch": 0.23033634341054834, "grad_norm": 11.625693321228027, "learning_rate": 4.568383358630315e-07, "logits/chosen": -0.0882129892706871, "logits/rejected": 0.031399063766002655, "logps/chosen": -312.0311584472656, "logps/ref_chosen": -294.00189208984375, "logps/ref_rejected": -256.91058349609375, "logps/rejected": -298.51300048828125, "loss": 0.5953, "margin_dpo/margin_mean": 23.573087692260742, "margin_dpo/margin_std": 27.610767364501953, "step": 110 }, { "epoch": 0.25127601099332547, "grad_norm": 59.717994689941406, "learning_rate": 4.466106660773884e-07, "logits/chosen": -0.06424394994974136, "logits/rejected": 0.046889692544937134, "logps/chosen": -304.84332275390625, "logps/ref_chosen": -285.0730895996094, "logps/ref_rejected": -267.43023681640625, "logps/rejected": -312.349609375, "loss": 0.6073, "margin_dpo/margin_mean": 25.149160385131836, "margin_dpo/margin_std": 28.71588706970215, "step": 120 }, { "epoch": 0.2722156785761026, "grad_norm": 14.268017768859863, "learning_rate": 4.3543776879695804e-07, "logits/chosen": 0.049873873591423035, "logits/rejected": 0.10700122267007828, "logps/chosen": -326.23651123046875, "logps/ref_chosen": -293.5408020019531, "logps/ref_rejected": -288.2164001464844, "logps/rejected": -354.05877685546875, "loss": 0.5941, "margin_dpo/margin_mean": 33.14662170410156, "margin_dpo/margin_std": 31.353755950927734, "step": 130 }, { "epoch": 0.2931553461588797, "grad_norm": 42.10697937011719, "learning_rate": 4.233733589595746e-07, "logits/chosen": 0.044216498732566833, "logits/rejected": 0.12556368112564087, "logps/chosen": -332.8862609863281, "logps/ref_chosen": -293.3451232910156, "logps/ref_rejected": -279.8287658691406, "logps/rejected": -351.7282409667969, "loss": 0.597, "margin_dpo/margin_mean": 32.35832214355469, "margin_dpo/margin_std": 35.75043869018555, "step": 140 }, { "epoch": 0.3140950137416568, "grad_norm": 10.703039169311523, "learning_rate": 4.104754375481664e-07, "logits/chosen": 0.01981363818049431, "logits/rejected": 0.15257558226585388, "logps/chosen": -323.87200927734375, "logps/ref_chosen": -292.59295654296875, "logps/ref_rejected": -258.49530029296875, "logps/rejected": -323.07672119140625, "loss": 0.5844, "margin_dpo/margin_mean": 33.3023681640625, "margin_dpo/margin_std": 36.2221565246582, "step": 150 }, { "epoch": 0.335034681324434, "grad_norm": 26.41683006286621, "learning_rate": 3.9680601274462354e-07, "logits/chosen": 0.16788892447948456, "logits/rejected": 0.2648904025554657, "logps/chosen": -324.91375732421875, "logps/ref_chosen": -279.3182678222656, "logps/ref_rejected": -266.2412414550781, "logps/rejected": -357.54644775390625, "loss": 0.5572, "margin_dpo/margin_mean": 45.709693908691406, "margin_dpo/margin_std": 37.458805084228516, "step": 160 }, { "epoch": 0.3559743489072111, "grad_norm": 20.263166427612305, "learning_rate": 3.824308018186143e-07, "logits/chosen": -0.04693519324064255, "logits/rejected": 0.14640530943870544, "logps/chosen": -361.36346435546875, "logps/ref_chosen": -297.69024658203125, "logps/ref_rejected": -246.70840454101562, "logps/rejected": -353.06201171875, "loss": 0.5614, "margin_dpo/margin_mean": 42.680423736572266, "margin_dpo/margin_std": 40.208683013916016, "step": 170 }, { "epoch": 0.37691401648998824, "grad_norm": 21.05248260498047, "learning_rate": 3.6741891518455146e-07, "logits/chosen": 0.04265492409467697, "logits/rejected": 0.12863174080848694, "logps/chosen": -336.97955322265625, "logps/ref_chosen": -282.07220458984375, "logps/ref_rejected": -267.81048583984375, "logps/rejected": -365.7275390625, "loss": 0.579, "margin_dpo/margin_mean": 43.00971984863281, "margin_dpo/margin_std": 39.17428207397461, "step": 180 }, { "epoch": 0.39785368407276533, "grad_norm": 13.581904411315918, "learning_rate": 3.5184252414564197e-07, "logits/chosen": 0.019876545295119286, "logits/rejected": 0.14703558385372162, "logps/chosen": -334.26300048828125, "logps/ref_chosen": -277.98150634765625, "logps/ref_rejected": -251.03671264648438, "logps/rejected": -350.8268127441406, "loss": 0.555, "margin_dpo/margin_mean": 43.508628845214844, "margin_dpo/margin_std": 42.15697479248047, "step": 190 }, { "epoch": 0.4187933516555425, "grad_norm": 18.143381118774414, "learning_rate": 3.3577651392237394e-07, "logits/chosen": 0.07500011473894119, "logits/rejected": 0.24331322312355042, "logps/chosen": -381.325927734375, "logps/ref_chosen": -301.87799072265625, "logps/ref_rejected": -271.30010986328125, "logps/rejected": -394.4322814941406, "loss": 0.5642, "margin_dpo/margin_mean": 43.68426513671875, "margin_dpo/margin_std": 38.98915100097656, "step": 200 }, { "epoch": 0.4397330192383196, "grad_norm": 24.03816032409668, "learning_rate": 3.1929812363354764e-07, "logits/chosen": 0.11765547096729279, "logits/rejected": 0.18735218048095703, "logps/chosen": -365.60113525390625, "logps/ref_chosen": -271.31854248046875, "logps/ref_rejected": -255.88186645507812, "logps/rejected": -395.07745361328125, "loss": 0.5552, "margin_dpo/margin_mean": 44.91301345825195, "margin_dpo/margin_std": 47.19767379760742, "step": 210 }, { "epoch": 0.4606726868210967, "grad_norm": 22.125686645507812, "learning_rate": 3.024865749606803e-07, "logits/chosen": 0.07137051969766617, "logits/rejected": 0.18482640385627747, "logps/chosen": -391.6545104980469, "logps/ref_chosen": -293.5024108886719, "logps/ref_rejected": -263.943115234375, "logps/rejected": -409.07977294921875, "loss": 0.5519, "margin_dpo/margin_mean": 46.98453140258789, "margin_dpo/margin_std": 43.669212341308594, "step": 220 }, { "epoch": 0.48161235440387384, "grad_norm": 18.157793045043945, "learning_rate": 2.854226912810206e-07, "logits/chosen": 0.11615083366632462, "logits/rejected": 0.24443212151527405, "logps/chosen": -374.0119323730469, "logps/ref_chosen": -265.1332702636719, "logps/ref_rejected": -261.63671875, "logps/rejected": -415.58380126953125, "loss": 0.5424, "margin_dpo/margin_mean": 45.06837844848633, "margin_dpo/margin_std": 45.513736724853516, "step": 230 }, { "epoch": 0.5025520219866509, "grad_norm": 10.94947624206543, "learning_rate": 2.68188509100236e-07, "logits/chosen": 0.07515954971313477, "logits/rejected": 0.18595214188098907, "logps/chosen": -355.45574951171875, "logps/ref_chosen": -265.394287109375, "logps/ref_rejected": -263.04449462890625, "logps/rejected": -400.0995788574219, "loss": 0.5557, "margin_dpo/margin_mean": 46.99358367919922, "margin_dpo/margin_std": 43.50702667236328, "step": 240 }, { "epoch": 0.523491689569428, "grad_norm": 26.13774299621582, "learning_rate": 2.5086688365284857e-07, "logits/chosen": 0.057459376752376556, "logits/rejected": 0.23900671303272247, "logps/chosen": -364.48931884765625, "logps/ref_chosen": -276.13116455078125, "logps/ref_rejected": -233.1983642578125, "logps/rejected": -372.2765197753906, "loss": 0.5488, "margin_dpo/margin_mean": 50.719993591308594, "margin_dpo/margin_std": 46.66020965576172, "step": 250 }, { "epoch": 0.5444313571522053, "grad_norm": 14.217330932617188, "learning_rate": 2.3354109056654418e-07, "logits/chosen": 0.1635710746049881, "logits/rejected": 0.27944889664649963, "logps/chosen": -396.4654846191406, "logps/ref_chosen": -289.42364501953125, "logps/ref_rejected": -255.2952117919922, "logps/rejected": -411.4334411621094, "loss": 0.5539, "margin_dpo/margin_mean": 49.09638595581055, "margin_dpo/margin_std": 50.09044647216797, "step": 260 }, { "epoch": 0.5653710247349824, "grad_norm": 47.02643585205078, "learning_rate": 2.162944255053928e-07, "logits/chosen": 0.23213577270507812, "logits/rejected": 0.2836950123310089, "logps/chosen": -406.9697265625, "logps/ref_chosen": -295.31121826171875, "logps/ref_rejected": -283.03643798828125, "logps/rejected": -445.3810119628906, "loss": 0.537, "margin_dpo/margin_mean": 50.686119079589844, "margin_dpo/margin_std": 50.13835906982422, "step": 270 }, { "epoch": 0.5863106923177595, "grad_norm": 16.077573776245117, "learning_rate": 1.9920980371674297e-07, "logits/chosen": 0.04740264266729355, "logits/rejected": 0.20282933115959167, "logps/chosen": -392.91693115234375, "logps/ref_chosen": -277.9930725097656, "logps/ref_rejected": -258.11956787109375, "logps/rejected": -436.17108154296875, "loss": 0.53, "margin_dpo/margin_mean": 63.12766647338867, "margin_dpo/margin_std": 48.20869827270508, "step": 280 }, { "epoch": 0.6072503599005366, "grad_norm": 30.580135345458984, "learning_rate": 1.823693614070116e-07, "logits/chosen": 0.1650705635547638, "logits/rejected": 0.3033481240272522, "logps/chosen": -422.15130615234375, "logps/ref_chosen": -295.2901306152344, "logps/ref_rejected": -269.10552978515625, "logps/rejected": -446.4153747558594, "loss": 0.5597, "margin_dpo/margin_mean": 50.44862747192383, "margin_dpo/margin_std": 48.35343933105469, "step": 290 }, { "epoch": 0.6281900274833137, "grad_norm": 46.89568328857422, "learning_rate": 1.6585406086279846e-07, "logits/chosen": 0.2200140655040741, "logits/rejected": 0.34437426924705505, "logps/chosen": -378.20428466796875, "logps/ref_chosen": -261.69488525390625, "logps/ref_rejected": -253.68557739257812, "logps/rejected": -420.6162109375, "loss": 0.5471, "margin_dpo/margin_mean": 50.421165466308594, "margin_dpo/margin_std": 47.259437561035156, "step": 300 }, { "epoch": 0.6491296950660909, "grad_norm": 16.970218658447266, "learning_rate": 1.4974330121575046e-07, "logits/chosen": 0.2641686797142029, "logits/rejected": 0.383655846118927, "logps/chosen": -414.44512939453125, "logps/ref_chosen": -285.978515625, "logps/ref_rejected": -262.2162170410156, "logps/rejected": -446.66815185546875, "loss": 0.5278, "margin_dpo/margin_mean": 55.9853515625, "margin_dpo/margin_std": 45.96001434326172, "step": 310 }, { "epoch": 0.670069362648868, "grad_norm": 16.45661735534668, "learning_rate": 1.341145367224657e-07, "logits/chosen": 0.14594177901744843, "logits/rejected": 0.3130393326282501, "logps/chosen": -418.21820068359375, "logps/ref_chosen": -297.89361572265625, "logps/ref_rejected": -261.50360107421875, "logps/rejected": -444.85150146484375, "loss": 0.5262, "margin_dpo/margin_mean": 63.023284912109375, "margin_dpo/margin_std": 49.52865219116211, "step": 320 }, { "epoch": 0.6910090302316451, "grad_norm": 52.76103210449219, "learning_rate": 1.1904290439459971e-07, "logits/chosen": 0.14333733916282654, "logits/rejected": 0.2348686158657074, "logps/chosen": -395.7355651855469, "logps/ref_chosen": -282.3043518066406, "logps/ref_rejected": -272.1080322265625, "logps/rejected": -451.24932861328125, "loss": 0.5455, "margin_dpo/margin_mean": 65.71006774902344, "margin_dpo/margin_std": 54.01335525512695, "step": 330 }, { "epoch": 0.7119486978144222, "grad_norm": 26.729217529296875, "learning_rate": 1.0460086276938143e-07, "logits/chosen": 0.1563154011964798, "logits/rejected": 0.2881699502468109, "logps/chosen": -378.7862854003906, "logps/ref_chosen": -264.32440185546875, "logps/ref_rejected": -231.1798095703125, "logps/rejected": -404.14886474609375, "loss": 0.5356, "margin_dpo/margin_mean": 58.50718307495117, "margin_dpo/margin_std": 58.1771125793457, "step": 340 }, { "epoch": 0.7328883653971994, "grad_norm": 31.903892517089844, "learning_rate": 9.085784355719256e-08, "logits/chosen": 0.10641894489526749, "logits/rejected": 0.2847670614719391, "logps/chosen": -425.1806640625, "logps/ref_chosen": -309.6944274902344, "logps/ref_rejected": -275.2430114746094, "logps/rejected": -448.0032653808594, "loss": 0.5388, "margin_dpo/margin_mean": 57.274009704589844, "margin_dpo/margin_std": 47.37055587768555, "step": 350 }, { "epoch": 0.7538280329799765, "grad_norm": 29.521696090698242, "learning_rate": 7.787991784094999e-08, "logits/chosen": 0.15716208517551422, "logits/rejected": 0.26787060499191284, "logps/chosen": -378.3572998046875, "logps/ref_chosen": -256.606201171875, "logps/ref_rejected": -244.90805053710938, "logps/rejected": -420.41864013671875, "loss": 0.5474, "margin_dpo/margin_mean": 53.7595100402832, "margin_dpo/margin_std": 58.170265197753906, "step": 360 }, { "epoch": 0.7747677005627536, "grad_norm": 19.0128231048584, "learning_rate": 6.572947843207677e-08, "logits/chosen": 0.16855831444263458, "logits/rejected": 0.3704710900783539, "logps/chosen": -413.5511169433594, "logps/ref_chosen": -297.1283264160156, "logps/ref_rejected": -262.2713317871094, "logps/rejected": -444.85626220703125, "loss": 0.5201, "margin_dpo/margin_mean": 66.16206359863281, "margin_dpo/margin_std": 55.746543884277344, "step": 370 }, { "epoch": 0.7957073681455307, "grad_norm": 23.740262985229492, "learning_rate": 5.446493991016879e-08, "logits/chosen": 0.18822483718395233, "logits/rejected": 0.29204973578453064, "logps/chosen": -412.62615966796875, "logps/ref_chosen": -291.81365966796875, "logps/ref_rejected": -264.4927673339844, "logps/rejected": -443.88836669921875, "loss": 0.5438, "margin_dpo/margin_mean": 58.58308029174805, "margin_dpo/margin_std": 47.825599670410156, "step": 380 }, { "epoch": 0.8166470357283078, "grad_norm": 14.888653755187988, "learning_rate": 4.4140457788451434e-08, "logits/chosen": 0.1765083372592926, "logits/rejected": 0.3199649751186371, "logps/chosen": -421.967529296875, "logps/ref_chosen": -289.7127990722656, "logps/ref_rejected": -267.163330078125, "logps/rejected": -455.80517578125, "loss": 0.5362, "margin_dpo/margin_mean": 56.387115478515625, "margin_dpo/margin_std": 51.367897033691406, "step": 390 }, { "epoch": 0.837586703311085, "grad_norm": 27.52701759338379, "learning_rate": 3.4805668155167556e-08, "logits/chosen": 0.23016759753227234, "logits/rejected": 0.414517879486084, "logps/chosen": -432.7222595214844, "logps/ref_chosen": -298.2846984863281, "logps/ref_rejected": -244.9308624267578, "logps/rejected": -438.0635681152344, "loss": 0.5389, "margin_dpo/margin_mean": 58.69511795043945, "margin_dpo/margin_std": 53.363685607910156, "step": 400 }, { "epoch": 0.8585263708938621, "grad_norm": 31.697511672973633, "learning_rate": 2.650544904260024e-08, "logits/chosen": 0.18534260988235474, "logits/rejected": 0.3869909346103668, "logps/chosen": -443.33575439453125, "logps/ref_chosen": -308.6795959472656, "logps/ref_rejected": -275.4808349609375, "logps/rejected": -468.99200439453125, "loss": 0.5198, "margin_dpo/margin_mean": 58.855003356933594, "margin_dpo/margin_std": 53.85388946533203, "step": 410 }, { "epoch": 0.8794660384766392, "grad_norm": 21.451770782470703, "learning_rate": 1.9279704670975726e-08, "logits/chosen": 0.10304410755634308, "logits/rejected": 0.2722802758216858, "logps/chosen": -427.65203857421875, "logps/ref_chosen": -291.2632141113281, "logps/ref_rejected": -276.36553955078125, "logps/rejected": -470.96038818359375, "loss": 0.5276, "margin_dpo/margin_mean": 58.20595169067383, "margin_dpo/margin_std": 57.847412109375, "step": 420 }, { "epoch": 0.9004057060594163, "grad_norm": 27.079269409179688, "learning_rate": 1.3163173604516409e-08, "logits/chosen": 0.2293200045824051, "logits/rejected": 0.399575412273407, "logps/chosen": -426.8741760253906, "logps/ref_chosen": -296.31085205078125, "logps/ref_rejected": -259.3516845703125, "logps/rejected": -452.6231384277344, "loss": 0.5185, "margin_dpo/margin_mean": 62.70819091796875, "margin_dpo/margin_std": 54.348899841308594, "step": 430 }, { "epoch": 0.9213453736421934, "grad_norm": 26.594852447509766, "learning_rate": 8.18526174195655e-09, "logits/chosen": 0.277737557888031, "logits/rejected": 0.3488383889198303, "logps/chosen": -430.6412658691406, "logps/ref_chosen": -290.34857177734375, "logps/ref_rejected": -267.0846862792969, "logps/rejected": -453.26580810546875, "loss": 0.5468, "margin_dpo/margin_mean": 45.888404846191406, "margin_dpo/margin_std": 55.84312057495117, "step": 440 }, { "epoch": 0.9422850412249706, "grad_norm": 17.050395965576172, "learning_rate": 4.369900944435734e-09, "logits/chosen": 0.14949986338615417, "logits/rejected": 0.3868565857410431, "logps/chosen": -442.13580322265625, "logps/ref_chosen": -316.7445068359375, "logps/ref_rejected": -255.24594116210938, "logps/rejected": -452.7721252441406, "loss": 0.5221, "margin_dpo/margin_mean": 72.13482666015625, "margin_dpo/margin_std": 50.0427360534668, "step": 450 }, { "epoch": 0.9632247088077477, "grad_norm": 24.54353141784668, "learning_rate": 1.73543398043291e-09, "logits/chosen": 0.1670341193675995, "logits/rejected": 0.3386840224266052, "logps/chosen": -449.431884765625, "logps/ref_chosen": -316.5346984863281, "logps/ref_rejected": -293.4228820800781, "logps/rejected": -487.91412353515625, "loss": 0.5389, "margin_dpo/margin_mean": 61.59406280517578, "margin_dpo/margin_std": 53.5306282043457, "step": 460 }, { "epoch": 0.9841643763905248, "grad_norm": 23.415748596191406, "learning_rate": 2.94526340880813e-10, "logits/chosen": 0.23086392879486084, "logits/rejected": 0.39752504229545593, "logps/chosen": -410.21978759765625, "logps/ref_chosen": -284.00579833984375, "logps/ref_rejected": -252.9314727783203, "logps/rejected": -446.0233459472656, "loss": 0.5346, "margin_dpo/margin_mean": 66.87791442871094, "margin_dpo/margin_std": 52.97399139404297, "step": 470 }, { "epoch": 0.9988221436984688, "step": 477, "total_flos": 0.0, "train_loss": 0.5733983517942689, "train_runtime": 5703.1485, "train_samples_per_second": 10.72, "train_steps_per_second": 0.084 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }