{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989528795811519, "eval_steps": 200, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020942408376963353, "grad_norm": 7.1476731300354, "learning_rate": 0.0, "logits/chosen": -0.6103914976119995, "logits/rejected": -0.6099507808685303, "logps/chosen": -318.31317138671875, "logps/ref_chosen": -318.28057861328125, "logps/ref_rejected": -203.32687377929688, "logps/rejected": -203.17298889160156, "loss": 1.3866, "margin_dpo/margin_mean": -0.18641114234924316, "margin_dpo/margin_std": 0.6639037132263184, "step": 1 }, { "epoch": 0.010471204188481676, "grad_norm": 7.389297008514404, "learning_rate": 4.166666666666666e-08, "logits/chosen": -0.6768993139266968, "logits/rejected": -0.6286869049072266, "logps/chosen": -284.77685546875, "logps/ref_chosen": -284.8314514160156, "logps/ref_rejected": -286.5647888183594, "logps/rejected": -286.6424865722656, "loss": 1.3862, "margin_dpo/margin_mean": 0.1322835385799408, "margin_dpo/margin_std": 0.8204990029335022, "step": 5 }, { "epoch": 0.020942408376963352, "grad_norm": 6.636824131011963, "learning_rate": 9.375e-08, "logits/chosen": -0.6935982704162598, "logits/rejected": -0.6999162435531616, "logps/chosen": -276.5225524902344, "logps/ref_chosen": -276.4944152832031, "logps/ref_rejected": -242.5192108154297, "logps/rejected": -242.5146484375, "loss": 1.3861, "margin_dpo/margin_mean": -0.03269507735967636, "margin_dpo/margin_std": 0.6867764592170715, "step": 10 }, { "epoch": 0.031413612565445025, "grad_norm": 7.5604400634765625, "learning_rate": 1.4583333333333335e-07, "logits/chosen": -0.634182333946228, "logits/rejected": -0.6508210897445679, "logps/chosen": -307.7691345214844, "logps/ref_chosen": -307.81634521484375, "logps/ref_rejected": -287.94805908203125, "logps/rejected": -288.03228759765625, "loss": 1.386, "margin_dpo/margin_mean": 0.13143400847911835, "margin_dpo/margin_std": 0.7205628156661987, "step": 15 }, { "epoch": 0.041884816753926704, "grad_norm": 7.414900779724121, "learning_rate": 1.9791666666666664e-07, "logits/chosen": -0.6475816965103149, "logits/rejected": -0.6553324460983276, "logps/chosen": -283.70404052734375, "logps/ref_chosen": -283.7494201660156, "logps/ref_rejected": -234.01278686523438, "logps/rejected": -234.0884246826172, "loss": 1.3854, "margin_dpo/margin_mean": 0.1209900826215744, "margin_dpo/margin_std": 0.5572749376296997, "step": 20 }, { "epoch": 0.05235602094240838, "grad_norm": 6.6036601066589355, "learning_rate": 2.5e-07, "logits/chosen": -0.646403431892395, "logits/rejected": -0.6549252271652222, "logps/chosen": -234.9982147216797, "logps/ref_chosen": -235.056884765625, "logps/ref_rejected": -214.2563934326172, "logps/rejected": -214.32626342773438, "loss": 1.385, "margin_dpo/margin_mean": 0.12857410311698914, "margin_dpo/margin_std": 0.6414791941642761, "step": 25 }, { "epoch": 0.06282722513089005, "grad_norm": 7.229597568511963, "learning_rate": 3.020833333333333e-07, "logits/chosen": -0.6887374520301819, "logits/rejected": -0.6938886642456055, "logps/chosen": -322.88897705078125, "logps/ref_chosen": -323.2079772949219, "logps/ref_rejected": -253.31405639648438, "logps/rejected": -253.1912841796875, "loss": 1.3844, "margin_dpo/margin_mean": 0.19620926678180695, "margin_dpo/margin_std": 0.8317287564277649, "step": 30 }, { "epoch": 0.07329842931937172, "grad_norm": 6.7626142501831055, "learning_rate": 3.541666666666667e-07, "logits/chosen": -0.6849234700202942, "logits/rejected": -0.6820663809776306, "logps/chosen": -300.1553039550781, "logps/ref_chosen": -300.67559814453125, "logps/ref_rejected": -275.8673400878906, "logps/rejected": -275.5356750488281, "loss": 1.3823, "margin_dpo/margin_mean": 0.18862803280353546, "margin_dpo/margin_std": 0.9617260098457336, "step": 35 }, { "epoch": 0.08376963350785341, "grad_norm": 6.857061862945557, "learning_rate": 4.0625e-07, "logits/chosen": -0.5887177586555481, "logits/rejected": -0.6046378016471863, "logps/chosen": -251.55557250976562, "logps/ref_chosen": -252.3434600830078, "logps/ref_rejected": -278.35101318359375, "logps/rejected": -278.2633361816406, "loss": 1.3786, "margin_dpo/margin_mean": 0.7002249956130981, "margin_dpo/margin_std": 1.3109245300292969, "step": 40 }, { "epoch": 0.09424083769633508, "grad_norm": 7.225438594818115, "learning_rate": 4.5833333333333327e-07, "logits/chosen": -0.7441970705986023, "logits/rejected": -0.7299541234970093, "logps/chosen": -311.53662109375, "logps/ref_chosen": -312.97418212890625, "logps/ref_rejected": -304.2184753417969, "logps/rejected": -303.91888427734375, "loss": 1.3746, "margin_dpo/margin_mean": 1.1379705667495728, "margin_dpo/margin_std": 2.3105995655059814, "step": 45 }, { "epoch": 0.10471204188481675, "grad_norm": 6.961248397827148, "learning_rate": 4.999932966293553e-07, "logits/chosen": -0.6633109450340271, "logits/rejected": -0.6941882371902466, "logps/chosen": -274.2547302246094, "logps/ref_chosen": -276.328369140625, "logps/ref_rejected": -249.0668182373047, "logps/rejected": -248.53305053710938, "loss": 1.373, "margin_dpo/margin_mean": 1.5398980379104614, "margin_dpo/margin_std": 2.8042705059051514, "step": 50 }, { "epoch": 0.11518324607329843, "grad_norm": 6.207220554351807, "learning_rate": 4.997587164001815e-07, "logits/chosen": -0.6882608532905579, "logits/rejected": -0.6976534128189087, "logps/chosen": -306.1382751464844, "logps/ref_chosen": -308.47393798828125, "logps/ref_rejected": -298.5058288574219, "logps/rejected": -298.40655517578125, "loss": 1.3643, "margin_dpo/margin_mean": 2.236351728439331, "margin_dpo/margin_std": 4.03792142868042, "step": 55 }, { "epoch": 0.1256544502617801, "grad_norm": 7.07670783996582, "learning_rate": 4.991893270335525e-07, "logits/chosen": -0.6674671173095703, "logits/rejected": -0.6547614336013794, "logps/chosen": -309.98895263671875, "logps/ref_chosen": -312.65618896484375, "logps/ref_rejected": -274.500244140625, "logps/rejected": -274.29986572265625, "loss": 1.3582, "margin_dpo/margin_mean": 2.4668667316436768, "margin_dpo/margin_std": 6.715214729309082, "step": 60 }, { "epoch": 0.13612565445026178, "grad_norm": 6.950833320617676, "learning_rate": 4.982858918131906e-07, "logits/chosen": -0.6531001925468445, "logits/rejected": -0.6600942611694336, "logps/chosen": -329.8468017578125, "logps/ref_chosen": -334.0863952636719, "logps/ref_rejected": -310.85491943359375, "logps/rejected": -310.4468688964844, "loss": 1.3531, "margin_dpo/margin_mean": 3.8315443992614746, "margin_dpo/margin_std": 8.632562637329102, "step": 65 }, { "epoch": 0.14659685863874344, "grad_norm": 7.140095233917236, "learning_rate": 4.970496218214204e-07, "logits/chosen": -0.7720015645027161, "logits/rejected": -0.7810764908790588, "logps/chosen": -281.643798828125, "logps/ref_chosen": -286.09478759765625, "logps/ref_rejected": -269.44683837890625, "logps/rejected": -269.636962890625, "loss": 1.3407, "margin_dpo/margin_mean": 4.641018867492676, "margin_dpo/margin_std": 9.235776901245117, "step": 70 }, { "epoch": 0.15706806282722513, "grad_norm": 7.371065616607666, "learning_rate": 4.954821743156767e-07, "logits/chosen": -0.70263671875, "logits/rejected": -0.7102752923965454, "logps/chosen": -324.5908508300781, "logps/ref_chosen": -329.2369384765625, "logps/ref_rejected": -309.39324951171875, "logps/rejected": -308.8836669921875, "loss": 1.3387, "margin_dpo/margin_mean": 4.13649845123291, "margin_dpo/margin_std": 11.341253280639648, "step": 75 }, { "epoch": 0.16753926701570682, "grad_norm": 7.241215229034424, "learning_rate": 4.935856505068998e-07, "logits/chosen": -0.7236490845680237, "logits/rejected": -0.722406268119812, "logps/chosen": -255.44656372070312, "logps/ref_chosen": -257.80487060546875, "logps/ref_rejected": -247.0098114013672, "logps/rejected": -249.47775268554688, "loss": 1.3201, "margin_dpo/margin_mean": 4.8262553215026855, "margin_dpo/margin_std": 11.936810493469238, "step": 80 }, { "epoch": 0.17801047120418848, "grad_norm": 7.340602874755859, "learning_rate": 4.913625927427995e-07, "logits/chosen": -0.7124683260917664, "logits/rejected": -0.735012412071228, "logps/chosen": -273.90252685546875, "logps/ref_chosen": -277.0785827636719, "logps/ref_rejected": -261.55169677734375, "logps/rejected": -270.66497802734375, "loss": 1.3219, "margin_dpo/margin_mean": 12.289302825927734, "margin_dpo/margin_std": 17.269710540771484, "step": 85 }, { "epoch": 0.18848167539267016, "grad_norm": 7.4462690353393555, "learning_rate": 4.8881598109976e-07, "logits/chosen": -0.7061265707015991, "logits/rejected": -0.7180206775665283, "logps/chosen": -301.48211669921875, "logps/ref_chosen": -300.3891296386719, "logps/ref_rejected": -289.59014892578125, "logps/rejected": -301.53326416015625, "loss": 1.3065, "margin_dpo/margin_mean": 10.850162506103516, "margin_dpo/margin_std": 16.798553466796875, "step": 90 }, { "epoch": 0.19895287958115182, "grad_norm": 8.369804382324219, "learning_rate": 4.859492293879573e-07, "logits/chosen": -0.687911331653595, "logits/rejected": -0.7407578825950623, "logps/chosen": -245.9025421142578, "logps/ref_chosen": -243.02804565429688, "logps/ref_rejected": -219.5611572265625, "logps/rejected": -229.8905792236328, "loss": 1.3033, "margin_dpo/margin_mean": 7.454855918884277, "margin_dpo/margin_std": 19.36737632751465, "step": 95 }, { "epoch": 0.2094240837696335, "grad_norm": 8.873656272888184, "learning_rate": 4.827661805750437e-07, "logits/chosen": -0.7412772178649902, "logits/rejected": -0.7370281219482422, "logps/chosen": -302.4429626464844, "logps/ref_chosen": -297.3129577636719, "logps/ref_rejected": -304.67572021484375, "logps/rejected": -320.0408935546875, "loss": 1.2866, "margin_dpo/margin_mean": 10.235170364379883, "margin_dpo/margin_std": 25.688796997070312, "step": 100 }, { "epoch": 0.2198952879581152, "grad_norm": 8.666266441345215, "learning_rate": 4.792711016345321e-07, "logits/chosen": -0.7506468892097473, "logits/rejected": -0.7723590135574341, "logps/chosen": -285.6722412109375, "logps/ref_chosen": -279.5523376464844, "logps/ref_rejected": -258.6304016113281, "logps/rejected": -277.24481201171875, "loss": 1.2624, "margin_dpo/margin_mean": 12.494502067565918, "margin_dpo/margin_std": 27.29207992553711, "step": 105 }, { "epoch": 0.23036649214659685, "grad_norm": 8.676697731018066, "learning_rate": 4.75468677825789e-07, "logits/chosen": -0.7946863770484924, "logits/rejected": -0.7978917360305786, "logps/chosen": -284.75701904296875, "logps/ref_chosen": -278.9017639160156, "logps/ref_rejected": -230.1315460205078, "logps/rejected": -254.346923828125, "loss": 1.2393, "margin_dpo/margin_mean": 18.360124588012695, "margin_dpo/margin_std": 28.798229217529297, "step": 110 }, { "epoch": 0.24083769633507854, "grad_norm": 10.881580352783203, "learning_rate": 4.7136400641330245e-07, "logits/chosen": -0.7917270660400391, "logits/rejected": -0.7990630865097046, "logps/chosen": -277.28607177734375, "logps/ref_chosen": -262.6755676269531, "logps/ref_rejected": -234.7182159423828, "logps/rejected": -267.9288635253906, "loss": 1.259, "margin_dpo/margin_mean": 18.60015106201172, "margin_dpo/margin_std": 37.60885238647461, "step": 115 }, { "epoch": 0.2513089005235602, "grad_norm": 10.508712768554688, "learning_rate": 4.669625898336438e-07, "logits/chosen": -0.8892138600349426, "logits/rejected": -0.8844587206840515, "logps/chosen": -294.91656494140625, "logps/ref_chosen": -269.8807373046875, "logps/ref_rejected": -268.51702880859375, "logps/rejected": -313.973388671875, "loss": 1.241, "margin_dpo/margin_mean": 20.420486450195312, "margin_dpo/margin_std": 38.39332580566406, "step": 120 }, { "epoch": 0.2617801047120419, "grad_norm": 13.029239654541016, "learning_rate": 4.6227032831928483e-07, "logits/chosen": -0.8507975339889526, "logits/rejected": -0.8206876516342163, "logps/chosen": -322.9510192871094, "logps/ref_chosen": -293.70062255859375, "logps/ref_rejected": -286.66217041015625, "logps/rejected": -330.2171630859375, "loss": 1.2444, "margin_dpo/margin_mean": 14.304577827453613, "margin_dpo/margin_std": 44.559913635253906, "step": 125 }, { "epoch": 0.27225130890052357, "grad_norm": 12.968539237976074, "learning_rate": 4.5729351198915705e-07, "logits/chosen": -0.8192211389541626, "logits/rejected": -0.8181384205818176, "logps/chosen": -308.5555725097656, "logps/ref_chosen": -284.30474853515625, "logps/ref_rejected": -289.4891662597656, "logps/rejected": -341.5650329589844, "loss": 1.1922, "margin_dpo/margin_mean": 27.825063705444336, "margin_dpo/margin_std": 42.415462493896484, "step": 130 }, { "epoch": 0.28272251308900526, "grad_norm": 15.247875213623047, "learning_rate": 4.520388124165564e-07, "logits/chosen": -0.8112742304801941, "logits/rejected": -0.790899932384491, "logps/chosen": -308.3522033691406, "logps/ref_chosen": -279.0638732910156, "logps/ref_rejected": -271.4653015136719, "logps/rejected": -325.9285888671875, "loss": 1.2114, "margin_dpo/margin_mean": 25.174943923950195, "margin_dpo/margin_std": 43.65618896484375, "step": 135 }, { "epoch": 0.2931937172774869, "grad_norm": 15.881244659423828, "learning_rate": 4.4651327368569684e-07, "logits/chosen": -0.8326481580734253, "logits/rejected": -0.8528935313224792, "logps/chosen": -360.31134033203125, "logps/ref_chosen": -319.4598693847656, "logps/ref_rejected": -272.49420166015625, "logps/rejected": -338.5896911621094, "loss": 1.2095, "margin_dpo/margin_mean": 25.244089126586914, "margin_dpo/margin_std": 55.747283935546875, "step": 140 }, { "epoch": 0.3036649214659686, "grad_norm": 13.514267921447754, "learning_rate": 4.4072430294890166e-07, "logits/chosen": -0.8288572430610657, "logits/rejected": -0.8065798878669739, "logps/chosen": -310.3006896972656, "logps/ref_chosen": -282.9032287597656, "logps/ref_rejected": -256.1629638671875, "logps/rejected": -322.1570129394531, "loss": 1.1832, "margin_dpo/margin_mean": 38.596588134765625, "margin_dpo/margin_std": 52.25310134887695, "step": 145 }, { "epoch": 0.31413612565445026, "grad_norm": 15.282773971557617, "learning_rate": 4.346796604970912e-07, "logits/chosen": -0.8817797899246216, "logits/rejected": -0.8567675352096558, "logps/chosen": -337.6163635253906, "logps/ref_chosen": -294.38629150390625, "logps/ref_rejected": -283.258544921875, "logps/rejected": -359.8990783691406, "loss": 1.1792, "margin_dpo/margin_mean": 33.410484313964844, "margin_dpo/margin_std": 48.665000915527344, "step": 150 }, { "epoch": 0.32460732984293195, "grad_norm": 21.630462646484375, "learning_rate": 4.2838744935687716e-07, "logits/chosen": -0.8599667549133301, "logits/rejected": -0.8464357256889343, "logps/chosen": -336.33258056640625, "logps/ref_chosen": -282.7860107421875, "logps/ref_rejected": -277.7061462402344, "logps/rejected": -382.0819396972656, "loss": 1.1005, "margin_dpo/margin_mean": 50.82927703857422, "margin_dpo/margin_std": 65.71281433105469, "step": 155 }, { "epoch": 0.33507853403141363, "grad_norm": 15.092132568359375, "learning_rate": 4.218561044282098e-07, "logits/chosen": -0.8716852068901062, "logits/rejected": -0.8610923886299133, "logps/chosen": -297.11187744140625, "logps/ref_chosen": -245.03564453125, "logps/ref_rejected": -193.0067596435547, "logps/rejected": -286.109130859375, "loss": 1.1212, "margin_dpo/margin_mean": 41.026119232177734, "margin_dpo/margin_std": 55.911903381347656, "step": 160 }, { "epoch": 0.34554973821989526, "grad_norm": 25.69545555114746, "learning_rate": 4.1509438117713863e-07, "logits/chosen": -0.8722988963127136, "logits/rejected": -0.8764654397964478, "logps/chosen": -323.02362060546875, "logps/ref_chosen": -271.78961181640625, "logps/ref_rejected": -231.51443481445312, "logps/rejected": -323.4990539550781, "loss": 1.1301, "margin_dpo/margin_mean": 40.750648498535156, "margin_dpo/margin_std": 60.6348991394043, "step": 165 }, { "epoch": 0.35602094240837695, "grad_norm": 22.663711547851562, "learning_rate": 4.081113438988443e-07, "logits/chosen": -0.8557019233703613, "logits/rejected": -0.8207084536552429, "logps/chosen": -391.30767822265625, "logps/ref_chosen": -335.8885803222656, "logps/ref_rejected": -260.59735107421875, "logps/rejected": -372.26214599609375, "loss": 1.1057, "margin_dpo/margin_mean": 56.2457160949707, "margin_dpo/margin_std": 64.02092742919922, "step": 170 }, { "epoch": 0.36649214659685864, "grad_norm": 24.62014389038086, "learning_rate": 4.00916353566676e-07, "logits/chosen": -0.7955960035324097, "logits/rejected": -0.8043614625930786, "logps/chosen": -392.2655029296875, "logps/ref_chosen": -332.5082092285156, "logps/ref_rejected": -267.3958740234375, "logps/rejected": -373.43511962890625, "loss": 1.1212, "margin_dpo/margin_mean": 46.28200149536133, "margin_dpo/margin_std": 70.63272857666016, "step": 175 }, { "epoch": 0.3769633507853403, "grad_norm": 23.055593490600586, "learning_rate": 3.935190552834828e-07, "logits/chosen": -0.7882012724876404, "logits/rejected": -0.773046612739563, "logps/chosen": -312.5518493652344, "logps/ref_chosen": -251.58407592773438, "logps/ref_rejected": -225.3298797607422, "logps/rejected": -328.76324462890625, "loss": 1.1412, "margin_dpo/margin_mean": 42.46554946899414, "margin_dpo/margin_std": 68.0986328125, "step": 180 }, { "epoch": 0.387434554973822, "grad_norm": 23.7900333404541, "learning_rate": 3.859293653520604e-07, "logits/chosen": -0.8637909889221191, "logits/rejected": -0.856473445892334, "logps/chosen": -346.76739501953125, "logps/ref_chosen": -280.81097412109375, "logps/ref_rejected": -264.32025146484375, "logps/rejected": -396.57025146484375, "loss": 1.0657, "margin_dpo/margin_mean": 66.29359436035156, "margin_dpo/margin_std": 73.91728973388672, "step": 185 }, { "epoch": 0.39790575916230364, "grad_norm": 22.772708892822266, "learning_rate": 3.781574579820464e-07, "logits/chosen": -0.8728398084640503, "logits/rejected": -0.8765541315078735, "logps/chosen": -379.94354248046875, "logps/ref_chosen": -300.13128662109375, "logps/ref_rejected": -263.724609375, "logps/rejected": -397.33709716796875, "loss": 1.1047, "margin_dpo/margin_mean": 53.800254821777344, "margin_dpo/margin_std": 73.82844543457031, "step": 190 }, { "epoch": 0.4083769633507853, "grad_norm": 25.31096649169922, "learning_rate": 3.7021375165108377e-07, "logits/chosen": -0.8635498881340027, "logits/rejected": -0.8397191762924194, "logps/chosen": -384.876708984375, "logps/ref_chosen": -287.50128173828125, "logps/ref_rejected": -278.3941650390625, "logps/rejected": -409.2007141113281, "loss": 1.139, "margin_dpo/margin_mean": 33.431087493896484, "margin_dpo/margin_std": 66.89299011230469, "step": 195 }, { "epoch": 0.418848167539267, "grad_norm": 29.776451110839844, "learning_rate": 3.621088951385353e-07, "logits/chosen": -0.8681343793869019, "logits/rejected": -0.8792493939399719, "logps/chosen": -378.0604553222656, "logps/ref_chosen": -289.7194519042969, "logps/ref_rejected": -264.33685302734375, "logps/rejected": -412.09326171875, "loss": 1.0912, "margin_dpo/margin_mean": 59.41547393798828, "margin_dpo/margin_std": 78.97772216796875, "step": 200 }, { "epoch": 0.418848167539267, "eval_logits/chosen": -0.8521618843078613, "eval_logits/rejected": -0.8391309380531311, "eval_logps/chosen": -374.4195251464844, "eval_logps/ref_chosen": -280.7076110839844, "eval_logps/ref_rejected": -280.1632385253906, "eval_logps/rejected": -434.01593017578125, "eval_loss": 0.5626364350318909, "eval_margin_dpo/margin_mean": 60.140811920166016, "eval_margin_dpo/margin_std": 83.66898345947266, "eval_runtime": 50.9457, "eval_samples_per_second": 39.258, "eval_steps_per_second": 0.628, "step": 200 }, { "epoch": 0.4293193717277487, "grad_norm": 20.72422981262207, "learning_rate": 3.5385375325047163e-07, "logits/chosen": -0.8376694917678833, "logits/rejected": -0.832554817199707, "logps/chosen": -339.31396484375, "logps/ref_chosen": -241.3316650390625, "logps/ref_rejected": -217.2843017578125, "logps/rejected": -353.23785400390625, "loss": 1.1129, "margin_dpo/margin_mean": 37.97121810913086, "margin_dpo/margin_std": 76.61421203613281, "step": 205 }, { "epoch": 0.4397905759162304, "grad_norm": 23.13551139831543, "learning_rate": 3.454593922550693e-07, "logits/chosen": -0.8016048669815063, "logits/rejected": -0.7972103953361511, "logps/chosen": -345.01312255859375, "logps/ref_chosen": -269.7131652832031, "logps/ref_rejected": -256.3033447265625, "logps/rejected": -377.6368408203125, "loss": 1.1331, "margin_dpo/margin_mean": 46.03350830078125, "margin_dpo/margin_std": 87.02532958984375, "step": 210 }, { "epoch": 0.450261780104712, "grad_norm": 28.872802734375, "learning_rate": 3.3693706504794243e-07, "logits/chosen": -0.8741207122802734, "logits/rejected": -0.8464077115058899, "logps/chosen": -361.7008361816406, "logps/ref_chosen": -275.9259033203125, "logps/ref_rejected": -259.9688415527344, "logps/rejected": -391.6307678222656, "loss": 1.1091, "margin_dpo/margin_mean": 45.88698959350586, "margin_dpo/margin_std": 67.70562744140625, "step": 215 }, { "epoch": 0.4607329842931937, "grad_norm": 30.65153694152832, "learning_rate": 3.2829819606729477e-07, "logits/chosen": -0.8631395101547241, "logits/rejected": -0.8379348516464233, "logps/chosen": -368.6546936035156, "logps/ref_chosen": -272.54815673828125, "logps/ref_rejected": -296.7209167480469, "logps/rejected": -459.22528076171875, "loss": 1.0561, "margin_dpo/margin_mean": 66.39784240722656, "margin_dpo/margin_std": 84.69873809814453, "step": 220 }, { "epoch": 0.4712041884816754, "grad_norm": 26.21302604675293, "learning_rate": 3.1955436597911315e-07, "logits/chosen": -0.8575876355171204, "logits/rejected": -0.8386489748954773, "logps/chosen": -449.95989990234375, "logps/ref_chosen": -353.4187927246094, "logps/ref_rejected": -275.82666015625, "logps/rejected": -434.48590087890625, "loss": 1.1035, "margin_dpo/margin_mean": 62.11814498901367, "margin_dpo/margin_std": 93.59648132324219, "step": 225 }, { "epoch": 0.4816753926701571, "grad_norm": 22.068635940551758, "learning_rate": 3.1071729615293424e-07, "logits/chosen": -0.8830578923225403, "logits/rejected": -0.8567687273025513, "logps/chosen": -436.341796875, "logps/ref_chosen": -346.2792663574219, "logps/ref_rejected": -333.0526123046875, "logps/rejected": -487.6953125, "loss": 1.0555, "margin_dpo/margin_mean": 64.58020782470703, "margin_dpo/margin_std": 82.20867919921875, "step": 230 }, { "epoch": 0.49214659685863876, "grad_norm": 22.247276306152344, "learning_rate": 3.017988329489923e-07, "logits/chosen": -0.8050928115844727, "logits/rejected": -0.8064786791801453, "logps/chosen": -377.37249755859375, "logps/ref_chosen": -293.1265869140625, "logps/ref_rejected": -292.41009521484375, "logps/rejected": -430.84149169921875, "loss": 1.0875, "margin_dpo/margin_mean": 54.1854362487793, "margin_dpo/margin_std": 79.34012603759766, "step": 235 }, { "epoch": 0.5026178010471204, "grad_norm": 28.167221069335938, "learning_rate": 2.9281093183781403e-07, "logits/chosen": -0.7880058288574219, "logits/rejected": -0.8285211324691772, "logps/chosen": -289.57073974609375, "logps/ref_chosen": -212.98837280273438, "logps/ref_rejected": -238.8592987060547, "logps/rejected": -380.24237060546875, "loss": 1.0831, "margin_dpo/margin_mean": 64.80075073242188, "margin_dpo/margin_std": 87.5298843383789, "step": 240 }, { "epoch": 0.5130890052356021, "grad_norm": 21.444618225097656, "learning_rate": 2.837656413735479e-07, "logits/chosen": -0.8537348508834839, "logits/rejected": -0.8319905400276184, "logps/chosen": -341.90264892578125, "logps/ref_chosen": -270.50933837890625, "logps/ref_rejected": -283.96795654296875, "logps/rejected": -410.18414306640625, "loss": 1.0671, "margin_dpo/margin_mean": 54.822853088378906, "margin_dpo/margin_std": 83.7713394165039, "step": 245 }, { "epoch": 0.5235602094240838, "grad_norm": 24.929975509643555, "learning_rate": 2.7467508704251135e-07, "logits/chosen": -0.872177243232727, "logits/rejected": -0.8667652010917664, "logps/chosen": -363.5415344238281, "logps/ref_chosen": -278.5409240722656, "logps/ref_rejected": -289.39508056640625, "logps/rejected": -423.7264709472656, "loss": 1.1319, "margin_dpo/margin_mean": 49.33078384399414, "margin_dpo/margin_std": 79.461669921875, "step": 250 }, { "epoch": 0.5340314136125655, "grad_norm": 24.554033279418945, "learning_rate": 2.655514550086086e-07, "logits/chosen": -0.8809840083122253, "logits/rejected": -0.8628867864608765, "logps/chosen": -393.5577392578125, "logps/ref_chosen": -294.0670166015625, "logps/ref_rejected": -255.25021362304688, "logps/rejected": -419.11676025390625, "loss": 1.091, "margin_dpo/margin_mean": 64.37581634521484, "margin_dpo/margin_std": 85.99962615966797, "step": 255 }, { "epoch": 0.5445026178010471, "grad_norm": 24.0455265045166, "learning_rate": 2.5640697577740815e-07, "logits/chosen": -0.8196985125541687, "logits/rejected": -0.8147414326667786, "logps/chosen": -386.85418701171875, "logps/ref_chosen": -276.14508056640625, "logps/ref_rejected": -225.4951171875, "logps/rejected": -392.1053466796875, "loss": 1.1138, "margin_dpo/margin_mean": 55.9011344909668, "margin_dpo/margin_std": 81.55867004394531, "step": 260 }, { "epoch": 0.5549738219895288, "grad_norm": 22.498727798461914, "learning_rate": 2.4725390780077905e-07, "logits/chosen": -0.7659963965415955, "logits/rejected": -0.7464796900749207, "logps/chosen": -369.7604675292969, "logps/ref_chosen": -257.68145751953125, "logps/ref_rejected": -239.24484252929688, "logps/rejected": -427.9410705566406, "loss": 1.0651, "margin_dpo/margin_mean": 76.6172103881836, "margin_dpo/margin_std": 92.36399841308594, "step": 265 }, { "epoch": 0.5654450261780105, "grad_norm": 19.695539474487305, "learning_rate": 2.381045210440644e-07, "logits/chosen": -0.8541289567947388, "logits/rejected": -0.8301209211349487, "logps/chosen": -380.1869812011719, "logps/ref_chosen": -258.50482177734375, "logps/ref_rejected": -252.1217498779297, "logps/rejected": -426.6290588378906, "loss": 1.0646, "margin_dpo/margin_mean": 52.8251838684082, "margin_dpo/margin_std": 84.49119567871094, "step": 270 }, { "epoch": 0.5759162303664922, "grad_norm": 17.279773712158203, "learning_rate": 2.2897108053782e-07, "logits/chosen": -0.8132762908935547, "logits/rejected": -0.815741240978241, "logps/chosen": -415.64617919921875, "logps/ref_chosen": -320.1709289550781, "logps/ref_rejected": -278.55364990234375, "logps/rejected": -443.93499755859375, "loss": 1.0293, "margin_dpo/margin_mean": 69.90602111816406, "margin_dpo/margin_std": 85.13655853271484, "step": 275 }, { "epoch": 0.5863874345549738, "grad_norm": 17.42207145690918, "learning_rate": 2.1986582993616925e-07, "logits/chosen": -0.7942547798156738, "logits/rejected": -0.8006545901298523, "logps/chosen": -399.39056396484375, "logps/ref_chosen": -303.01043701171875, "logps/ref_rejected": -304.8597412109375, "logps/rejected": -473.74407958984375, "loss": 1.0884, "margin_dpo/margin_mean": 72.5042724609375, "margin_dpo/margin_std": 83.59529876708984, "step": 280 }, { "epoch": 0.5968586387434555, "grad_norm": 29.439176559448242, "learning_rate": 2.1080097510381294e-07, "logits/chosen": -0.8379713296890259, "logits/rejected": -0.8423024415969849, "logps/chosen": -410.0982360839844, "logps/ref_chosen": -284.4503479003906, "logps/ref_rejected": -295.3260803222656, "logps/rejected": -488.0785217285156, "loss": 1.1025, "margin_dpo/margin_mean": 67.1044692993164, "margin_dpo/margin_std": 85.94883728027344, "step": 285 }, { "epoch": 0.6073298429319371, "grad_norm": 26.986276626586914, "learning_rate": 2.0178866775369774e-07, "logits/chosen": -0.8594253659248352, "logits/rejected": -0.842856764793396, "logps/chosen": -406.8668212890625, "logps/ref_chosen": -282.28424072265625, "logps/ref_rejected": -245.0934295654297, "logps/rejected": -418.887939453125, "loss": 1.1327, "margin_dpo/margin_mean": 49.21195602416992, "margin_dpo/margin_std": 83.4120101928711, "step": 290 }, { "epoch": 0.6178010471204188, "grad_norm": 17.56438446044922, "learning_rate": 1.928409891572757e-07, "logits/chosen": -0.826249897480011, "logits/rejected": -0.8039449453353882, "logps/chosen": -403.50164794921875, "logps/ref_chosen": -280.5242004394531, "logps/ref_rejected": -262.0945129394531, "logps/rejected": -443.8013610839844, "loss": 1.0541, "margin_dpo/margin_mean": 58.729454040527344, "margin_dpo/margin_std": 91.69632720947266, "step": 295 }, { "epoch": 0.6282722513089005, "grad_norm": 28.401439666748047, "learning_rate": 1.839699339491937e-07, "logits/chosen": -0.8613263964653015, "logits/rejected": -0.8485943675041199, "logps/chosen": -364.01141357421875, "logps/ref_chosen": -255.989990234375, "logps/ref_rejected": -263.67828369140625, "logps/rejected": -434.90423583984375, "loss": 1.0651, "margin_dpo/margin_mean": 63.20460891723633, "margin_dpo/margin_std": 98.46973419189453, "step": 300 }, { "epoch": 0.6387434554973822, "grad_norm": 21.466861724853516, "learning_rate": 1.7518739404812155e-07, "logits/chosen": -0.8165709376335144, "logits/rejected": -0.813851535320282, "logps/chosen": -397.8428649902344, "logps/ref_chosen": -275.48211669921875, "logps/ref_rejected": -239.78201293945312, "logps/rejected": -420.2521057128906, "loss": 1.0263, "margin_dpo/margin_mean": 58.10936737060547, "margin_dpo/margin_std": 86.70993041992188, "step": 305 }, { "epoch": 0.6492146596858639, "grad_norm": 23.69497299194336, "learning_rate": 1.6650514271527465e-07, "logits/chosen": -0.8850182294845581, "logits/rejected": -0.8541525602340698, "logps/chosen": -434.433349609375, "logps/ref_chosen": -298.96624755859375, "logps/ref_rejected": -281.476806640625, "logps/rejected": -478.77020263671875, "loss": 1.061, "margin_dpo/margin_mean": 61.826271057128906, "margin_dpo/margin_std": 83.03723907470703, "step": 310 }, { "epoch": 0.6596858638743456, "grad_norm": 23.33465576171875, "learning_rate": 1.5793481877199943e-07, "logits/chosen": -0.8339045643806458, "logits/rejected": -0.8051811456680298, "logps/chosen": -461.6744079589844, "logps/ref_chosen": -319.5059814453125, "logps/ref_rejected": -295.2557067871094, "logps/rejected": -506.0589294433594, "loss": 1.0603, "margin_dpo/margin_mean": 68.63478088378906, "margin_dpo/margin_std": 97.61984252929688, "step": 315 }, { "epoch": 0.6701570680628273, "grad_norm": 23.42766761779785, "learning_rate": 1.4948791099758052e-07, "logits/chosen": -0.8916772603988647, "logits/rejected": -0.8678488731384277, "logps/chosen": -403.6375732421875, "logps/ref_chosen": -274.22412109375, "logps/ref_rejected": -253.42117309570312, "logps/rejected": -431.6688537597656, "loss": 0.9986, "margin_dpo/margin_mean": 48.834320068359375, "margin_dpo/margin_std": 76.32218933105469, "step": 320 }, { "epoch": 0.680628272251309, "grad_norm": 31.045774459838867, "learning_rate": 1.4117574272818386e-07, "logits/chosen": -0.8033218383789062, "logits/rejected": -0.7930227518081665, "logps/chosen": -395.0705871582031, "logps/ref_chosen": -259.73590087890625, "logps/ref_rejected": -271.35369873046875, "logps/rejected": -458.1595153808594, "loss": 1.0877, "margin_dpo/margin_mean": 51.47113037109375, "margin_dpo/margin_std": 90.4473648071289, "step": 325 }, { "epoch": 0.6910994764397905, "grad_norm": 24.11140251159668, "learning_rate": 1.3300945667758012e-07, "logits/chosen": -0.8636866807937622, "logits/rejected": -0.8337670564651489, "logps/chosen": -421.36749267578125, "logps/ref_chosen": -288.8438720703125, "logps/ref_rejected": -285.99853515625, "logps/rejected": -491.48919677734375, "loss": 1.0256, "margin_dpo/margin_mean": 72.96711730957031, "margin_dpo/margin_std": 98.3145523071289, "step": 330 }, { "epoch": 0.7015706806282722, "grad_norm": 30.136749267578125, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.8628665208816528, "logits/rejected": -0.8348671197891235, "logps/chosen": -424.05352783203125, "logps/ref_chosen": -284.08929443359375, "logps/ref_rejected": -245.40060424804688, "logps/rejected": -441.79852294921875, "loss": 1.0605, "margin_dpo/margin_mean": 56.433692932128906, "margin_dpo/margin_std": 105.3170166015625, "step": 335 }, { "epoch": 0.7120418848167539, "grad_norm": 33.01960754394531, "learning_rate": 1.1715810961514072e-07, "logits/chosen": -0.9020859599113464, "logits/rejected": -0.8989129066467285, "logps/chosen": -425.54400634765625, "logps/ref_chosen": -281.47247314453125, "logps/ref_rejected": -256.0491027832031, "logps/rejected": -472.6197204589844, "loss": 1.0478, "margin_dpo/margin_mean": 72.49906921386719, "margin_dpo/margin_std": 109.55391693115234, "step": 340 }, { "epoch": 0.7225130890052356, "grad_norm": 26.777374267578125, "learning_rate": 1.09494297815e-07, "logits/chosen": -0.8701359033584595, "logits/rejected": -0.8566125631332397, "logps/chosen": -399.4504699707031, "logps/ref_chosen": -258.94598388671875, "logps/ref_rejected": -262.75830078125, "logps/rejected": -455.2816467285156, "loss": 1.1001, "margin_dpo/margin_mean": 52.018768310546875, "margin_dpo/margin_std": 89.29933166503906, "step": 345 }, { "epoch": 0.7329842931937173, "grad_norm": 26.882720947265625, "learning_rate": 1.0201883817182949e-07, "logits/chosen": -0.8565577268600464, "logits/rejected": -0.8601759672164917, "logps/chosen": -385.0584411621094, "logps/ref_chosen": -242.0128936767578, "logps/ref_rejected": -219.5886993408203, "logps/rejected": -417.0812072753906, "loss": 1.065, "margin_dpo/margin_mean": 54.44697952270508, "margin_dpo/margin_std": 90.08226013183594, "step": 350 }, { "epoch": 0.743455497382199, "grad_norm": 26.08757781982422, "learning_rate": 9.474175176609956e-08, "logits/chosen": -0.8443655967712402, "logits/rejected": -0.838814914226532, "logps/chosen": -453.3646545410156, "logps/ref_chosen": -309.3603820800781, "logps/ref_rejected": -281.742431640625, "logps/rejected": -493.95404052734375, "loss": 1.1191, "margin_dpo/margin_mean": 68.20734405517578, "margin_dpo/margin_std": 108.90913391113281, "step": 355 }, { "epoch": 0.7539267015706806, "grad_norm": 26.802879333496094, "learning_rate": 8.76727937529367e-08, "logits/chosen": -0.7590880393981934, "logits/rejected": -0.7674987316131592, "logps/chosen": -392.798828125, "logps/ref_chosen": -254.25436401367188, "logps/ref_rejected": -221.65145874023438, "logps/rejected": -422.1822204589844, "loss": 1.0568, "margin_dpo/margin_mean": 61.9863166809082, "margin_dpo/margin_std": 87.15711975097656, "step": 360 }, { "epoch": 0.7643979057591623, "grad_norm": 21.973236083984375, "learning_rate": 8.082144028504231e-08, "logits/chosen": -0.8197676539421082, "logits/rejected": -0.8475183248519897, "logps/chosen": -364.2659912109375, "logps/ref_chosen": -242.38778686523438, "logps/ref_rejected": -208.74813842773438, "logps/rejected": -386.58636474609375, "loss": 1.055, "margin_dpo/margin_mean": 55.96006393432617, "margin_dpo/margin_std": 79.6556167602539, "step": 365 }, { "epoch": 0.774869109947644, "grad_norm": 18.004220962524414, "learning_rate": 7.419687580962222e-08, "logits/chosen": -0.850188136100769, "logits/rejected": -0.8511862754821777, "logps/chosen": -414.29644775390625, "logps/ref_chosen": -293.2469787597656, "logps/ref_rejected": -280.17156982421875, "logps/rejected": -468.7672424316406, "loss": 0.9856, "margin_dpo/margin_mean": 67.54612731933594, "margin_dpo/margin_std": 89.1795425415039, "step": 370 }, { "epoch": 0.7853403141361257, "grad_norm": 22.512737274169922, "learning_rate": 6.780798075635675e-08, "logits/chosen": -0.8369812965393066, "logits/rejected": -0.8140292167663574, "logps/chosen": -450.01275634765625, "logps/ref_chosen": -329.85968017578125, "logps/ref_rejected": -237.68679809570312, "logps/rejected": -409.7642517089844, "loss": 1.0446, "margin_dpo/margin_mean": 51.924415588378906, "margin_dpo/margin_std": 90.19831848144531, "step": 375 }, { "epoch": 0.7958115183246073, "grad_norm": 29.517168045043945, "learning_rate": 6.166331963291519e-08, "logits/chosen": -0.8067518472671509, "logits/rejected": -0.8046929240226746, "logps/chosen": -407.7606506347656, "logps/ref_chosen": -294.3583679199219, "logps/ref_rejected": -266.3445739746094, "logps/rejected": -433.68463134765625, "loss": 1.1103, "margin_dpo/margin_mean": 53.93775177001953, "margin_dpo/margin_std": 93.24676513671875, "step": 380 }, { "epoch": 0.806282722513089, "grad_norm": 24.049985885620117, "learning_rate": 5.57711295439732e-08, "logits/chosen": -0.8399543762207031, "logits/rejected": -0.8283275365829468, "logps/chosen": -422.85137939453125, "logps/ref_chosen": -314.8267822265625, "logps/ref_rejected": -292.8830871582031, "logps/rejected": -470.80560302734375, "loss": 1.0282, "margin_dpo/margin_mean": 69.89784240722656, "margin_dpo/margin_std": 93.79012298583984, "step": 385 }, { "epoch": 0.8167539267015707, "grad_norm": 19.402238845825195, "learning_rate": 5.013930914912476e-08, "logits/chosen": -0.8243298530578613, "logits/rejected": -0.8369429707527161, "logps/chosen": -411.41290283203125, "logps/ref_chosen": -283.4703063964844, "logps/ref_rejected": -237.56201171875, "logps/rejected": -418.91900634765625, "loss": 1.0603, "margin_dpo/margin_mean": 53.41436767578125, "margin_dpo/margin_std": 72.03800964355469, "step": 390 }, { "epoch": 0.8272251308900523, "grad_norm": 19.115346908569336, "learning_rate": 4.477540807448832e-08, "logits/chosen": -0.8676174879074097, "logits/rejected": -0.8561701774597168, "logps/chosen": -418.7401428222656, "logps/ref_chosen": -307.20220947265625, "logps/ref_rejected": -245.660400390625, "logps/rejected": -418.5318908691406, "loss": 1.0248, "margin_dpo/margin_mean": 61.33369827270508, "margin_dpo/margin_std": 90.22969055175781, "step": 395 }, { "epoch": 0.837696335078534, "grad_norm": 28.535594940185547, "learning_rate": 3.968661679220467e-08, "logits/chosen": -0.8202043771743774, "logits/rejected": -0.8029823303222656, "logps/chosen": -394.8623046875, "logps/ref_chosen": -266.63018798828125, "logps/ref_rejected": -285.74224853515625, "logps/rejected": -481.6102600097656, "loss": 1.1088, "margin_dpo/margin_mean": 67.63584899902344, "margin_dpo/margin_std": 88.04866027832031, "step": 400 }, { "epoch": 0.837696335078534, "eval_logits/chosen": -0.8223316073417664, "eval_logits/rejected": -0.8121299743652344, "eval_logps/chosen": -403.2485046386719, "eval_logps/ref_chosen": -280.7076110839844, "eval_logps/ref_rejected": -280.1632385253906, "eval_logps/rejected": -474.8625183105469, "eval_loss": 0.5358365774154663, "eval_margin_dpo/margin_mean": 72.15837860107422, "eval_margin_dpo/margin_std": 96.15919494628906, "eval_runtime": 50.8127, "eval_samples_per_second": 39.36, "eval_steps_per_second": 0.63, "step": 400 }, { "epoch": 0.8481675392670157, "grad_norm": 16.877710342407227, "learning_rate": 3.487975698139084e-08, "logits/chosen": -0.8321081399917603, "logits/rejected": -0.8053818941116333, "logps/chosen": -420.28216552734375, "logps/ref_chosen": -293.24041748046875, "logps/ref_rejected": -270.2939453125, "logps/rejected": -457.31097412109375, "loss": 1.0084, "margin_dpo/margin_mean": 59.9753303527832, "margin_dpo/margin_std": 80.72087097167969, "step": 405 }, { "epoch": 0.8586387434554974, "grad_norm": 21.314437866210938, "learning_rate": 3.036127238347164e-08, "logits/chosen": -0.9076123237609863, "logits/rejected": -0.9141916036605835, "logps/chosen": -440.6188049316406, "logps/ref_chosen": -298.21405029296875, "logps/ref_rejected": -287.6912536621094, "logps/rejected": -481.20684814453125, "loss": 1.0709, "margin_dpo/margin_mean": 51.1108512878418, "margin_dpo/margin_std": 80.4267349243164, "step": 410 }, { "epoch": 0.8691099476439791, "grad_norm": 19.411727905273438, "learning_rate": 2.613722016414943e-08, "logits/chosen": -0.7660075426101685, "logits/rejected": -0.7644788026809692, "logps/chosen": -384.01080322265625, "logps/ref_chosen": -254.6023712158203, "logps/ref_rejected": -278.03485107421875, "logps/rejected": -464.08380126953125, "loss": 1.0408, "margin_dpo/margin_mean": 56.64048385620117, "margin_dpo/margin_std": 90.71663665771484, "step": 415 }, { "epoch": 0.8795811518324608, "grad_norm": 33.511924743652344, "learning_rate": 2.2213262793589482e-08, "logits/chosen": -0.8396707773208618, "logits/rejected": -0.7929636240005493, "logps/chosen": -455.02288818359375, "logps/ref_chosen": -328.2628479003906, "logps/ref_rejected": -267.16888427734375, "logps/rejected": -482.4369201660156, "loss": 1.0272, "margin_dpo/margin_mean": 88.50798034667969, "margin_dpo/margin_std": 90.31407928466797, "step": 420 }, { "epoch": 0.8900523560209425, "grad_norm": 25.034595489501953, "learning_rate": 1.8594660455706763e-08, "logits/chosen": -0.8549168705940247, "logits/rejected": -0.8669727444648743, "logps/chosen": -471.65289306640625, "logps/ref_chosen": -341.20465087890625, "logps/ref_rejected": -259.51910400390625, "logps/rejected": -463.3876037597656, "loss": 1.0265, "margin_dpo/margin_mean": 73.42025756835938, "margin_dpo/margin_std": 100.29209899902344, "step": 425 }, { "epoch": 0.900523560209424, "grad_norm": 26.290597915649414, "learning_rate": 1.5286263996730026e-08, "logits/chosen": -0.8484354019165039, "logits/rejected": -0.831134021282196, "logps/chosen": -436.2567443847656, "logps/ref_chosen": -300.48785400390625, "logps/ref_rejected": -252.441162109375, "logps/rejected": -447.345703125, "loss": 1.0004, "margin_dpo/margin_mean": 59.13561248779297, "margin_dpo/margin_std": 92.1114273071289, "step": 430 }, { "epoch": 0.9109947643979057, "grad_norm": 32.428932189941406, "learning_rate": 1.2292508422495157e-08, "logits/chosen": -0.8825214505195618, "logits/rejected": -0.8753899335861206, "logps/chosen": -389.99371337890625, "logps/ref_chosen": -264.25225830078125, "logps/ref_rejected": -237.9748992919922, "logps/rejected": -427.88104248046875, "loss": 1.0395, "margin_dpo/margin_mean": 64.16473388671875, "margin_dpo/margin_std": 94.56092834472656, "step": 435 }, { "epoch": 0.9214659685863874, "grad_norm": 24.546009063720703, "learning_rate": 9.617406953185136e-09, "logits/chosen": -0.8393553495407104, "logits/rejected": -0.8456501960754395, "logps/chosen": -449.50836181640625, "logps/ref_chosen": -304.78619384765625, "logps/ref_rejected": -334.1107482910156, "logps/rejected": -516.7224731445312, "loss": 1.1006, "margin_dpo/margin_mean": 37.8895378112793, "margin_dpo/margin_std": 85.40432739257812, "step": 440 }, { "epoch": 0.9319371727748691, "grad_norm": 23.76576805114746, "learning_rate": 7.2645456434869965e-09, "logits/chosen": -0.838398277759552, "logits/rejected": -0.8250478506088257, "logps/chosen": -448.4908142089844, "logps/ref_chosen": -293.74560546875, "logps/ref_rejected": -246.31625366210938, "logps/rejected": -445.4239807128906, "loss": 1.0524, "margin_dpo/margin_mean": 44.362464904785156, "margin_dpo/margin_std": 85.54446411132812, "step": 445 }, { "epoch": 0.9424083769633508, "grad_norm": 23.184553146362305, "learning_rate": 5.2370785753763356e-09, "logits/chosen": -0.8070716857910156, "logits/rejected": -0.8125391006469727, "logps/chosen": -381.4776306152344, "logps/ref_chosen": -258.7225646972656, "logps/ref_rejected": -240.88504028320312, "logps/rejected": -421.876220703125, "loss": 1.0018, "margin_dpo/margin_mean": 58.236106872558594, "margin_dpo/margin_std": 83.58892822265625, "step": 450 }, { "epoch": 0.9528795811518325, "grad_norm": 22.7464542388916, "learning_rate": 3.5377236299748147e-09, "logits/chosen": -0.8362730741500854, "logits/rejected": -0.8388243913650513, "logps/chosen": -474.7862854003906, "logps/ref_chosen": -331.77642822265625, "logps/ref_rejected": -305.6571960449219, "logps/rejected": -511.6985778808594, "loss": 1.0562, "margin_dpo/margin_mean": 63.031532287597656, "margin_dpo/margin_std": 97.10409545898438, "step": 455 }, { "epoch": 0.9633507853403142, "grad_norm": 24.231159210205078, "learning_rate": 2.168758844148272e-09, "logits/chosen": -0.8514804840087891, "logits/rejected": -0.8368584513664246, "logps/chosen": -404.0189514160156, "logps/ref_chosen": -267.02508544921875, "logps/ref_rejected": -267.7117919921875, "logps/rejected": -456.15289306640625, "loss": 1.0653, "margin_dpo/margin_mean": 51.44722366333008, "margin_dpo/margin_std": 92.69256591796875, "step": 460 }, { "epoch": 0.9738219895287958, "grad_norm": 19.262453079223633, "learning_rate": 1.1320193567288527e-09, "logits/chosen": -0.859015166759491, "logits/rejected": -0.8542447090148926, "logps/chosen": -407.23431396484375, "logps/ref_chosen": -276.2010803222656, "logps/ref_rejected": -229.0570068359375, "logps/rejected": -442.80023193359375, "loss": 1.0555, "margin_dpo/margin_mean": 82.71009826660156, "margin_dpo/margin_std": 78.12190246582031, "step": 465 }, { "epoch": 0.9842931937172775, "grad_norm": 20.87303352355957, "learning_rate": 4.288949484559934e-10, "logits/chosen": -0.8113743662834167, "logits/rejected": -0.7894054651260376, "logps/chosen": -377.26910400390625, "logps/ref_chosen": -257.05743408203125, "logps/ref_rejected": -235.47738647460938, "logps/rejected": -443.84393310546875, "loss": 1.0227, "margin_dpo/margin_mean": 88.15492248535156, "margin_dpo/margin_std": 88.85401153564453, "step": 470 }, { "epoch": 0.9947643979057592, "grad_norm": 22.590627670288086, "learning_rate": 6.032817893297793e-11, "logits/chosen": -0.8724172711372375, "logits/rejected": -0.8867457509040833, "logps/chosen": -366.89349365234375, "logps/ref_chosen": -235.49948120117188, "logps/ref_rejected": -264.59051513671875, "logps/rejected": -458.58197021484375, "loss": 1.0225, "margin_dpo/margin_mean": 62.59749221801758, "margin_dpo/margin_std": 81.80487060546875, "step": 475 }, { "epoch": 0.9989528795811519, "step": 477, "total_flos": 0.0, "train_loss": 1.1476524381017785, "train_runtime": 4456.6592, "train_samples_per_second": 13.718, "train_steps_per_second": 0.107 } ], "logging_steps": 5, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }