{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 50, "global_step": 2784, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01078566854292359, "grad_norm": 20.25, "learning_rate": 2.990301724137931e-06, "logits/chosen": -1.2338563203811646, "logits/rejected": -1.2257438898086548, "logps/chosen": -490.282470703125, "logps/rejected": -482.52545166015625, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": 0.0017994878580793738, "rewards/margins": 0.006464972160756588, "rewards/rejected": -0.004665483720600605, "step": 10 }, { "epoch": 0.02157133708584718, "grad_norm": 22.0, "learning_rate": 2.979525862068966e-06, "logits/chosen": -1.2723968029022217, "logits/rejected": -1.2785792350769043, "logps/chosen": -520.5911254882812, "logps/rejected": -514.4813842773438, "loss": 0.6856, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": 0.0043027508072555065, "rewards/margins": 0.0164328720420599, "rewards/rejected": -0.01213012170046568, "step": 20 }, { "epoch": 0.03235700562877077, "grad_norm": 19.125, "learning_rate": 2.96875e-06, "logits/chosen": -1.2637840509414673, "logits/rejected": -1.246565818786621, "logps/chosen": -507.4403381347656, "logps/rejected": -501.22479248046875, "loss": 0.6789, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.0017711544642224908, "rewards/margins": 0.030538281425833702, "rewards/rejected": -0.032309435307979584, "step": 30 }, { "epoch": 0.04314267417169436, "grad_norm": 20.75, "learning_rate": 2.9579741379310345e-06, "logits/chosen": -1.2590439319610596, "logits/rejected": -1.2330172061920166, "logps/chosen": -485.248291015625, "logps/rejected": -482.63458251953125, "loss": 0.6765, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.00601751497015357, "rewards/margins": 0.03583287447690964, "rewards/rejected": -0.041850391775369644, "step": 40 }, { "epoch": 0.05392834271461795, "grad_norm": 20.25, "learning_rate": 2.947198275862069e-06, "logits/chosen": -1.3426216840744019, "logits/rejected": -1.3334815502166748, "logps/chosen": -532.8280029296875, "logps/rejected": -514.1970825195312, "loss": 0.6684, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.00931160431355238, "rewards/margins": 0.053057052195072174, "rewards/rejected": -0.06236865371465683, "step": 50 }, { "epoch": 0.05392834271461795, "eval_logits/chosen": -1.3231499195098877, "eval_logits/rejected": -1.3642845153808594, "eval_logps/chosen": -549.2651977539062, "eval_logps/rejected": -494.54583740234375, "eval_loss": 0.6826924681663513, "eval_rewards/accuracies": 0.6153846383094788, "eval_rewards/chosen": -0.05200628936290741, "eval_rewards/margins": 0.02476159855723381, "eval_rewards/rejected": -0.07676788419485092, "eval_runtime": 13.2928, "eval_samples_per_second": 7.523, "eval_steps_per_second": 0.978, "step": 50 }, { "epoch": 0.06471401125754155, "grad_norm": 20.25, "learning_rate": 2.9364224137931035e-06, "logits/chosen": -1.3449946641921997, "logits/rejected": -1.343201756477356, "logps/chosen": -548.1566772460938, "logps/rejected": -531.6286010742188, "loss": 0.6705, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.028984328731894493, "rewards/margins": 0.04919930547475815, "rewards/rejected": -0.07818363606929779, "step": 60 }, { "epoch": 0.07549967980046514, "grad_norm": 20.875, "learning_rate": 2.925646551724138e-06, "logits/chosen": -1.3576488494873047, "logits/rejected": -1.3385895490646362, "logps/chosen": -516.7993774414062, "logps/rejected": -502.6502990722656, "loss": 0.6624, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03774970397353172, "rewards/margins": 0.0672496110200882, "rewards/rejected": -0.10499931871891022, "step": 70 }, { "epoch": 0.08628534834338872, "grad_norm": 19.5, "learning_rate": 2.9148706896551725e-06, "logits/chosen": -1.4393236637115479, "logits/rejected": -1.4375989437103271, "logps/chosen": -534.2485961914062, "logps/rejected": -516.8245849609375, "loss": 0.6547, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.05567573383450508, "rewards/margins": 0.08416923880577087, "rewards/rejected": -0.13984496891498566, "step": 80 }, { "epoch": 0.09707101688631231, "grad_norm": 20.625, "learning_rate": 2.904094827586207e-06, "logits/chosen": -1.4080358743667603, "logits/rejected": -1.399465799331665, "logps/chosen": -503.4671936035156, "logps/rejected": -488.74383544921875, "loss": 0.6565, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.07379738986492157, "rewards/margins": 0.08224000781774521, "rewards/rejected": -0.15603742003440857, "step": 90 }, { "epoch": 0.1078566854292359, "grad_norm": 20.375, "learning_rate": 2.8933189655172415e-06, "logits/chosen": -1.3748817443847656, "logits/rejected": -1.366629958152771, "logps/chosen": -518.0892944335938, "logps/rejected": -499.7432556152344, "loss": 0.6541, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.09199288487434387, "rewards/margins": 0.08948174864053726, "rewards/rejected": -0.18147462606430054, "step": 100 }, { "epoch": 0.1078566854292359, "eval_logits/chosen": -1.4332343339920044, "eval_logits/rejected": -1.479236364364624, "eval_logps/chosen": -551.7933349609375, "eval_logps/rejected": -497.4646911621094, "eval_loss": 0.6763917803764343, "eval_rewards/accuracies": 0.5769230723381042, "eval_rewards/chosen": -0.17841331660747528, "eval_rewards/margins": 0.04429765045642853, "eval_rewards/rejected": -0.222710981965065, "eval_runtime": 12.8741, "eval_samples_per_second": 7.768, "eval_steps_per_second": 1.01, "step": 100 }, { "epoch": 0.1186423539721595, "grad_norm": 19.25, "learning_rate": 2.8825431034482758e-06, "logits/chosen": -1.436632513999939, "logits/rejected": -1.4507520198822021, "logps/chosen": -508.725830078125, "logps/rejected": -500.9781799316406, "loss": 0.6421, "rewards/accuracies": 0.729687511920929, "rewards/chosen": -0.11871524900197983, "rewards/margins": 0.11672022193670273, "rewards/rejected": -0.23543548583984375, "step": 110 }, { "epoch": 0.1294280225150831, "grad_norm": 19.375, "learning_rate": 2.8717672413793105e-06, "logits/chosen": -1.4648942947387695, "logits/rejected": -1.474923849105835, "logps/chosen": -522.5699462890625, "logps/rejected": -521.1087036132812, "loss": 0.635, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.14662253856658936, "rewards/margins": 0.13384078443050385, "rewards/rejected": -0.2804633677005768, "step": 120 }, { "epoch": 0.14021369105800668, "grad_norm": 21.0, "learning_rate": 2.860991379310345e-06, "logits/chosen": -1.5117241144180298, "logits/rejected": -1.5187674760818481, "logps/chosen": -526.3902587890625, "logps/rejected": -518.88525390625, "loss": 0.6413, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1973995864391327, "rewards/margins": 0.12558409571647644, "rewards/rejected": -0.32298368215560913, "step": 130 }, { "epoch": 0.15099935960093028, "grad_norm": 19.75, "learning_rate": 2.8502155172413795e-06, "logits/chosen": -1.5119014978408813, "logits/rejected": -1.5130690336227417, "logps/chosen": -533.3895263671875, "logps/rejected": -527.9937744140625, "loss": 0.6401, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.22728869318962097, "rewards/margins": 0.13429734110832214, "rewards/rejected": -0.3615860342979431, "step": 140 }, { "epoch": 0.16178502814385384, "grad_norm": 19.375, "learning_rate": 2.839439655172414e-06, "logits/chosen": -1.5527749061584473, "logits/rejected": -1.5703589916229248, "logps/chosen": -515.7926025390625, "logps/rejected": -503.88360595703125, "loss": 0.6397, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2661042809486389, "rewards/margins": 0.13428668677806854, "rewards/rejected": -0.40039095282554626, "step": 150 }, { "epoch": 0.16178502814385384, "eval_logits/chosen": -1.5442339181900024, "eval_logits/rejected": -1.5949413776397705, "eval_logps/chosen": -555.982666015625, "eval_logps/rejected": -502.18585205078125, "eval_loss": 0.6722157001495361, "eval_rewards/accuracies": 0.5865384340286255, "eval_rewards/chosen": -0.38787999749183655, "eval_rewards/margins": 0.07088876515626907, "eval_rewards/rejected": -0.4587687849998474, "eval_runtime": 12.8342, "eval_samples_per_second": 7.792, "eval_steps_per_second": 1.013, "step": 150 }, { "epoch": 0.17257069668677744, "grad_norm": 20.0, "learning_rate": 2.8286637931034485e-06, "logits/chosen": -1.5724695920944214, "logits/rejected": -1.5730245113372803, "logps/chosen": -506.13671875, "logps/rejected": -495.46142578125, "loss": 0.6699, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.29719287157058716, "rewards/margins": 0.10155584663152695, "rewards/rejected": -0.3987486958503723, "step": 160 }, { "epoch": 0.18335636522970103, "grad_norm": 21.125, "learning_rate": 2.817887931034483e-06, "logits/chosen": -1.5566643476486206, "logits/rejected": -1.555086612701416, "logps/chosen": -557.1043701171875, "logps/rejected": -539.7008666992188, "loss": 0.6322, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.32251477241516113, "rewards/margins": 0.1617726981639862, "rewards/rejected": -0.48428741097450256, "step": 170 }, { "epoch": 0.19414203377262462, "grad_norm": 20.5, "learning_rate": 2.807112068965517e-06, "logits/chosen": -1.5289297103881836, "logits/rejected": -1.5290358066558838, "logps/chosen": -527.3707275390625, "logps/rejected": -511.8185119628906, "loss": 0.6347, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.3478879928588867, "rewards/margins": 0.1548273265361786, "rewards/rejected": -0.5027152895927429, "step": 180 }, { "epoch": 0.20492770231554822, "grad_norm": 21.0, "learning_rate": 2.796336206896552e-06, "logits/chosen": -1.5106637477874756, "logits/rejected": -1.4951450824737549, "logps/chosen": -535.6061401367188, "logps/rejected": -528.7681884765625, "loss": 0.623, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.35832709074020386, "rewards/margins": 0.20923912525177002, "rewards/rejected": -0.5675662755966187, "step": 190 }, { "epoch": 0.2157133708584718, "grad_norm": 19.75, "learning_rate": 2.785560344827586e-06, "logits/chosen": -1.6122379302978516, "logits/rejected": -1.6149158477783203, "logps/chosen": -556.2003173828125, "logps/rejected": -536.0861206054688, "loss": 0.6237, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -0.40547609329223633, "rewards/margins": 0.18066053092479706, "rewards/rejected": -0.5861365795135498, "step": 200 }, { "epoch": 0.2157133708584718, "eval_logits/chosen": -1.6185290813446045, "eval_logits/rejected": -1.671698808670044, "eval_logps/chosen": -559.1907348632812, "eval_logps/rejected": -505.73992919921875, "eval_loss": 0.6703996062278748, "eval_rewards/accuracies": 0.5769230723381042, "eval_rewards/chosen": -0.5482814311981201, "eval_rewards/margins": 0.08819277584552765, "eval_rewards/rejected": -0.6364741921424866, "eval_runtime": 12.8368, "eval_samples_per_second": 7.79, "eval_steps_per_second": 1.013, "step": 200 }, { "epoch": 0.2264990394013954, "grad_norm": 20.0, "learning_rate": 2.774784482758621e-06, "logits/chosen": -1.602872610092163, "logits/rejected": -1.6062767505645752, "logps/chosen": -494.86566162109375, "logps/rejected": -486.106689453125, "loss": 0.6313, "rewards/accuracies": 0.676562488079071, "rewards/chosen": -0.41412362456321716, "rewards/margins": 0.16503319144248962, "rewards/rejected": -0.5791568160057068, "step": 210 }, { "epoch": 0.237284707944319, "grad_norm": 22.875, "learning_rate": 2.764008620689655e-06, "logits/chosen": -1.6498100757598877, "logits/rejected": -1.6630780696868896, "logps/chosen": -540.1375732421875, "logps/rejected": -533.9187622070312, "loss": 0.6307, "rewards/accuracies": 0.6546875238418579, "rewards/chosen": -0.4561203420162201, "rewards/margins": 0.18029221892356873, "rewards/rejected": -0.636412501335144, "step": 220 }, { "epoch": 0.2480703764872426, "grad_norm": 19.5, "learning_rate": 2.75323275862069e-06, "logits/chosen": -1.667067527770996, "logits/rejected": -1.681429147720337, "logps/chosen": -525.0696411132812, "logps/rejected": -508.50067138671875, "loss": 0.6261, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.4919908940792084, "rewards/margins": 0.19147078692913055, "rewards/rejected": -0.6834616661071777, "step": 230 }, { "epoch": 0.2588560450301662, "grad_norm": 22.375, "learning_rate": 2.742456896551724e-06, "logits/chosen": -1.646775484085083, "logits/rejected": -1.6376352310180664, "logps/chosen": -517.5604248046875, "logps/rejected": -504.1005859375, "loss": 0.6228, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.4839399755001068, "rewards/margins": 0.1933339387178421, "rewards/rejected": -0.6772739291191101, "step": 240 }, { "epoch": 0.26964171357308975, "grad_norm": 20.125, "learning_rate": 2.7316810344827584e-06, "logits/chosen": -1.630406379699707, "logits/rejected": -1.6203149557113647, "logps/chosen": -511.6064453125, "logps/rejected": -504.46112060546875, "loss": 0.6224, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.5028296709060669, "rewards/margins": 0.19740687310695648, "rewards/rejected": -0.700236439704895, "step": 250 }, { "epoch": 0.26964171357308975, "eval_logits/chosen": -1.671502947807312, "eval_logits/rejected": -1.727898359298706, "eval_logps/chosen": -561.3909912109375, "eval_logps/rejected": -507.87896728515625, "eval_loss": 0.6769301891326904, "eval_rewards/accuracies": 0.567307710647583, "eval_rewards/chosen": -0.6582961678504944, "eval_rewards/margins": 0.08512917906045914, "eval_rewards/rejected": -0.7434254288673401, "eval_runtime": 12.8313, "eval_samples_per_second": 7.793, "eval_steps_per_second": 1.013, "step": 250 }, { "epoch": 0.28042738211601337, "grad_norm": 22.75, "learning_rate": 2.720905172413793e-06, "logits/chosen": -1.6629798412322998, "logits/rejected": -1.664310097694397, "logps/chosen": -564.4110717773438, "logps/rejected": -550.1747436523438, "loss": 0.6344, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.51663738489151, "rewards/margins": 0.17350056767463684, "rewards/rejected": -0.6901379823684692, "step": 260 }, { "epoch": 0.29121305065893693, "grad_norm": 20.5, "learning_rate": 2.7101293103448275e-06, "logits/chosen": -1.6739919185638428, "logits/rejected": -1.6669447422027588, "logps/chosen": -562.1046752929688, "logps/rejected": -553.902099609375, "loss": 0.6256, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.5245037078857422, "rewards/margins": 0.19063076376914978, "rewards/rejected": -0.7151345014572144, "step": 270 }, { "epoch": 0.30199871920186055, "grad_norm": 19.875, "learning_rate": 2.699353448275862e-06, "logits/chosen": -1.6442272663116455, "logits/rejected": -1.6505457162857056, "logps/chosen": -529.9203491210938, "logps/rejected": -513.3055419921875, "loss": 0.6378, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5445322394371033, "rewards/margins": 0.16621330380439758, "rewards/rejected": -0.710745632648468, "step": 280 }, { "epoch": 0.3127843877447841, "grad_norm": 21.125, "learning_rate": 2.6885775862068965e-06, "logits/chosen": -1.6472547054290771, "logits/rejected": -1.6532646417617798, "logps/chosen": -527.1898193359375, "logps/rejected": -516.4867553710938, "loss": 0.6132, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.49309319257736206, "rewards/margins": 0.21812161803245544, "rewards/rejected": -0.7112148404121399, "step": 290 }, { "epoch": 0.3235700562877077, "grad_norm": 19.0, "learning_rate": 2.677801724137931e-06, "logits/chosen": -1.6490532159805298, "logits/rejected": -1.6721302270889282, "logps/chosen": -567.3697509765625, "logps/rejected": -550.6924438476562, "loss": 0.6216, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5456413626670837, "rewards/margins": 0.2058580368757248, "rewards/rejected": -0.7514994144439697, "step": 300 }, { "epoch": 0.3235700562877077, "eval_logits/chosen": -1.6934152841567993, "eval_logits/rejected": -1.7509509325027466, "eval_logps/chosen": -562.0770874023438, "eval_logps/rejected": -508.9422302246094, "eval_loss": 0.6661080121994019, "eval_rewards/accuracies": 0.5865384340286255, "eval_rewards/chosen": -0.692603349685669, "eval_rewards/margins": 0.10398232936859131, "eval_rewards/rejected": -0.7965856194496155, "eval_runtime": 12.8789, "eval_samples_per_second": 7.765, "eval_steps_per_second": 1.009, "step": 300 }, { "epoch": 0.3343557248306313, "grad_norm": 21.875, "learning_rate": 2.6670258620689655e-06, "logits/chosen": -1.7095540761947632, "logits/rejected": -1.7263736724853516, "logps/chosen": -530.0485229492188, "logps/rejected": -530.2100830078125, "loss": 0.6314, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.6013873815536499, "rewards/margins": 0.17918090522289276, "rewards/rejected": -0.7805682420730591, "step": 310 }, { "epoch": 0.3451413933735549, "grad_norm": 20.375, "learning_rate": 2.6562499999999998e-06, "logits/chosen": -1.7012121677398682, "logits/rejected": -1.6956939697265625, "logps/chosen": -520.4534301757812, "logps/rejected": -511.6394958496094, "loss": 0.6502, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.6033864617347717, "rewards/margins": 0.14426395297050476, "rewards/rejected": -0.7476503252983093, "step": 320 }, { "epoch": 0.3559270619164785, "grad_norm": 20.625, "learning_rate": 2.6454741379310345e-06, "logits/chosen": -1.6911704540252686, "logits/rejected": -1.6552823781967163, "logps/chosen": -537.2453002929688, "logps/rejected": -534.7839965820312, "loss": 0.6255, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.568359375, "rewards/margins": 0.22105315327644348, "rewards/rejected": -0.7894124388694763, "step": 330 }, { "epoch": 0.36671273045940206, "grad_norm": 21.5, "learning_rate": 2.6346982758620688e-06, "logits/chosen": -1.664223313331604, "logits/rejected": -1.6563825607299805, "logps/chosen": -565.0128784179688, "logps/rejected": -551.3712158203125, "loss": 0.6117, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.5529137849807739, "rewards/margins": 0.23182833194732666, "rewards/rejected": -0.7847420573234558, "step": 340 }, { "epoch": 0.3774983990023257, "grad_norm": 21.75, "learning_rate": 2.6239224137931035e-06, "logits/chosen": -1.6517295837402344, "logits/rejected": -1.6636890172958374, "logps/chosen": -499.03717041015625, "logps/rejected": -469.7442932128906, "loss": 0.6454, "rewards/accuracies": 0.625, "rewards/chosen": -0.559511125087738, "rewards/margins": 0.14985349774360657, "rewards/rejected": -0.7093645930290222, "step": 350 }, { "epoch": 0.3774983990023257, "eval_logits/chosen": -1.7148549556732178, "eval_logits/rejected": -1.7745919227600098, "eval_logps/chosen": -562.5575561523438, "eval_logps/rejected": -509.7618713378906, "eval_loss": 0.6621462106704712, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": -0.7166208624839783, "eval_rewards/margins": 0.12094759196043015, "eval_rewards/rejected": -0.8375685811042786, "eval_runtime": 12.8579, "eval_samples_per_second": 7.777, "eval_steps_per_second": 1.011, "step": 350 }, { "epoch": 0.38828406754524925, "grad_norm": 18.625, "learning_rate": 2.613146551724138e-06, "logits/chosen": -1.6570751667022705, "logits/rejected": -1.673710584640503, "logps/chosen": -550.8926391601562, "logps/rejected": -530.0281982421875, "loss": 0.635, "rewards/accuracies": 0.682812511920929, "rewards/chosen": -0.5474696755409241, "rewards/margins": 0.180132657289505, "rewards/rejected": -0.7276023626327515, "step": 360 }, { "epoch": 0.39906973608817287, "grad_norm": 22.125, "learning_rate": 2.6023706896551725e-06, "logits/chosen": -1.7166540622711182, "logits/rejected": -1.7119417190551758, "logps/chosen": -535.0701904296875, "logps/rejected": -521.47509765625, "loss": 0.6346, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.6037973761558533, "rewards/margins": 0.18038420379161835, "rewards/rejected": -0.7841815948486328, "step": 370 }, { "epoch": 0.40985540463109643, "grad_norm": 21.5, "learning_rate": 2.591594827586207e-06, "logits/chosen": -1.711259126663208, "logits/rejected": -1.7223602533340454, "logps/chosen": -554.1033935546875, "logps/rejected": -536.9190673828125, "loss": 0.6215, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.5765694379806519, "rewards/margins": 0.21333327889442444, "rewards/rejected": -0.7899028062820435, "step": 380 }, { "epoch": 0.42064107317402, "grad_norm": 20.125, "learning_rate": 2.580818965517241e-06, "logits/chosen": -1.674808144569397, "logits/rejected": -1.7261720895767212, "logps/chosen": -534.8923950195312, "logps/rejected": -509.17437744140625, "loss": 0.6234, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.5851498246192932, "rewards/margins": 0.20684213936328888, "rewards/rejected": -0.7919918298721313, "step": 390 }, { "epoch": 0.4314267417169436, "grad_norm": 21.375, "learning_rate": 2.5700431034482762e-06, "logits/chosen": -1.7043625116348267, "logits/rejected": -1.721663236618042, "logps/chosen": -547.33935546875, "logps/rejected": -535.8040771484375, "loss": 0.6314, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.6050974130630493, "rewards/margins": 0.18475469946861267, "rewards/rejected": -0.7898520827293396, "step": 400 }, { "epoch": 0.4314267417169436, "eval_logits/chosen": -1.7199591398239136, "eval_logits/rejected": -1.7800374031066895, "eval_logps/chosen": -562.4886474609375, "eval_logps/rejected": -509.6477355957031, "eval_loss": 0.6607488989830017, "eval_rewards/accuracies": 0.6153846383094788, "eval_rewards/chosen": -0.7131773829460144, "eval_rewards/margins": 0.11868361383676529, "eval_rewards/rejected": -0.831861138343811, "eval_runtime": 12.8809, "eval_samples_per_second": 7.763, "eval_steps_per_second": 1.009, "step": 400 }, { "epoch": 0.4422124102598672, "grad_norm": 20.25, "learning_rate": 2.5592672413793105e-06, "logits/chosen": -1.7056515216827393, "logits/rejected": -1.7032356262207031, "logps/chosen": -544.6566162109375, "logps/rejected": -526.3981323242188, "loss": 0.6346, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.6110819578170776, "rewards/margins": 0.17850695550441742, "rewards/rejected": -0.7895889282226562, "step": 410 }, { "epoch": 0.4529980788027908, "grad_norm": 21.125, "learning_rate": 2.5484913793103452e-06, "logits/chosen": -1.7077720165252686, "logits/rejected": -1.7262470722198486, "logps/chosen": -538.9916381835938, "logps/rejected": -514.1024780273438, "loss": 0.6319, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.6130255460739136, "rewards/margins": 0.18954649567604065, "rewards/rejected": -0.8025720715522766, "step": 420 }, { "epoch": 0.46378374734571437, "grad_norm": 21.125, "learning_rate": 2.5377155172413795e-06, "logits/chosen": -1.6869211196899414, "logits/rejected": -1.7010374069213867, "logps/chosen": -519.823486328125, "logps/rejected": -514.0619506835938, "loss": 0.6187, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": -0.6134659647941589, "rewards/margins": 0.21337684988975525, "rewards/rejected": -0.8268427848815918, "step": 430 }, { "epoch": 0.474569415888638, "grad_norm": 20.75, "learning_rate": 2.526939655172414e-06, "logits/chosen": -1.7328685522079468, "logits/rejected": -1.7307264804840088, "logps/chosen": -519.9312744140625, "logps/rejected": -513.1961669921875, "loss": 0.6278, "rewards/accuracies": 0.676562488079071, "rewards/chosen": -0.6580926179885864, "rewards/margins": 0.20139017701148987, "rewards/rejected": -0.8594827651977539, "step": 440 }, { "epoch": 0.48535508443156156, "grad_norm": 21.0, "learning_rate": 2.5161637931034486e-06, "logits/chosen": -1.7335723638534546, "logits/rejected": -1.7415554523468018, "logps/chosen": -526.5194702148438, "logps/rejected": -518.3421020507812, "loss": 0.6219, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.6824163198471069, "rewards/margins": 0.2106122523546219, "rewards/rejected": -0.8930285573005676, "step": 450 }, { "epoch": 0.48535508443156156, "eval_logits/chosen": -1.7433769702911377, "eval_logits/rejected": -1.8047083616256714, "eval_logps/chosen": -564.2274169921875, "eval_logps/rejected": -511.63067626953125, "eval_loss": 0.6593914031982422, "eval_rewards/accuracies": 0.5769230723381042, "eval_rewards/chosen": -0.8001139760017395, "eval_rewards/margins": 0.13089530169963837, "eval_rewards/rejected": -0.9310091733932495, "eval_runtime": 12.8159, "eval_samples_per_second": 7.803, "eval_steps_per_second": 1.014, "step": 450 }, { "epoch": 0.4961407529744852, "grad_norm": 20.875, "learning_rate": 2.505387931034483e-06, "logits/chosen": -1.7393690347671509, "logits/rejected": -1.7588695287704468, "logps/chosen": -552.0422973632812, "logps/rejected": -530.3653564453125, "loss": 0.6202, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.686679482460022, "rewards/margins": 0.22524280846118927, "rewards/rejected": -0.9119223356246948, "step": 460 }, { "epoch": 0.5069264215174087, "grad_norm": 20.25, "learning_rate": 2.4946120689655176e-06, "logits/chosen": -1.7419532537460327, "logits/rejected": -1.7394039630889893, "logps/chosen": -526.1238403320312, "logps/rejected": -506.7391052246094, "loss": 0.6301, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.6740504503250122, "rewards/margins": 0.18307238817214966, "rewards/rejected": -0.8571227788925171, "step": 470 }, { "epoch": 0.5177120900603324, "grad_norm": 22.375, "learning_rate": 2.483836206896552e-06, "logits/chosen": -1.6943247318267822, "logits/rejected": -1.6783298254013062, "logps/chosen": -568.21435546875, "logps/rejected": -553.17822265625, "loss": 0.6339, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.6776462197303772, "rewards/margins": 0.1882401704788208, "rewards/rejected": -0.865886390209198, "step": 480 }, { "epoch": 0.5284977586032559, "grad_norm": 21.125, "learning_rate": 2.473060344827586e-06, "logits/chosen": -1.7065378427505493, "logits/rejected": -1.7249234914779663, "logps/chosen": -529.4561767578125, "logps/rejected": -506.04071044921875, "loss": 0.6378, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.6808103322982788, "rewards/margins": 0.17550134658813477, "rewards/rejected": -0.8563116788864136, "step": 490 }, { "epoch": 0.5392834271461795, "grad_norm": 23.75, "learning_rate": 2.462284482758621e-06, "logits/chosen": -1.6840463876724243, "logits/rejected": -1.70773184299469, "logps/chosen": -530.1900634765625, "logps/rejected": -512.8983764648438, "loss": 0.6382, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6517452001571655, "rewards/margins": 0.17097657918930054, "rewards/rejected": -0.8227217793464661, "step": 500 }, { "epoch": 0.5392834271461795, "eval_logits/chosen": -1.7378157377243042, "eval_logits/rejected": -1.7989575862884521, "eval_logps/chosen": -563.543212890625, "eval_logps/rejected": -510.96929931640625, "eval_loss": 0.6553998589515686, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -0.7659082412719727, "eval_rewards/margins": 0.1320342719554901, "eval_rewards/rejected": -0.8979425430297852, "eval_runtime": 12.8216, "eval_samples_per_second": 7.799, "eval_steps_per_second": 1.014, "step": 500 }, { "epoch": 0.5500690956891031, "grad_norm": 23.375, "learning_rate": 2.451508620689655e-06, "logits/chosen": -1.7287929058074951, "logits/rejected": -1.747097373008728, "logps/chosen": -568.22900390625, "logps/rejected": -537.8626098632812, "loss": 0.6234, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.6510963439941406, "rewards/margins": 0.20002727210521698, "rewards/rejected": -0.8511236310005188, "step": 510 }, { "epoch": 0.5608547642320267, "grad_norm": 21.375, "learning_rate": 2.44073275862069e-06, "logits/chosen": -1.7096725702285767, "logits/rejected": -1.713513970375061, "logps/chosen": -578.6484375, "logps/rejected": -552.06884765625, "loss": 0.618, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": -0.6893950700759888, "rewards/margins": 0.22265811264514923, "rewards/rejected": -0.9120532274246216, "step": 520 }, { "epoch": 0.5716404327749502, "grad_norm": 24.375, "learning_rate": 2.429956896551724e-06, "logits/chosen": -1.7773420810699463, "logits/rejected": -1.7806062698364258, "logps/chosen": -561.3190307617188, "logps/rejected": -557.246826171875, "loss": 0.6274, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.6820749640464783, "rewards/margins": 0.23882977664470673, "rewards/rejected": -0.9209047555923462, "step": 530 }, { "epoch": 0.5824261013178739, "grad_norm": 21.625, "learning_rate": 2.419181034482759e-06, "logits/chosen": -1.7619960308074951, "logits/rejected": -1.7642993927001953, "logps/chosen": -520.9690551757812, "logps/rejected": -509.5274353027344, "loss": 0.6309, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7083293795585632, "rewards/margins": 0.1926591694355011, "rewards/rejected": -0.9009885787963867, "step": 540 }, { "epoch": 0.5932117698607975, "grad_norm": 18.75, "learning_rate": 2.408405172413793e-06, "logits/chosen": -1.7466360330581665, "logits/rejected": -1.746272325515747, "logps/chosen": -513.3010864257812, "logps/rejected": -490.46624755859375, "loss": 0.6377, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7287647128105164, "rewards/margins": 0.16981419920921326, "rewards/rejected": -0.8985790014266968, "step": 550 }, { "epoch": 0.5932117698607975, "eval_logits/chosen": -1.7363488674163818, "eval_logits/rejected": -1.7969601154327393, "eval_logps/chosen": -564.2831420898438, "eval_logps/rejected": -511.74945068359375, "eval_loss": 0.6561428904533386, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -0.80290687084198, "eval_rewards/margins": 0.13403938710689545, "eval_rewards/rejected": -0.9369462728500366, "eval_runtime": 12.8373, "eval_samples_per_second": 7.79, "eval_steps_per_second": 1.013, "step": 550 }, { "epoch": 0.6039974384037211, "grad_norm": 21.75, "learning_rate": 2.3976293103448275e-06, "logits/chosen": -1.7120717763900757, "logits/rejected": -1.717246651649475, "logps/chosen": -544.954833984375, "logps/rejected": -519.8233032226562, "loss": 0.6182, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.6614271402359009, "rewards/margins": 0.22156758606433868, "rewards/rejected": -0.8829947710037231, "step": 560 }, { "epoch": 0.6147831069466446, "grad_norm": 22.625, "learning_rate": 2.386853448275862e-06, "logits/chosen": -1.6905927658081055, "logits/rejected": -1.709628701210022, "logps/chosen": -529.6423950195312, "logps/rejected": -513.8201904296875, "loss": 0.626, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.6795625686645508, "rewards/margins": 0.20236878097057343, "rewards/rejected": -0.8819311857223511, "step": 570 }, { "epoch": 0.6255687754895682, "grad_norm": 21.375, "learning_rate": 2.3760775862068965e-06, "logits/chosen": -1.7218097448349, "logits/rejected": -1.7233638763427734, "logps/chosen": -566.1005249023438, "logps/rejected": -550.3419799804688, "loss": 0.6288, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7431906461715698, "rewards/margins": 0.20129895210266113, "rewards/rejected": -0.9444894790649414, "step": 580 }, { "epoch": 0.6363544440324919, "grad_norm": 21.625, "learning_rate": 2.365301724137931e-06, "logits/chosen": -1.718971848487854, "logits/rejected": -1.7108612060546875, "logps/chosen": -552.8258056640625, "logps/rejected": -534.0645141601562, "loss": 0.6376, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.7128755450248718, "rewards/margins": 0.18357399106025696, "rewards/rejected": -0.8964495658874512, "step": 590 }, { "epoch": 0.6471401125754154, "grad_norm": 21.75, "learning_rate": 2.3545258620689655e-06, "logits/chosen": -1.6846719980239868, "logits/rejected": -1.6692850589752197, "logps/chosen": -575.1969604492188, "logps/rejected": -551.0662841796875, "loss": 0.6428, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7172713279724121, "rewards/margins": 0.16418126225471497, "rewards/rejected": -0.8814526796340942, "step": 600 }, { "epoch": 0.6471401125754154, "eval_logits/chosen": -1.7446637153625488, "eval_logits/rejected": -1.805701494216919, "eval_logps/chosen": -564.8399658203125, "eval_logps/rejected": -512.3929443359375, "eval_loss": 0.6552779674530029, "eval_rewards/accuracies": 0.6153846383094788, "eval_rewards/chosen": -0.8307470083236694, "eval_rewards/margins": 0.13837800920009613, "eval_rewards/rejected": -0.9691251516342163, "eval_runtime": 12.8832, "eval_samples_per_second": 7.762, "eval_steps_per_second": 1.009, "step": 600 }, { "epoch": 0.657925781118339, "grad_norm": 19.625, "learning_rate": 2.3437500000000002e-06, "logits/chosen": -1.7127196788787842, "logits/rejected": -1.722394585609436, "logps/chosen": -534.408447265625, "logps/rejected": -524.6024169921875, "loss": 0.6382, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.7230153679847717, "rewards/margins": 0.17467036843299866, "rewards/rejected": -0.897685706615448, "step": 610 }, { "epoch": 0.6687114496612626, "grad_norm": 19.25, "learning_rate": 2.3329741379310345e-06, "logits/chosen": -1.731837511062622, "logits/rejected": -1.7623554468154907, "logps/chosen": -528.728515625, "logps/rejected": -500.19500732421875, "loss": 0.6294, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.695734441280365, "rewards/margins": 0.19728422164916992, "rewards/rejected": -0.8930186033248901, "step": 620 }, { "epoch": 0.6794971182041862, "grad_norm": 22.875, "learning_rate": 2.322198275862069e-06, "logits/chosen": -1.6935323476791382, "logits/rejected": -1.7253952026367188, "logps/chosen": -557.6375122070312, "logps/rejected": -530.9302368164062, "loss": 0.6425, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.7235587239265442, "rewards/margins": 0.1692447066307068, "rewards/rejected": -0.8928033709526062, "step": 630 }, { "epoch": 0.6902827867471097, "grad_norm": 22.75, "learning_rate": 2.3114224137931035e-06, "logits/chosen": -1.7822033166885376, "logits/rejected": -1.7954456806182861, "logps/chosen": -548.27294921875, "logps/rejected": -529.1452026367188, "loss": 0.6357, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.733366847038269, "rewards/margins": 0.18996267020702362, "rewards/rejected": -0.9233294725418091, "step": 640 }, { "epoch": 0.7010684552900334, "grad_norm": 23.625, "learning_rate": 2.300646551724138e-06, "logits/chosen": -1.7504284381866455, "logits/rejected": -1.7501709461212158, "logps/chosen": -563.0472412109375, "logps/rejected": -547.3125, "loss": 0.6376, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7303886413574219, "rewards/margins": 0.17333553731441498, "rewards/rejected": -0.903724193572998, "step": 650 }, { "epoch": 0.7010684552900334, "eval_logits/chosen": -1.7363975048065186, "eval_logits/rejected": -1.7971389293670654, "eval_logps/chosen": -564.5283813476562, "eval_logps/rejected": -512.2813110351562, "eval_loss": 0.6501194834709167, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -0.8151662349700928, "eval_rewards/margins": 0.1483728587627411, "eval_rewards/rejected": -0.9635391235351562, "eval_runtime": 12.8651, "eval_samples_per_second": 7.773, "eval_steps_per_second": 1.01, "step": 650 }, { "epoch": 0.711854123832957, "grad_norm": 21.0, "learning_rate": 2.2898706896551725e-06, "logits/chosen": -1.6629375219345093, "logits/rejected": -1.686265230178833, "logps/chosen": -579.2559814453125, "logps/rejected": -550.0233764648438, "loss": 0.6339, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7081414461135864, "rewards/margins": 0.19901154935359955, "rewards/rejected": -0.9071530103683472, "step": 660 }, { "epoch": 0.7226397923758805, "grad_norm": 24.25, "learning_rate": 2.279094827586207e-06, "logits/chosen": -1.772698998451233, "logits/rejected": -1.8019428253173828, "logps/chosen": -505.48577880859375, "logps/rejected": -490.10662841796875, "loss": 0.6425, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.7496585249900818, "rewards/margins": 0.17492921650409698, "rewards/rejected": -0.9245878458023071, "step": 670 }, { "epoch": 0.7334254609188041, "grad_norm": 23.125, "learning_rate": 2.2683189655172415e-06, "logits/chosen": -1.7226699590682983, "logits/rejected": -1.7439358234405518, "logps/chosen": -551.814208984375, "logps/rejected": -530.10888671875, "loss": 0.6147, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.7403059601783752, "rewards/margins": 0.22587330639362335, "rewards/rejected": -0.966179370880127, "step": 680 }, { "epoch": 0.7442111294617277, "grad_norm": 19.625, "learning_rate": 2.257543103448276e-06, "logits/chosen": -1.7711864709854126, "logits/rejected": -1.7668606042861938, "logps/chosen": -540.2945556640625, "logps/rejected": -520.48828125, "loss": 0.6112, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.7231726050376892, "rewards/margins": 0.24855723977088928, "rewards/rejected": -0.9717298746109009, "step": 690 }, { "epoch": 0.7549967980046514, "grad_norm": 22.375, "learning_rate": 2.24676724137931e-06, "logits/chosen": -1.7296479940414429, "logits/rejected": -1.7336111068725586, "logps/chosen": -525.4596557617188, "logps/rejected": -499.53826904296875, "loss": 0.644, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.7867623567581177, "rewards/margins": 0.18225276470184326, "rewards/rejected": -0.9690152406692505, "step": 700 }, { "epoch": 0.7549967980046514, "eval_logits/chosen": -1.7320126295089722, "eval_logits/rejected": -1.7923235893249512, "eval_logps/chosen": -565.2008056640625, "eval_logps/rejected": -512.877197265625, "eval_loss": 0.6524822115898132, "eval_rewards/accuracies": 0.6153846383094788, "eval_rewards/chosen": -0.8487869501113892, "eval_rewards/margins": 0.1445501744747162, "eval_rewards/rejected": -0.9933372139930725, "eval_runtime": 12.8132, "eval_samples_per_second": 7.804, "eval_steps_per_second": 1.015, "step": 700 }, { "epoch": 0.7657824665475749, "grad_norm": 36.25, "learning_rate": 2.235991379310345e-06, "logits/chosen": -1.6764347553253174, "logits/rejected": -1.6931612491607666, "logps/chosen": -555.44091796875, "logps/rejected": -529.9513549804688, "loss": 0.6177, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.7847446203231812, "rewards/margins": 0.23240776360034943, "rewards/rejected": -1.0171524286270142, "step": 710 }, { "epoch": 0.7765681350904985, "grad_norm": 21.375, "learning_rate": 2.225215517241379e-06, "logits/chosen": -1.7237228155136108, "logits/rejected": -1.7398170232772827, "logps/chosen": -542.1912231445312, "logps/rejected": -518.4722900390625, "loss": 0.6354, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.7876893281936646, "rewards/margins": 0.1978759765625, "rewards/rejected": -0.9855653643608093, "step": 720 }, { "epoch": 0.7873538036334221, "grad_norm": 23.0, "learning_rate": 2.214439655172414e-06, "logits/chosen": -1.752096176147461, "logits/rejected": -1.7941181659698486, "logps/chosen": -548.2041015625, "logps/rejected": -513.8216552734375, "loss": 0.6555, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.7717311978340149, "rewards/margins": 0.14397189021110535, "rewards/rejected": -0.9157029986381531, "step": 730 }, { "epoch": 0.7981394721763457, "grad_norm": 19.75, "learning_rate": 2.203663793103448e-06, "logits/chosen": -1.709242820739746, "logits/rejected": -1.7145198583602905, "logps/chosen": -539.0947265625, "logps/rejected": -521.4993896484375, "loss": 0.6394, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.7486387491226196, "rewards/margins": 0.18467199802398682, "rewards/rejected": -0.933310866355896, "step": 740 }, { "epoch": 0.8089251407192692, "grad_norm": 23.875, "learning_rate": 2.192887931034483e-06, "logits/chosen": -1.632927656173706, "logits/rejected": -1.6515719890594482, "logps/chosen": -559.2658081054688, "logps/rejected": -526.0423583984375, "loss": 0.6322, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.7207778692245483, "rewards/margins": 0.19092032313346863, "rewards/rejected": -0.911698043346405, "step": 750 }, { "epoch": 0.8089251407192692, "eval_logits/chosen": -1.717968463897705, "eval_logits/rejected": -1.7769808769226074, "eval_logps/chosen": -564.5967407226562, "eval_logps/rejected": -512.3129272460938, "eval_loss": 0.6494570970535278, "eval_rewards/accuracies": 0.6634615659713745, "eval_rewards/chosen": -0.8185831308364868, "eval_rewards/margins": 0.1465369611978531, "eval_rewards/rejected": -0.9651200175285339, "eval_runtime": 12.8208, "eval_samples_per_second": 7.8, "eval_steps_per_second": 1.014, "step": 750 }, { "epoch": 0.8197108092621929, "grad_norm": 22.25, "learning_rate": 2.182112068965517e-06, "logits/chosen": -1.7085649967193604, "logits/rejected": -1.7567522525787354, "logps/chosen": -565.7103271484375, "logps/rejected": -539.7507934570312, "loss": 0.626, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7772555351257324, "rewards/margins": 0.20999836921691895, "rewards/rejected": -0.9872539639472961, "step": 760 }, { "epoch": 0.8304964778051165, "grad_norm": 22.125, "learning_rate": 2.1713362068965515e-06, "logits/chosen": -1.7073513269424438, "logits/rejected": -1.7328789234161377, "logps/chosen": -536.3829345703125, "logps/rejected": -514.23779296875, "loss": 0.6285, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.7473222613334656, "rewards/margins": 0.20041945576667786, "rewards/rejected": -0.947741687297821, "step": 770 }, { "epoch": 0.84128214634804, "grad_norm": 21.75, "learning_rate": 2.160560344827586e-06, "logits/chosen": -1.6973340511322021, "logits/rejected": -1.691319465637207, "logps/chosen": -557.1148071289062, "logps/rejected": -558.3483276367188, "loss": 0.6268, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.7272000312805176, "rewards/margins": 0.21510732173919678, "rewards/rejected": -0.9423073530197144, "step": 780 }, { "epoch": 0.8520678148909636, "grad_norm": 20.875, "learning_rate": 2.1497844827586205e-06, "logits/chosen": -1.679863691329956, "logits/rejected": -1.6904770135879517, "logps/chosen": -543.8391723632812, "logps/rejected": -531.5890502929688, "loss": 0.625, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.756318986415863, "rewards/margins": 0.20582714676856995, "rewards/rejected": -0.9621461629867554, "step": 790 }, { "epoch": 0.8628534834338872, "grad_norm": 21.875, "learning_rate": 2.139008620689655e-06, "logits/chosen": -1.7281272411346436, "logits/rejected": -1.745347261428833, "logps/chosen": -547.1482543945312, "logps/rejected": -527.178466796875, "loss": 0.6278, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7604348063468933, "rewards/margins": 0.19645504653453827, "rewards/rejected": -0.9568899273872375, "step": 800 }, { "epoch": 0.8628534834338872, "eval_logits/chosen": -1.7295496463775635, "eval_logits/rejected": -1.7896690368652344, "eval_logps/chosen": -565.3712158203125, "eval_logps/rejected": -513.1586303710938, "eval_loss": 0.6495404243469238, "eval_rewards/accuracies": 0.6730769276618958, "eval_rewards/chosen": -0.8573046922683716, "eval_rewards/margins": 0.15010415017604828, "eval_rewards/rejected": -1.0074087381362915, "eval_runtime": 12.8304, "eval_samples_per_second": 7.794, "eval_steps_per_second": 1.013, "step": 800 }, { "epoch": 0.8736391519768109, "grad_norm": 24.0, "learning_rate": 2.1282327586206895e-06, "logits/chosen": -1.6808828115463257, "logits/rejected": -1.7251535654067993, "logps/chosen": -545.0057373046875, "logps/rejected": -514.2200317382812, "loss": 0.6473, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7858768701553345, "rewards/margins": 0.16710534691810608, "rewards/rejected": -0.9529821276664734, "step": 810 }, { "epoch": 0.8844248205197344, "grad_norm": 21.625, "learning_rate": 2.117456896551724e-06, "logits/chosen": -1.7713664770126343, "logits/rejected": -1.7678531408309937, "logps/chosen": -544.8568115234375, "logps/rejected": -516.0807495117188, "loss": 0.6342, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8098779916763306, "rewards/margins": 0.199488565325737, "rewards/rejected": -1.009366512298584, "step": 820 }, { "epoch": 0.895210489062658, "grad_norm": 22.375, "learning_rate": 2.106681034482759e-06, "logits/chosen": -1.7336772680282593, "logits/rejected": -1.7515900135040283, "logps/chosen": -549.8850708007812, "logps/rejected": -530.6832275390625, "loss": 0.6074, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7838916778564453, "rewards/margins": 0.2659648060798645, "rewards/rejected": -1.049856424331665, "step": 830 }, { "epoch": 0.9059961576055816, "grad_norm": 21.375, "learning_rate": 2.0959051724137932e-06, "logits/chosen": -1.7343952655792236, "logits/rejected": -1.7316901683807373, "logps/chosen": -574.8560791015625, "logps/rejected": -536.2384033203125, "loss": 0.6342, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.7904583811759949, "rewards/margins": 0.1979624629020691, "rewards/rejected": -0.988420844078064, "step": 840 }, { "epoch": 0.9167818261485051, "grad_norm": 21.5, "learning_rate": 2.085129310344828e-06, "logits/chosen": -1.733007788658142, "logits/rejected": -1.727246880531311, "logps/chosen": -524.6848754882812, "logps/rejected": -510.7845764160156, "loss": 0.6307, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.8132136464118958, "rewards/margins": 0.20207151770591736, "rewards/rejected": -1.0152852535247803, "step": 850 }, { "epoch": 0.9167818261485051, "eval_logits/chosen": -1.724961757659912, "eval_logits/rejected": -1.7844202518463135, "eval_logps/chosen": -565.8350830078125, "eval_logps/rejected": -513.8818969726562, "eval_loss": 0.6429941058158875, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -0.8805010318756104, "eval_rewards/margins": 0.16307112574577332, "eval_rewards/rejected": -1.043572187423706, "eval_runtime": 12.8678, "eval_samples_per_second": 7.771, "eval_steps_per_second": 1.01, "step": 850 }, { "epoch": 0.9275674946914287, "grad_norm": 21.0, "learning_rate": 2.0743534482758622e-06, "logits/chosen": -1.678770661354065, "logits/rejected": -1.6889280080795288, "logps/chosen": -549.0493774414062, "logps/rejected": -529.0235595703125, "loss": 0.6311, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.8022142648696899, "rewards/margins": 0.21385040879249573, "rewards/rejected": -1.0160646438598633, "step": 860 }, { "epoch": 0.9383531632343524, "grad_norm": 21.25, "learning_rate": 2.063577586206897e-06, "logits/chosen": -1.677046775817871, "logits/rejected": -1.715746521949768, "logps/chosen": -558.4188842773438, "logps/rejected": -526.5531616210938, "loss": 0.6388, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7892870903015137, "rewards/margins": 0.1829388290643692, "rewards/rejected": -0.9722259640693665, "step": 870 }, { "epoch": 0.949138831777276, "grad_norm": 22.25, "learning_rate": 2.0528017241379312e-06, "logits/chosen": -1.71084463596344, "logits/rejected": -1.7499706745147705, "logps/chosen": -547.9580078125, "logps/rejected": -525.6466674804688, "loss": 0.6215, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7694340348243713, "rewards/margins": 0.2342531681060791, "rewards/rejected": -1.0036872625350952, "step": 880 }, { "epoch": 0.9599245003201995, "grad_norm": 22.5, "learning_rate": 2.0420258620689655e-06, "logits/chosen": -1.7067375183105469, "logits/rejected": -1.7360172271728516, "logps/chosen": -553.2872924804688, "logps/rejected": -533.8543090820312, "loss": 0.6409, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.7895435690879822, "rewards/margins": 0.18897457420825958, "rewards/rejected": -0.9785181879997253, "step": 890 }, { "epoch": 0.9707101688631231, "grad_norm": 22.125, "learning_rate": 2.0312500000000002e-06, "logits/chosen": -1.7305114269256592, "logits/rejected": -1.733814001083374, "logps/chosen": -544.6137084960938, "logps/rejected": -523.6353149414062, "loss": 0.6191, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.786180317401886, "rewards/margins": 0.22723452746868134, "rewards/rejected": -1.013414978981018, "step": 900 }, { "epoch": 0.9707101688631231, "eval_logits/chosen": -1.7138627767562866, "eval_logits/rejected": -1.773409128189087, "eval_logps/chosen": -565.8074340820312, "eval_logps/rejected": -513.8848266601562, "eval_loss": 0.6439433097839355, "eval_rewards/accuracies": 0.6634615659713745, "eval_rewards/chosen": -0.8791185617446899, "eval_rewards/margins": 0.16459915041923523, "eval_rewards/rejected": -1.043717622756958, "eval_runtime": 12.8601, "eval_samples_per_second": 7.776, "eval_steps_per_second": 1.011, "step": 900 }, { "epoch": 0.9814958374060467, "grad_norm": 20.5, "learning_rate": 2.0204741379310345e-06, "logits/chosen": -1.7137806415557861, "logits/rejected": -1.714971899986267, "logps/chosen": -546.196533203125, "logps/rejected": -519.9130859375, "loss": 0.6336, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7919280529022217, "rewards/margins": 0.189212828874588, "rewards/rejected": -0.9811409115791321, "step": 910 }, { "epoch": 0.9922815059489704, "grad_norm": 21.375, "learning_rate": 2.0096982758620693e-06, "logits/chosen": -1.7518551349639893, "logits/rejected": -1.789482831954956, "logps/chosen": -539.8787841796875, "logps/rejected": -518.9136962890625, "loss": 0.6408, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8202158808708191, "rewards/margins": 0.1931016743183136, "rewards/rejected": -1.0133177042007446, "step": 920 }, { "epoch": 1.0021571337085846, "grad_norm": 22.125, "learning_rate": 1.9989224137931036e-06, "logits/chosen": -1.667576789855957, "logits/rejected": -1.6723417043685913, "logps/chosen": -560.371826171875, "logps/rejected": -535.334228515625, "loss": 0.6217, "rewards/accuracies": 0.6416382193565369, "rewards/chosen": -0.7891966700553894, "rewards/margins": 0.2126684933900833, "rewards/rejected": -1.0018651485443115, "step": 930 }, { "epoch": 1.0129428022515083, "grad_norm": 20.125, "learning_rate": 1.9881465517241383e-06, "logits/chosen": -1.6589832305908203, "logits/rejected": -1.6481475830078125, "logps/chosen": -539.683837890625, "logps/rejected": -524.0894165039062, "loss": 0.6041, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": -0.7246257662773132, "rewards/margins": 0.25880277156829834, "rewards/rejected": -0.9834285974502563, "step": 940 }, { "epoch": 1.0237284707944319, "grad_norm": 20.25, "learning_rate": 1.9773706896551726e-06, "logits/chosen": -1.7167028188705444, "logits/rejected": -1.722948431968689, "logps/chosen": -543.1702270507812, "logps/rejected": -523.4526977539062, "loss": 0.6061, "rewards/accuracies": 0.6859375238418579, "rewards/chosen": -0.7710289359092712, "rewards/margins": 0.25660115480422974, "rewards/rejected": -1.027630090713501, "step": 950 }, { "epoch": 1.0237284707944319, "eval_logits/chosen": -1.7156367301940918, "eval_logits/rejected": -1.7743936777114868, "eval_logps/chosen": -566.1229248046875, "eval_logps/rejected": -514.0870971679688, "eval_loss": 0.6457732319831848, "eval_rewards/accuracies": 0.6730769276618958, "eval_rewards/chosen": -0.8948926329612732, "eval_rewards/margins": 0.15893827378749847, "eval_rewards/rejected": -1.053830862045288, "eval_runtime": 12.8266, "eval_samples_per_second": 7.796, "eval_steps_per_second": 1.014, "step": 950 }, { "epoch": 1.0345141393373556, "grad_norm": 21.875, "learning_rate": 1.966594827586207e-06, "logits/chosen": -1.7188589572906494, "logits/rejected": -1.7054466009140015, "logps/chosen": -549.7354736328125, "logps/rejected": -536.328125, "loss": 0.6156, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.779514729976654, "rewards/margins": 0.24889163672924042, "rewards/rejected": -1.0284063816070557, "step": 960 }, { "epoch": 1.045299807880279, "grad_norm": 19.875, "learning_rate": 1.9558189655172416e-06, "logits/chosen": -1.6784546375274658, "logits/rejected": -1.6850860118865967, "logps/chosen": -554.5667724609375, "logps/rejected": -547.0436401367188, "loss": 0.5838, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.7458912134170532, "rewards/margins": 0.3095359802246094, "rewards/rejected": -1.0554273128509521, "step": 970 }, { "epoch": 1.0560854764232026, "grad_norm": 19.25, "learning_rate": 1.945043103448276e-06, "logits/chosen": -1.7073291540145874, "logits/rejected": -1.7111352682113647, "logps/chosen": -538.4148559570312, "logps/rejected": -526.5668334960938, "loss": 0.6054, "rewards/accuracies": 0.6890624761581421, "rewards/chosen": -0.7842422127723694, "rewards/margins": 0.26182490587234497, "rewards/rejected": -1.0460671186447144, "step": 980 }, { "epoch": 1.0668711449661263, "grad_norm": 22.125, "learning_rate": 1.9342672413793106e-06, "logits/chosen": -1.6965501308441162, "logits/rejected": -1.7186329364776611, "logps/chosen": -569.547119140625, "logps/rejected": -548.1654052734375, "loss": 0.5954, "rewards/accuracies": 0.723437488079071, "rewards/chosen": -0.7711578607559204, "rewards/margins": 0.2901317775249481, "rewards/rejected": -1.0612895488739014, "step": 990 }, { "epoch": 1.0776568135090498, "grad_norm": 20.75, "learning_rate": 1.923491379310345e-06, "logits/chosen": -1.7303965091705322, "logits/rejected": -1.7479015588760376, "logps/chosen": -535.7818603515625, "logps/rejected": -525.8051147460938, "loss": 0.6085, "rewards/accuracies": 0.671875, "rewards/chosen": -0.8089873194694519, "rewards/margins": 0.2565574049949646, "rewards/rejected": -1.065544605255127, "step": 1000 }, { "epoch": 1.0776568135090498, "eval_logits/chosen": -1.7369959354400635, "eval_logits/rejected": -1.796852946281433, "eval_logps/chosen": -566.9774169921875, "eval_logps/rejected": -515.3145751953125, "eval_loss": 0.6404527425765991, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -0.9376153349876404, "eval_rewards/margins": 0.1775885373353958, "eval_rewards/rejected": -1.115203857421875, "eval_runtime": 12.8545, "eval_samples_per_second": 7.779, "eval_steps_per_second": 1.011, "step": 1000 }, { "epoch": 1.0884424820519734, "grad_norm": 19.75, "learning_rate": 1.9127155172413796e-06, "logits/chosen": -1.7162755727767944, "logits/rejected": -1.7419646978378296, "logps/chosen": -533.6424560546875, "logps/rejected": -518.5228881835938, "loss": 0.5936, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.7938119173049927, "rewards/margins": 0.28399401903152466, "rewards/rejected": -1.0778058767318726, "step": 1010 }, { "epoch": 1.099228150594897, "grad_norm": 20.75, "learning_rate": 1.9019396551724139e-06, "logits/chosen": -1.6942980289459229, "logits/rejected": -1.707862138748169, "logps/chosen": -572.4893798828125, "logps/rejected": -546.4171142578125, "loss": 0.5999, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.8284515142440796, "rewards/margins": 0.2772662043571472, "rewards/rejected": -1.1057178974151611, "step": 1020 }, { "epoch": 1.1100138191378206, "grad_norm": 21.25, "learning_rate": 1.8911637931034484e-06, "logits/chosen": -1.670231580734253, "logits/rejected": -1.6834871768951416, "logps/chosen": -534.8599243164062, "logps/rejected": -513.09375, "loss": 0.618, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.8424933552742004, "rewards/margins": 0.24433858692646027, "rewards/rejected": -1.08683180809021, "step": 1030 }, { "epoch": 1.120799487680744, "grad_norm": 21.5, "learning_rate": 1.880387931034483e-06, "logits/chosen": -1.6783840656280518, "logits/rejected": -1.6901963949203491, "logps/chosen": -579.30322265625, "logps/rejected": -550.1730346679688, "loss": 0.5868, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.7912231087684631, "rewards/margins": 0.30552273988723755, "rewards/rejected": -1.0967457294464111, "step": 1040 }, { "epoch": 1.1315851562236678, "grad_norm": 20.75, "learning_rate": 1.8696120689655172e-06, "logits/chosen": -1.7412052154541016, "logits/rejected": -1.7302591800689697, "logps/chosen": -552.0925903320312, "logps/rejected": -544.4852294921875, "loss": 0.5885, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -0.780128002166748, "rewards/margins": 0.3037031292915344, "rewards/rejected": -1.0838311910629272, "step": 1050 }, { "epoch": 1.1315851562236678, "eval_logits/chosen": -1.7486636638641357, "eval_logits/rejected": -1.8092385530471802, "eval_logps/chosen": -567.4676513671875, "eval_logps/rejected": -515.6458740234375, "eval_loss": 0.6456284523010254, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -0.9621263742446899, "eval_rewards/margins": 0.16964147984981537, "eval_rewards/rejected": -1.1317678689956665, "eval_runtime": 12.8512, "eval_samples_per_second": 7.781, "eval_steps_per_second": 1.012, "step": 1050 }, { "epoch": 1.1423708247665914, "grad_norm": 20.625, "learning_rate": 1.8588362068965517e-06, "logits/chosen": -1.6765000820159912, "logits/rejected": -1.6997390985488892, "logps/chosen": -557.3540649414062, "logps/rejected": -541.9246826171875, "loss": 0.5946, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": -0.7725203037261963, "rewards/margins": 0.28744369745254517, "rewards/rejected": -1.0599639415740967, "step": 1060 }, { "epoch": 1.153156493309515, "grad_norm": 20.0, "learning_rate": 1.8480603448275862e-06, "logits/chosen": -1.7148462533950806, "logits/rejected": -1.6918646097183228, "logps/chosen": -542.8770751953125, "logps/rejected": -535.9141845703125, "loss": 0.6051, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.818956196308136, "rewards/margins": 0.269645631313324, "rewards/rejected": -1.08860182762146, "step": 1070 }, { "epoch": 1.1639421618524386, "grad_norm": 21.375, "learning_rate": 1.8372844827586207e-06, "logits/chosen": -1.736419677734375, "logits/rejected": -1.751819372177124, "logps/chosen": -533.5783081054688, "logps/rejected": -524.98046875, "loss": 0.5886, "rewards/accuracies": 0.723437488079071, "rewards/chosen": -0.8092219233512878, "rewards/margins": 0.3041132390499115, "rewards/rejected": -1.1133352518081665, "step": 1080 }, { "epoch": 1.174727830395362, "grad_norm": 24.875, "learning_rate": 1.8265086206896552e-06, "logits/chosen": -1.794930100440979, "logits/rejected": -1.8180984258651733, "logps/chosen": -542.23583984375, "logps/rejected": -527.9422607421875, "loss": 0.5897, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.8398362994194031, "rewards/margins": 0.3197947144508362, "rewards/rejected": -1.1596310138702393, "step": 1090 }, { "epoch": 1.1855134989382858, "grad_norm": 19.75, "learning_rate": 1.8157327586206897e-06, "logits/chosen": -1.7432777881622314, "logits/rejected": -1.7690292596817017, "logps/chosen": -536.7589721679688, "logps/rejected": -513.2578735351562, "loss": 0.5885, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8450536727905273, "rewards/margins": 0.3091221749782562, "rewards/rejected": -1.154175877571106, "step": 1100 }, { "epoch": 1.1855134989382858, "eval_logits/chosen": -1.764207124710083, "eval_logits/rejected": -1.8264764547348022, "eval_logps/chosen": -568.4033203125, "eval_logps/rejected": -516.8681030273438, "eval_loss": 0.6412051320075989, "eval_rewards/accuracies": 0.6634615659713745, "eval_rewards/chosen": -1.0089114904403687, "eval_rewards/margins": 0.18396909534931183, "eval_rewards/rejected": -1.192880630493164, "eval_runtime": 12.9704, "eval_samples_per_second": 7.71, "eval_steps_per_second": 1.002, "step": 1100 }, { "epoch": 1.1962991674812093, "grad_norm": 19.625, "learning_rate": 1.8049568965517242e-06, "logits/chosen": -1.739747405052185, "logits/rejected": -1.7718337774276733, "logps/chosen": -540.0535888671875, "logps/rejected": -523.0402221679688, "loss": 0.5792, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8350652456283569, "rewards/margins": 0.3389832377433777, "rewards/rejected": -1.1740485429763794, "step": 1110 }, { "epoch": 1.2070848360241329, "grad_norm": 23.75, "learning_rate": 1.7941810344827585e-06, "logits/chosen": -1.7355035543441772, "logits/rejected": -1.740517258644104, "logps/chosen": -553.8098754882812, "logps/rejected": -549.6728515625, "loss": 0.5757, "rewards/accuracies": 0.760937511920929, "rewards/chosen": -0.8573840856552124, "rewards/margins": 0.35040709376335144, "rewards/rejected": -1.2077910900115967, "step": 1120 }, { "epoch": 1.2178705045670566, "grad_norm": 20.25, "learning_rate": 1.783405172413793e-06, "logits/chosen": -1.743051528930664, "logits/rejected": -1.7518497705459595, "logps/chosen": -539.1353759765625, "logps/rejected": -531.3056640625, "loss": 0.5989, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": -0.8813650012016296, "rewards/margins": 0.28817206621170044, "rewards/rejected": -1.16953706741333, "step": 1130 }, { "epoch": 1.22865617310998, "grad_norm": 21.875, "learning_rate": 1.7726293103448275e-06, "logits/chosen": -1.780400037765503, "logits/rejected": -1.797412633895874, "logps/chosen": -538.9906005859375, "logps/rejected": -524.0303955078125, "loss": 0.6014, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.8999722599983215, "rewards/margins": 0.29530972242355347, "rewards/rejected": -1.1952821016311646, "step": 1140 }, { "epoch": 1.2394418416529036, "grad_norm": 21.375, "learning_rate": 1.761853448275862e-06, "logits/chosen": -1.7877986431121826, "logits/rejected": -1.7923310995101929, "logps/chosen": -525.5618286132812, "logps/rejected": -501.60028076171875, "loss": 0.591, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.9110702276229858, "rewards/margins": 0.30747610330581665, "rewards/rejected": -1.2185462713241577, "step": 1150 }, { "epoch": 1.2394418416529036, "eval_logits/chosen": -1.7822145223617554, "eval_logits/rejected": -1.844849705696106, "eval_logps/chosen": -569.3731689453125, "eval_logps/rejected": -517.9542846679688, "eval_loss": 0.6419017314910889, "eval_rewards/accuracies": 0.6634615659713745, "eval_rewards/chosen": -1.0574066638946533, "eval_rewards/margins": 0.18978391587734222, "eval_rewards/rejected": -1.2471905946731567, "eval_runtime": 12.8369, "eval_samples_per_second": 7.79, "eval_steps_per_second": 1.013, "step": 1150 }, { "epoch": 1.2502275101958273, "grad_norm": 20.375, "learning_rate": 1.7510775862068965e-06, "logits/chosen": -1.7190322875976562, "logits/rejected": -1.730328917503357, "logps/chosen": -524.3182373046875, "logps/rejected": -510.26251220703125, "loss": 0.5975, "rewards/accuracies": 0.682812511920929, "rewards/chosen": -0.8995493650436401, "rewards/margins": 0.28964298963546753, "rewards/rejected": -1.189192295074463, "step": 1160 }, { "epoch": 1.2610131787387509, "grad_norm": 19.875, "learning_rate": 1.740301724137931e-06, "logits/chosen": -1.7682945728302002, "logits/rejected": -1.7626434564590454, "logps/chosen": -545.8931884765625, "logps/rejected": -537.7764892578125, "loss": 0.584, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.8533053398132324, "rewards/margins": 0.31390124559402466, "rewards/rejected": -1.1672066450119019, "step": 1170 }, { "epoch": 1.2717988472816746, "grad_norm": 21.5, "learning_rate": 1.7295258620689656e-06, "logits/chosen": -1.7791554927825928, "logits/rejected": -1.8204240798950195, "logps/chosen": -569.207275390625, "logps/rejected": -549.9439697265625, "loss": 0.6009, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.8925979733467102, "rewards/margins": 0.3236784338951111, "rewards/rejected": -1.2162764072418213, "step": 1180 }, { "epoch": 1.282584515824598, "grad_norm": 20.125, "learning_rate": 1.7187499999999998e-06, "logits/chosen": -1.7999378442764282, "logits/rejected": -1.8201545476913452, "logps/chosen": -541.1659545898438, "logps/rejected": -517.34765625, "loss": 0.5867, "rewards/accuracies": 0.729687511920929, "rewards/chosen": -0.9191481471061707, "rewards/margins": 0.32020530104637146, "rewards/rejected": -1.2393534183502197, "step": 1190 }, { "epoch": 1.2933701843675216, "grad_norm": 24.125, "learning_rate": 1.7079741379310344e-06, "logits/chosen": -1.7037147283554077, "logits/rejected": -1.7427594661712646, "logps/chosen": -560.2366943359375, "logps/rejected": -532.6011962890625, "loss": 0.599, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.9538860321044922, "rewards/margins": 0.2922009825706482, "rewards/rejected": -1.2460869550704956, "step": 1200 }, { "epoch": 1.2933701843675216, "eval_logits/chosen": -1.790303349494934, "eval_logits/rejected": -1.853431224822998, "eval_logps/chosen": -569.9891967773438, "eval_logps/rejected": -518.47216796875, "eval_loss": 0.6446880102157593, "eval_rewards/accuracies": 0.682692289352417, "eval_rewards/chosen": -1.0882103443145752, "eval_rewards/margins": 0.18487460911273956, "eval_rewards/rejected": -1.273085117340088, "eval_runtime": 12.8511, "eval_samples_per_second": 7.781, "eval_steps_per_second": 1.012, "step": 1200 }, { "epoch": 1.3041558529104451, "grad_norm": 21.875, "learning_rate": 1.6971982758620689e-06, "logits/chosen": -1.737642526626587, "logits/rejected": -1.7478208541870117, "logps/chosen": -555.0159301757812, "logps/rejected": -539.2918701171875, "loss": 0.6072, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.9134560823440552, "rewards/margins": 0.280630499124527, "rewards/rejected": -1.1940864324569702, "step": 1210 }, { "epoch": 1.3149415214533688, "grad_norm": 21.75, "learning_rate": 1.6864224137931034e-06, "logits/chosen": -1.7710367441177368, "logits/rejected": -1.7797397375106812, "logps/chosen": -581.2069702148438, "logps/rejected": -565.8582153320312, "loss": 0.5837, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.9405378103256226, "rewards/margins": 0.3353542387485504, "rewards/rejected": -1.2758920192718506, "step": 1220 }, { "epoch": 1.3257271899962924, "grad_norm": 20.125, "learning_rate": 1.6756465517241379e-06, "logits/chosen": -1.7992254495620728, "logits/rejected": -1.8066284656524658, "logps/chosen": -534.38037109375, "logps/rejected": -526.4951782226562, "loss": 0.6098, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.9164705276489258, "rewards/margins": 0.28371772170066833, "rewards/rejected": -1.200188398361206, "step": 1230 }, { "epoch": 1.336512858539216, "grad_norm": 21.375, "learning_rate": 1.6648706896551724e-06, "logits/chosen": -1.718231201171875, "logits/rejected": -1.7256206274032593, "logps/chosen": -565.9593505859375, "logps/rejected": -551.340576171875, "loss": 0.5799, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9390344619750977, "rewards/margins": 0.33844050765037537, "rewards/rejected": -1.2774749994277954, "step": 1240 }, { "epoch": 1.3472985270821396, "grad_norm": 21.5, "learning_rate": 1.6540948275862069e-06, "logits/chosen": -1.8084981441497803, "logits/rejected": -1.8036025762557983, "logps/chosen": -536.2025756835938, "logps/rejected": -532.8880615234375, "loss": 0.5941, "rewards/accuracies": 0.698437511920929, "rewards/chosen": -0.9396790266036987, "rewards/margins": 0.3215208649635315, "rewards/rejected": -1.261199712753296, "step": 1250 }, { "epoch": 1.3472985270821396, "eval_logits/chosen": -1.7861320972442627, "eval_logits/rejected": -1.8498179912567139, "eval_logps/chosen": -569.7782592773438, "eval_logps/rejected": -518.2344970703125, "eval_loss": 0.6447553038597107, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -1.0776572227478027, "eval_rewards/margins": 0.1835414469242096, "eval_rewards/rejected": -1.2611987590789795, "eval_runtime": 12.8276, "eval_samples_per_second": 7.796, "eval_steps_per_second": 1.013, "step": 1250 }, { "epoch": 1.358084195625063, "grad_norm": 19.625, "learning_rate": 1.6433189655172412e-06, "logits/chosen": -1.674708604812622, "logits/rejected": -1.6811631917953491, "logps/chosen": -543.1943359375, "logps/rejected": -522.18505859375, "loss": 0.5817, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.8475159406661987, "rewards/margins": 0.33499693870544434, "rewards/rejected": -1.182512879371643, "step": 1260 }, { "epoch": 1.3688698641679868, "grad_norm": 21.125, "learning_rate": 1.6325431034482761e-06, "logits/chosen": -1.7509996891021729, "logits/rejected": -1.7384611368179321, "logps/chosen": -561.0062255859375, "logps/rejected": -544.8842163085938, "loss": 0.5948, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8584550619125366, "rewards/margins": 0.2931164801120758, "rewards/rejected": -1.15157151222229, "step": 1270 }, { "epoch": 1.3796555327109103, "grad_norm": 21.75, "learning_rate": 1.6217672413793106e-06, "logits/chosen": -1.7544962167739868, "logits/rejected": -1.7836391925811768, "logps/chosen": -558.3255615234375, "logps/rejected": -549.2784423828125, "loss": 0.594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.887876033782959, "rewards/margins": 0.2937227785587311, "rewards/rejected": -1.1815989017486572, "step": 1280 }, { "epoch": 1.390441201253834, "grad_norm": 20.5, "learning_rate": 1.6109913793103451e-06, "logits/chosen": -1.766278862953186, "logits/rejected": -1.765802025794983, "logps/chosen": -527.0892333984375, "logps/rejected": -512.3470458984375, "loss": 0.5855, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": -0.9098323583602905, "rewards/margins": 0.3318841755390167, "rewards/rejected": -1.2417166233062744, "step": 1290 }, { "epoch": 1.4012268697967576, "grad_norm": 22.25, "learning_rate": 1.6002155172413794e-06, "logits/chosen": -1.767046332359314, "logits/rejected": -1.7728359699249268, "logps/chosen": -531.3649291992188, "logps/rejected": -521.9031982421875, "loss": 0.5806, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9428930282592773, "rewards/margins": 0.34324660897254944, "rewards/rejected": -1.2861396074295044, "step": 1300 }, { "epoch": 1.4012268697967576, "eval_logits/chosen": -1.7951500415802002, "eval_logits/rejected": -1.859129786491394, "eval_logps/chosen": -570.1704711914062, "eval_logps/rejected": -518.6593017578125, "eval_loss": 0.645327091217041, "eval_rewards/accuracies": 0.6153846383094788, "eval_rewards/chosen": -1.0972670316696167, "eval_rewards/margins": 0.18517184257507324, "eval_rewards/rejected": -1.2824387550354004, "eval_runtime": 12.8396, "eval_samples_per_second": 7.788, "eval_steps_per_second": 1.012, "step": 1300 }, { "epoch": 1.412012538339681, "grad_norm": 20.875, "learning_rate": 1.589439655172414e-06, "logits/chosen": -1.7911393642425537, "logits/rejected": -1.798229455947876, "logps/chosen": -542.2491455078125, "logps/rejected": -538.056640625, "loss": 0.5839, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.9245232343673706, "rewards/margins": 0.33389586210250854, "rewards/rejected": -1.2584190368652344, "step": 1310 }, { "epoch": 1.4227982068826046, "grad_norm": 21.625, "learning_rate": 1.5786637931034484e-06, "logits/chosen": -1.773233413696289, "logits/rejected": -1.7785921096801758, "logps/chosen": -529.9383544921875, "logps/rejected": -518.8634643554688, "loss": 0.6115, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.9675337672233582, "rewards/margins": 0.279288113117218, "rewards/rejected": -1.2468219995498657, "step": 1320 }, { "epoch": 1.4335838754255283, "grad_norm": 22.125, "learning_rate": 1.567887931034483e-06, "logits/chosen": -1.7141765356063843, "logits/rejected": -1.7172152996063232, "logps/chosen": -541.616943359375, "logps/rejected": -534.3272705078125, "loss": 0.5887, "rewards/accuracies": 0.729687511920929, "rewards/chosen": -0.8925272226333618, "rewards/margins": 0.3137276768684387, "rewards/rejected": -1.2062549591064453, "step": 1330 }, { "epoch": 1.4443695439684519, "grad_norm": 20.5, "learning_rate": 1.5571120689655174e-06, "logits/chosen": -1.7413345575332642, "logits/rejected": -1.762304663658142, "logps/chosen": -568.6903076171875, "logps/rejected": -551.1282348632812, "loss": 0.5889, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.9233890771865845, "rewards/margins": 0.32237863540649414, "rewards/rejected": -1.2457677125930786, "step": 1340 }, { "epoch": 1.4551552125113756, "grad_norm": 21.625, "learning_rate": 1.546336206896552e-06, "logits/chosen": -1.7776886224746704, "logits/rejected": -1.7672054767608643, "logps/chosen": -537.7005004882812, "logps/rejected": -535.3060302734375, "loss": 0.6075, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.9527568817138672, "rewards/margins": 0.28692346811294556, "rewards/rejected": -1.239680528640747, "step": 1350 }, { "epoch": 1.4551552125113756, "eval_logits/chosen": -1.799996018409729, "eval_logits/rejected": -1.8631552457809448, "eval_logps/chosen": -570.1870727539062, "eval_logps/rejected": -518.773681640625, "eval_loss": 0.6434743404388428, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -1.0980980396270752, "eval_rewards/margins": 0.19006212055683136, "eval_rewards/rejected": -1.2881600856781006, "eval_runtime": 12.8518, "eval_samples_per_second": 7.781, "eval_steps_per_second": 1.012, "step": 1350 }, { "epoch": 1.465940881054299, "grad_norm": 22.0, "learning_rate": 1.5355603448275862e-06, "logits/chosen": -1.7860710620880127, "logits/rejected": -1.7912073135375977, "logps/chosen": -555.429931640625, "logps/rejected": -541.3480224609375, "loss": 0.5877, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": -0.9607505798339844, "rewards/margins": 0.33639881014823914, "rewards/rejected": -1.297149419784546, "step": 1360 }, { "epoch": 1.4767265495972226, "grad_norm": 20.125, "learning_rate": 1.5247844827586207e-06, "logits/chosen": -1.7754265069961548, "logits/rejected": -1.7940123081207275, "logps/chosen": -540.836181640625, "logps/rejected": -531.376953125, "loss": 0.5871, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -0.8988661766052246, "rewards/margins": 0.32094043493270874, "rewards/rejected": -1.2198066711425781, "step": 1370 }, { "epoch": 1.4875122181401463, "grad_norm": 21.25, "learning_rate": 1.5140086206896552e-06, "logits/chosen": -1.7347028255462646, "logits/rejected": -1.7507202625274658, "logps/chosen": -550.8851928710938, "logps/rejected": -533.6559448242188, "loss": 0.6083, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.8865949511528015, "rewards/margins": 0.271259605884552, "rewards/rejected": -1.157854437828064, "step": 1380 }, { "epoch": 1.4982978866830698, "grad_norm": 19.625, "learning_rate": 1.5032327586206897e-06, "logits/chosen": -1.815341591835022, "logits/rejected": -1.816318154335022, "logps/chosen": -524.8699951171875, "logps/rejected": -511.8518981933594, "loss": 0.5777, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9125308990478516, "rewards/margins": 0.3498198091983795, "rewards/rejected": -1.2623505592346191, "step": 1390 }, { "epoch": 1.5090835552259936, "grad_norm": 19.875, "learning_rate": 1.4924568965517243e-06, "logits/chosen": -1.7852647304534912, "logits/rejected": -1.799068808555603, "logps/chosen": -533.41455078125, "logps/rejected": -519.7373657226562, "loss": 0.5829, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.9632998704910278, "rewards/margins": 0.3437105119228363, "rewards/rejected": -1.3070104122161865, "step": 1400 }, { "epoch": 1.5090835552259936, "eval_logits/chosen": -1.7990481853485107, "eval_logits/rejected": -1.8625297546386719, "eval_logps/chosen": -570.4349975585938, "eval_logps/rejected": -519.1015014648438, "eval_loss": 0.6433591246604919, "eval_rewards/accuracies": 0.6346153616905212, "eval_rewards/chosen": -1.1104950904846191, "eval_rewards/margins": 0.19405566155910492, "eval_rewards/rejected": -1.3045506477355957, "eval_runtime": 12.8758, "eval_samples_per_second": 7.767, "eval_steps_per_second": 1.01, "step": 1400 }, { "epoch": 1.519869223768917, "grad_norm": 20.875, "learning_rate": 1.4816810344827588e-06, "logits/chosen": -1.7966508865356445, "logits/rejected": -1.8273112773895264, "logps/chosen": -548.8226318359375, "logps/rejected": -528.164306640625, "loss": 0.5799, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.9004064798355103, "rewards/margins": 0.3501775860786438, "rewards/rejected": -1.2505838871002197, "step": 1410 }, { "epoch": 1.5306548923118406, "grad_norm": 21.25, "learning_rate": 1.4709051724137933e-06, "logits/chosen": -1.7765086889266968, "logits/rejected": -1.8015928268432617, "logps/chosen": -549.3746948242188, "logps/rejected": -529.9730224609375, "loss": 0.5816, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9200440645217896, "rewards/margins": 0.33418479561805725, "rewards/rejected": -1.2542288303375244, "step": 1420 }, { "epoch": 1.541440560854764, "grad_norm": 20.0, "learning_rate": 1.4601293103448276e-06, "logits/chosen": -1.8374097347259521, "logits/rejected": -1.8128893375396729, "logps/chosen": -553.1683349609375, "logps/rejected": -540.2759399414062, "loss": 0.5902, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -0.964668869972229, "rewards/margins": 0.3389004170894623, "rewards/rejected": -1.3035693168640137, "step": 1430 }, { "epoch": 1.5522262293976878, "grad_norm": 22.375, "learning_rate": 1.449353448275862e-06, "logits/chosen": -1.828120231628418, "logits/rejected": -1.8493430614471436, "logps/chosen": -569.1422119140625, "logps/rejected": -556.3524780273438, "loss": 0.5997, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -1.006941795349121, "rewards/margins": 0.31565380096435547, "rewards/rejected": -1.3225957155227661, "step": 1440 }, { "epoch": 1.5630118979406116, "grad_norm": 22.0, "learning_rate": 1.4385775862068966e-06, "logits/chosen": -1.7910667657852173, "logits/rejected": -1.8110182285308838, "logps/chosen": -550.4522094726562, "logps/rejected": -535.9798583984375, "loss": 0.5846, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9523816108703613, "rewards/margins": 0.34083276987075806, "rewards/rejected": -1.2932144403457642, "step": 1450 }, { "epoch": 1.5630118979406116, "eval_logits/chosen": -1.8040180206298828, "eval_logits/rejected": -1.8676948547363281, "eval_logps/chosen": -570.4541015625, "eval_logps/rejected": -519.1150512695312, "eval_loss": 0.6397081613540649, "eval_rewards/accuracies": 0.682692289352417, "eval_rewards/chosen": -1.1114522218704224, "eval_rewards/margins": 0.19377943873405457, "eval_rewards/rejected": -1.3052315711975098, "eval_runtime": 12.8296, "eval_samples_per_second": 7.794, "eval_steps_per_second": 1.013, "step": 1450 }, { "epoch": 1.573797566483535, "grad_norm": 20.0, "learning_rate": 1.427801724137931e-06, "logits/chosen": -1.7791173458099365, "logits/rejected": -1.78426992893219, "logps/chosen": -552.9260864257812, "logps/rejected": -532.0748291015625, "loss": 0.5938, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": -0.9482347369194031, "rewards/margins": 0.3171328008174896, "rewards/rejected": -1.2653675079345703, "step": 1460 }, { "epoch": 1.5845832350264586, "grad_norm": 21.375, "learning_rate": 1.4170258620689656e-06, "logits/chosen": -1.8079183101654053, "logits/rejected": -1.8390960693359375, "logps/chosen": -544.3607177734375, "logps/rejected": -525.732421875, "loss": 0.5943, "rewards/accuracies": 0.698437511920929, "rewards/chosen": -0.9570525288581848, "rewards/margins": 0.3199313282966614, "rewards/rejected": -1.2769839763641357, "step": 1470 }, { "epoch": 1.595368903569382, "grad_norm": 21.375, "learning_rate": 1.40625e-06, "logits/chosen": -1.8212658166885376, "logits/rejected": -1.8325313329696655, "logps/chosen": -531.6943969726562, "logps/rejected": -520.8181762695312, "loss": 0.5909, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -0.9884265065193176, "rewards/margins": 0.3185192048549652, "rewards/rejected": -1.30694580078125, "step": 1480 }, { "epoch": 1.6061545721123056, "grad_norm": 22.625, "learning_rate": 1.3954741379310346e-06, "logits/chosen": -1.7863876819610596, "logits/rejected": -1.7843122482299805, "logps/chosen": -527.3268432617188, "logps/rejected": -513.3717041015625, "loss": 0.5917, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.9644559025764465, "rewards/margins": 0.3113047182559967, "rewards/rejected": -1.2757607698440552, "step": 1490 }, { "epoch": 1.6169402406552293, "grad_norm": 21.625, "learning_rate": 1.3846982758620689e-06, "logits/chosen": -1.7883723974227905, "logits/rejected": -1.7981631755828857, "logps/chosen": -559.0615234375, "logps/rejected": -548.7954711914062, "loss": 0.5805, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -0.943580150604248, "rewards/margins": 0.36219388246536255, "rewards/rejected": -1.3057740926742554, "step": 1500 }, { "epoch": 1.6169402406552293, "eval_logits/chosen": -1.805317759513855, "eval_logits/rejected": -1.8692779541015625, "eval_logps/chosen": -570.7977905273438, "eval_logps/rejected": -519.4142456054688, "eval_loss": 0.6438877582550049, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -1.1286320686340332, "eval_rewards/margins": 0.19155457615852356, "eval_rewards/rejected": -1.320186734199524, "eval_runtime": 12.8603, "eval_samples_per_second": 7.776, "eval_steps_per_second": 1.011, "step": 1500 }, { "epoch": 1.627725909198153, "grad_norm": 21.375, "learning_rate": 1.3739224137931034e-06, "logits/chosen": -1.7962541580200195, "logits/rejected": -1.8288599252700806, "logps/chosen": -543.5225219726562, "logps/rejected": -517.6636962890625, "loss": 0.6067, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9828208088874817, "rewards/margins": 0.28606343269348145, "rewards/rejected": -1.2688841819763184, "step": 1510 }, { "epoch": 1.6385115777410766, "grad_norm": 22.25, "learning_rate": 1.363146551724138e-06, "logits/chosen": -1.7885265350341797, "logits/rejected": -1.8046941757202148, "logps/chosen": -537.9309692382812, "logps/rejected": -522.3575439453125, "loss": 0.6148, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9945136904716492, "rewards/margins": 0.2747846841812134, "rewards/rejected": -1.2692983150482178, "step": 1520 }, { "epoch": 1.649297246284, "grad_norm": 21.375, "learning_rate": 1.3523706896551724e-06, "logits/chosen": -1.8596391677856445, "logits/rejected": -1.880578637123108, "logps/chosen": -543.6522827148438, "logps/rejected": -529.7686767578125, "loss": 0.5866, "rewards/accuracies": 0.698437511920929, "rewards/chosen": -0.9741571545600891, "rewards/margins": 0.33137011528015137, "rewards/rejected": -1.3055272102355957, "step": 1530 }, { "epoch": 1.6600829148269236, "grad_norm": 21.375, "learning_rate": 1.341594827586207e-06, "logits/chosen": -1.7963918447494507, "logits/rejected": -1.8010103702545166, "logps/chosen": -547.7319946289062, "logps/rejected": -522.956298828125, "loss": 0.5891, "rewards/accuracies": 0.703125, "rewards/chosen": -0.9565073251724243, "rewards/margins": 0.31486618518829346, "rewards/rejected": -1.2713735103607178, "step": 1540 }, { "epoch": 1.6708685833698473, "grad_norm": 20.25, "learning_rate": 1.3308189655172414e-06, "logits/chosen": -1.8648087978363037, "logits/rejected": -1.8721832036972046, "logps/chosen": -559.5469360351562, "logps/rejected": -548.2471313476562, "loss": 0.5717, "rewards/accuracies": 0.745312511920929, "rewards/chosen": -0.9510286450386047, "rewards/margins": 0.36009079217910767, "rewards/rejected": -1.3111193180084229, "step": 1550 }, { "epoch": 1.6708685833698473, "eval_logits/chosen": -1.806795597076416, "eval_logits/rejected": -1.871440052986145, "eval_logps/chosen": -570.8436889648438, "eval_logps/rejected": -519.555908203125, "eval_loss": 0.641202986240387, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -1.130926489830017, "eval_rewards/margins": 0.19634580612182617, "eval_rewards/rejected": -1.3272722959518433, "eval_runtime": 12.8672, "eval_samples_per_second": 7.772, "eval_steps_per_second": 1.01, "step": 1550 }, { "epoch": 1.681654251912771, "grad_norm": 23.25, "learning_rate": 1.320043103448276e-06, "logits/chosen": -1.7812159061431885, "logits/rejected": -1.7934643030166626, "logps/chosen": -552.8800048828125, "logps/rejected": -549.5223999023438, "loss": 0.5768, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.9710870981216431, "rewards/margins": 0.3766544461250305, "rewards/rejected": -1.3477414846420288, "step": 1560 }, { "epoch": 1.6924399204556946, "grad_norm": 19.875, "learning_rate": 1.3092672413793102e-06, "logits/chosen": -1.8353354930877686, "logits/rejected": -1.8481401205062866, "logps/chosen": -519.0048217773438, "logps/rejected": -509.49591064453125, "loss": 0.5817, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9764666557312012, "rewards/margins": 0.3405064642429352, "rewards/rejected": -1.316973090171814, "step": 1570 }, { "epoch": 1.703225588998618, "grad_norm": 20.375, "learning_rate": 1.2984913793103447e-06, "logits/chosen": -1.7921619415283203, "logits/rejected": -1.804537057876587, "logps/chosen": -537.7366943359375, "logps/rejected": -510.7959899902344, "loss": 0.5992, "rewards/accuracies": 0.676562488079071, "rewards/chosen": -0.9829401969909668, "rewards/margins": 0.31215885281562805, "rewards/rejected": -1.2950990200042725, "step": 1580 }, { "epoch": 1.7140112575415416, "grad_norm": 21.875, "learning_rate": 1.2877155172413794e-06, "logits/chosen": -1.8133258819580078, "logits/rejected": -1.8023666143417358, "logps/chosen": -543.982666015625, "logps/rejected": -542.6021728515625, "loss": 0.5781, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.9853167533874512, "rewards/margins": 0.36080607771873474, "rewards/rejected": -1.3461229801177979, "step": 1590 }, { "epoch": 1.724796926084465, "grad_norm": 21.25, "learning_rate": 1.276939655172414e-06, "logits/chosen": -1.8179728984832764, "logits/rejected": -1.8313579559326172, "logps/chosen": -551.9962158203125, "logps/rejected": -533.030029296875, "loss": 0.5797, "rewards/accuracies": 0.698437511920929, "rewards/chosen": -1.0232875347137451, "rewards/margins": 0.3661686182022095, "rewards/rejected": -1.3894561529159546, "step": 1600 }, { "epoch": 1.724796926084465, "eval_logits/chosen": -1.80728280544281, "eval_logits/rejected": -1.871204137802124, "eval_logps/chosen": -571.1751098632812, "eval_logps/rejected": -519.9603881835938, "eval_loss": 0.6392231583595276, "eval_rewards/accuracies": 0.6634615659713745, "eval_rewards/chosen": -1.1474987268447876, "eval_rewards/margins": 0.19999684393405914, "eval_rewards/rejected": -1.347495436668396, "eval_runtime": 12.8417, "eval_samples_per_second": 7.787, "eval_steps_per_second": 1.012, "step": 1600 }, { "epoch": 1.7355825946273888, "grad_norm": 23.75, "learning_rate": 1.2661637931034484e-06, "logits/chosen": -1.7909305095672607, "logits/rejected": -1.8273541927337646, "logps/chosen": -543.5694580078125, "logps/rejected": -525.0535888671875, "loss": 0.6002, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9681240320205688, "rewards/margins": 0.2939276099205017, "rewards/rejected": -1.2620514631271362, "step": 1610 }, { "epoch": 1.7463682631703126, "grad_norm": 20.875, "learning_rate": 1.255387931034483e-06, "logits/chosen": -1.8298015594482422, "logits/rejected": -1.8496816158294678, "logps/chosen": -559.7938232421875, "logps/rejected": -542.3406982421875, "loss": 0.5936, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -1.017887830734253, "rewards/margins": 0.32336899638175964, "rewards/rejected": -1.341256856918335, "step": 1620 }, { "epoch": 1.757153931713236, "grad_norm": 20.75, "learning_rate": 1.2446120689655172e-06, "logits/chosen": -1.795771598815918, "logits/rejected": -1.8096462488174438, "logps/chosen": -550.1193237304688, "logps/rejected": -526.5523071289062, "loss": 0.5825, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.9442169070243835, "rewards/margins": 0.3580732047557831, "rewards/rejected": -1.3022902011871338, "step": 1630 }, { "epoch": 1.7679396002561596, "grad_norm": 21.625, "learning_rate": 1.2338362068965518e-06, "logits/chosen": -1.7974588871002197, "logits/rejected": -1.820927381515503, "logps/chosen": -539.9898681640625, "logps/rejected": -533.06689453125, "loss": 0.5873, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": -0.9383485913276672, "rewards/margins": 0.336906760931015, "rewards/rejected": -1.2752554416656494, "step": 1640 }, { "epoch": 1.778725268799083, "grad_norm": 20.25, "learning_rate": 1.2230603448275863e-06, "logits/chosen": -1.7889001369476318, "logits/rejected": -1.7788540124893188, "logps/chosen": -549.4357299804688, "logps/rejected": -537.9326171875, "loss": 0.5948, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9701086282730103, "rewards/margins": 0.31890323758125305, "rewards/rejected": -1.289011836051941, "step": 1650 }, { "epoch": 1.778725268799083, "eval_logits/chosen": -1.808042049407959, "eval_logits/rejected": -1.8723417520523071, "eval_logps/chosen": -570.9359741210938, "eval_logps/rejected": -519.9014892578125, "eval_loss": 0.6358182430267334, "eval_rewards/accuracies": 0.6730769276618958, "eval_rewards/chosen": -1.1355457305908203, "eval_rewards/margins": 0.2090054303407669, "eval_rewards/rejected": -1.3445510864257812, "eval_runtime": 12.9132, "eval_samples_per_second": 7.744, "eval_steps_per_second": 1.007, "step": 1650 }, { "epoch": 1.7895109373420068, "grad_norm": 19.75, "learning_rate": 1.2122844827586208e-06, "logits/chosen": -1.8406860828399658, "logits/rejected": -1.8749334812164307, "logps/chosen": -565.1243896484375, "logps/rejected": -564.97119140625, "loss": 0.5715, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0176526308059692, "rewards/margins": 0.35605573654174805, "rewards/rejected": -1.3737082481384277, "step": 1660 }, { "epoch": 1.8002966058849303, "grad_norm": 23.125, "learning_rate": 1.2015086206896553e-06, "logits/chosen": -1.8156760931015015, "logits/rejected": -1.821239709854126, "logps/chosen": -532.1543579101562, "logps/rejected": -519.3363037109375, "loss": 0.5788, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0197910070419312, "rewards/margins": 0.367012083530426, "rewards/rejected": -1.3868030309677124, "step": 1670 }, { "epoch": 1.811082274427854, "grad_norm": 21.5, "learning_rate": 1.1907327586206898e-06, "logits/chosen": -1.7746264934539795, "logits/rejected": -1.7720403671264648, "logps/chosen": -531.2168579101562, "logps/rejected": -520.6815185546875, "loss": 0.594, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0364903211593628, "rewards/margins": 0.32573047280311584, "rewards/rejected": -1.3622207641601562, "step": 1680 }, { "epoch": 1.8218679429707776, "grad_norm": 21.5, "learning_rate": 1.179956896551724e-06, "logits/chosen": -1.8178173303604126, "logits/rejected": -1.8478190898895264, "logps/chosen": -531.7720947265625, "logps/rejected": -516.5214233398438, "loss": 0.6073, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -1.040016531944275, "rewards/margins": 0.2878865599632263, "rewards/rejected": -1.3279030323028564, "step": 1690 }, { "epoch": 1.832653611513701, "grad_norm": 21.875, "learning_rate": 1.1691810344827586e-06, "logits/chosen": -1.8465070724487305, "logits/rejected": -1.8316357135772705, "logps/chosen": -533.6699829101562, "logps/rejected": -520.6783447265625, "loss": 0.5861, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.9806226491928101, "rewards/margins": 0.33271270990371704, "rewards/rejected": -1.313335657119751, "step": 1700 }, { "epoch": 1.832653611513701, "eval_logits/chosen": -1.8129793405532837, "eval_logits/rejected": -1.8767609596252441, "eval_logps/chosen": -571.2252197265625, "eval_logps/rejected": -519.8920288085938, "eval_loss": 0.6430386900901794, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -1.1500080823898315, "eval_rewards/margins": 0.19407115876674652, "eval_rewards/rejected": -1.3440792560577393, "eval_runtime": 12.8277, "eval_samples_per_second": 7.796, "eval_steps_per_second": 1.013, "step": 1700 }, { "epoch": 1.8434392800566246, "grad_norm": 20.125, "learning_rate": 1.158405172413793e-06, "logits/chosen": -1.769142508506775, "logits/rejected": -1.7703624963760376, "logps/chosen": -527.4718627929688, "logps/rejected": -509.33013916015625, "loss": 0.5793, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -0.9673894643783569, "rewards/margins": 0.36447519063949585, "rewards/rejected": -1.3318647146224976, "step": 1710 }, { "epoch": 1.8542249485995483, "grad_norm": 21.0, "learning_rate": 1.1476293103448276e-06, "logits/chosen": -1.7493419647216797, "logits/rejected": -1.740152359008789, "logps/chosen": -551.5736083984375, "logps/rejected": -546.8021240234375, "loss": 0.5859, "rewards/accuracies": 0.7109375, "rewards/chosen": -1.000551700592041, "rewards/margins": 0.3465580344200134, "rewards/rejected": -1.3471099138259888, "step": 1720 }, { "epoch": 1.865010617142472, "grad_norm": 22.0, "learning_rate": 1.136853448275862e-06, "logits/chosen": -1.795250654220581, "logits/rejected": -1.8162511587142944, "logps/chosen": -535.1650390625, "logps/rejected": -517.9396362304688, "loss": 0.5948, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -1.001084566116333, "rewards/margins": 0.31550517678260803, "rewards/rejected": -1.3165898323059082, "step": 1730 }, { "epoch": 1.8757962856853956, "grad_norm": 20.375, "learning_rate": 1.1260775862068966e-06, "logits/chosen": -1.8540292978286743, "logits/rejected": -1.866869330406189, "logps/chosen": -534.6907958984375, "logps/rejected": -514.3424072265625, "loss": 0.5831, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0445291996002197, "rewards/margins": 0.34380587935447693, "rewards/rejected": -1.3883349895477295, "step": 1740 }, { "epoch": 1.886581954228319, "grad_norm": 22.125, "learning_rate": 1.115301724137931e-06, "logits/chosen": -1.8096578121185303, "logits/rejected": -1.8162269592285156, "logps/chosen": -558.2589721679688, "logps/rejected": -544.1400146484375, "loss": 0.594, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0290067195892334, "rewards/margins": 0.3109704852104187, "rewards/rejected": -1.3399772644042969, "step": 1750 }, { "epoch": 1.886581954228319, "eval_logits/chosen": -1.8141746520996094, "eval_logits/rejected": -1.8783446550369263, "eval_logps/chosen": -571.4691772460938, "eval_logps/rejected": -520.206787109375, "eval_loss": 0.6430192589759827, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -1.1622072458267212, "eval_rewards/margins": 0.19761131703853607, "eval_rewards/rejected": -1.359818696975708, "eval_runtime": 12.8695, "eval_samples_per_second": 7.77, "eval_steps_per_second": 1.01, "step": 1750 }, { "epoch": 1.8973676227712426, "grad_norm": 21.0, "learning_rate": 1.1045258620689654e-06, "logits/chosen": -1.8399471044540405, "logits/rejected": -1.8462409973144531, "logps/chosen": -528.4866943359375, "logps/rejected": -521.9978637695312, "loss": 0.5739, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -0.996717095375061, "rewards/margins": 0.3896409571170807, "rewards/rejected": -1.3863580226898193, "step": 1760 }, { "epoch": 1.9081532913141663, "grad_norm": 20.375, "learning_rate": 1.09375e-06, "logits/chosen": -1.824342131614685, "logits/rejected": -1.851354956626892, "logps/chosen": -541.2124633789062, "logps/rejected": -521.5025024414062, "loss": 0.5939, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": -1.0108633041381836, "rewards/margins": 0.32563039660453796, "rewards/rejected": -1.3364938497543335, "step": 1770 }, { "epoch": 1.9189389598570898, "grad_norm": 21.375, "learning_rate": 1.0829741379310344e-06, "logits/chosen": -1.7720905542373657, "logits/rejected": -1.7949879169464111, "logps/chosen": -558.3751220703125, "logps/rejected": -534.5128173828125, "loss": 0.604, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.9803739786148071, "rewards/margins": 0.30562636256217957, "rewards/rejected": -1.2860002517700195, "step": 1780 }, { "epoch": 1.9297246284000136, "grad_norm": 22.0, "learning_rate": 1.072198275862069e-06, "logits/chosen": -1.8202836513519287, "logits/rejected": -1.830665946006775, "logps/chosen": -528.1467895507812, "logps/rejected": -520.9862060546875, "loss": 0.6036, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": -1.015037178993225, "rewards/margins": 0.2926950752735138, "rewards/rejected": -1.307732343673706, "step": 1790 }, { "epoch": 1.940510296942937, "grad_norm": 21.0, "learning_rate": 1.0614224137931034e-06, "logits/chosen": -1.8530925512313843, "logits/rejected": -1.8568578958511353, "logps/chosen": -514.7476196289062, "logps/rejected": -507.7210998535156, "loss": 0.6026, "rewards/accuracies": 0.682812511920929, "rewards/chosen": -1.0476781129837036, "rewards/margins": 0.30604255199432373, "rewards/rejected": -1.3537206649780273, "step": 1800 }, { "epoch": 1.940510296942937, "eval_logits/chosen": -1.8167260885238647, "eval_logits/rejected": -1.881202220916748, "eval_logps/chosen": -571.269775390625, "eval_logps/rejected": -520.2171630859375, "eval_loss": 0.6376249194145203, "eval_rewards/accuracies": 0.6346153616905212, "eval_rewards/chosen": -1.1522341966629028, "eval_rewards/margins": 0.20809604227542877, "eval_rewards/rejected": -1.3603301048278809, "eval_runtime": 12.8279, "eval_samples_per_second": 7.796, "eval_steps_per_second": 1.013, "step": 1800 }, { "epoch": 1.9512959654858606, "grad_norm": 19.875, "learning_rate": 1.0506465517241381e-06, "logits/chosen": -1.7945997714996338, "logits/rejected": -1.8242772817611694, "logps/chosen": -558.9090576171875, "logps/rejected": -544.4629516601562, "loss": 0.6057, "rewards/accuracies": 0.6953125, "rewards/chosen": -1.033698558807373, "rewards/margins": 0.2901691198348999, "rewards/rejected": -1.323867678642273, "step": 1810 }, { "epoch": 1.962081634028784, "grad_norm": 20.625, "learning_rate": 1.0398706896551724e-06, "logits/chosen": -1.763892412185669, "logits/rejected": -1.7766506671905518, "logps/chosen": -549.2996826171875, "logps/rejected": -538.8092041015625, "loss": 0.5675, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.9804860353469849, "rewards/margins": 0.3770001232624054, "rewards/rejected": -1.3574860095977783, "step": 1820 }, { "epoch": 1.9728673025717078, "grad_norm": 20.125, "learning_rate": 1.029094827586207e-06, "logits/chosen": -1.8451900482177734, "logits/rejected": -1.8786299228668213, "logps/chosen": -567.0401611328125, "logps/rejected": -549.7001953125, "loss": 0.5843, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": -0.9965025782585144, "rewards/margins": 0.3378158509731293, "rewards/rejected": -1.3343183994293213, "step": 1830 }, { "epoch": 1.9836529711146316, "grad_norm": 21.0, "learning_rate": 1.0183189655172414e-06, "logits/chosen": -1.8494704961776733, "logits/rejected": -1.8769400119781494, "logps/chosen": -552.8043823242188, "logps/rejected": -537.8701782226562, "loss": 0.5857, "rewards/accuracies": 0.703125, "rewards/chosen": -1.03322172164917, "rewards/margins": 0.4004359841346741, "rewards/rejected": -1.4336576461791992, "step": 1840 }, { "epoch": 1.994438639657555, "grad_norm": 24.25, "learning_rate": 1.007543103448276e-06, "logits/chosen": -1.8407787084579468, "logits/rejected": -1.8548295497894287, "logps/chosen": -557.3915405273438, "logps/rejected": -536.4610595703125, "loss": 0.6012, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -1.0534279346466064, "rewards/margins": 0.3128618001937866, "rewards/rejected": -1.3662898540496826, "step": 1850 }, { "epoch": 1.994438639657555, "eval_logits/chosen": -1.8164806365966797, "eval_logits/rejected": -1.8804469108581543, "eval_logps/chosen": -571.3651123046875, "eval_logps/rejected": -520.392333984375, "eval_loss": 0.6372777819633484, "eval_rewards/accuracies": 0.6730769276618958, "eval_rewards/chosen": -1.1570004224777222, "eval_rewards/margins": 0.2120911180973053, "eval_rewards/rejected": -1.3690916299819946, "eval_runtime": 12.854, "eval_samples_per_second": 7.78, "eval_steps_per_second": 1.011, "step": 1850 }, { "epoch": 2.0043142674171692, "grad_norm": 20.5, "learning_rate": 9.967672413793105e-07, "logits/chosen": -1.8243948221206665, "logits/rejected": -1.8106485605239868, "logps/chosen": -563.4216918945312, "logps/rejected": -554.7511596679688, "loss": 0.5964, "rewards/accuracies": 0.7047781348228455, "rewards/chosen": -1.0186046361923218, "rewards/margins": 0.33347877860069275, "rewards/rejected": -1.3520833253860474, "step": 1860 }, { "epoch": 2.015099935960093, "grad_norm": 22.25, "learning_rate": 9.85991379310345e-07, "logits/chosen": -1.816082239151001, "logits/rejected": -1.821375846862793, "logps/chosen": -525.9388427734375, "logps/rejected": -514.32861328125, "loss": 0.5994, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -1.0036041736602783, "rewards/margins": 0.3070666193962097, "rewards/rejected": -1.3106707334518433, "step": 1870 }, { "epoch": 2.0258856045030167, "grad_norm": 19.75, "learning_rate": 9.752155172413795e-07, "logits/chosen": -1.8075281381607056, "logits/rejected": -1.8388919830322266, "logps/chosen": -552.760009765625, "logps/rejected": -546.732421875, "loss": 0.5852, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -1.0357027053833008, "rewards/margins": 0.33780843019485474, "rewards/rejected": -1.3735110759735107, "step": 1880 }, { "epoch": 2.03667127304594, "grad_norm": 20.125, "learning_rate": 9.644396551724138e-07, "logits/chosen": -1.767423391342163, "logits/rejected": -1.7957046031951904, "logps/chosen": -553.3221435546875, "logps/rejected": -534.2494506835938, "loss": 0.574, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.977568507194519, "rewards/margins": 0.36738520860671997, "rewards/rejected": -1.3449536561965942, "step": 1890 }, { "epoch": 2.0474569415888637, "grad_norm": 21.75, "learning_rate": 9.536637931034483e-07, "logits/chosen": -1.8196423053741455, "logits/rejected": -1.8127524852752686, "logps/chosen": -534.7515258789062, "logps/rejected": -519.46484375, "loss": 0.5833, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.9940136075019836, "rewards/margins": 0.34578242897987366, "rewards/rejected": -1.3397960662841797, "step": 1900 }, { "epoch": 2.0474569415888637, "eval_logits/chosen": -1.8176302909851074, "eval_logits/rejected": -1.8824341297149658, "eval_logps/chosen": -571.3641967773438, "eval_logps/rejected": -520.0811157226562, "eval_loss": 0.6425002813339233, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -1.1569527387619019, "eval_rewards/margins": 0.19657853245735168, "eval_rewards/rejected": -1.3535313606262207, "eval_runtime": 12.834, "eval_samples_per_second": 7.792, "eval_steps_per_second": 1.013, "step": 1900 }, { "epoch": 2.058242610131787, "grad_norm": 20.75, "learning_rate": 9.428879310344828e-07, "logits/chosen": -1.7818291187286377, "logits/rejected": -1.7981443405151367, "logps/chosen": -541.3946533203125, "logps/rejected": -527.3668823242188, "loss": 0.5742, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9855313301086426, "rewards/margins": 0.37073832750320435, "rewards/rejected": -1.3562697172164917, "step": 1910 }, { "epoch": 2.069028278674711, "grad_norm": 18.5, "learning_rate": 9.321120689655173e-07, "logits/chosen": -1.833970069885254, "logits/rejected": -1.8262455463409424, "logps/chosen": -546.7005004882812, "logps/rejected": -538.780029296875, "loss": 0.5723, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9842655062675476, "rewards/margins": 0.3666042685508728, "rewards/rejected": -1.3508696556091309, "step": 1920 }, { "epoch": 2.0798139472176347, "grad_norm": 19.625, "learning_rate": 9.213362068965518e-07, "logits/chosen": -1.7877027988433838, "logits/rejected": -1.7972294092178345, "logps/chosen": -541.6177978515625, "logps/rejected": -527.7945556640625, "loss": 0.5934, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9702165722846985, "rewards/margins": 0.3309406340122223, "rewards/rejected": -1.3011572360992432, "step": 1930 }, { "epoch": 2.090599615760558, "grad_norm": 21.375, "learning_rate": 9.105603448275862e-07, "logits/chosen": -1.7990188598632812, "logits/rejected": -1.8132622241973877, "logps/chosen": -550.9437255859375, "logps/rejected": -540.9758911132812, "loss": 0.5811, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9730231165885925, "rewards/margins": 0.35077935457229614, "rewards/rejected": -1.3238024711608887, "step": 1940 }, { "epoch": 2.1013852843034817, "grad_norm": 22.25, "learning_rate": 8.997844827586207e-07, "logits/chosen": -1.8408838510513306, "logits/rejected": -1.860565423965454, "logps/chosen": -528.8717041015625, "logps/rejected": -515.1477661132812, "loss": 0.591, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -1.0306185483932495, "rewards/margins": 0.3226325213909149, "rewards/rejected": -1.3532510995864868, "step": 1950 }, { "epoch": 2.1013852843034817, "eval_logits/chosen": -1.8168575763702393, "eval_logits/rejected": -1.8818013668060303, "eval_logps/chosen": -571.19580078125, "eval_logps/rejected": -520.1525268554688, "eval_loss": 0.6379492282867432, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -1.1485364437103271, "eval_rewards/margins": 0.20856307446956635, "eval_rewards/rejected": -1.3570995330810547, "eval_runtime": 12.8416, "eval_samples_per_second": 7.787, "eval_steps_per_second": 1.012, "step": 1950 }, { "epoch": 2.112170952846405, "grad_norm": 22.625, "learning_rate": 8.890086206896552e-07, "logits/chosen": -1.8283774852752686, "logits/rejected": -1.8340866565704346, "logps/chosen": -553.7230224609375, "logps/rejected": -539.7496337890625, "loss": 0.5792, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -0.9757739901542664, "rewards/margins": 0.35080114006996155, "rewards/rejected": -1.3265750408172607, "step": 1960 }, { "epoch": 2.1229566213893287, "grad_norm": 21.125, "learning_rate": 8.782327586206896e-07, "logits/chosen": -1.8246581554412842, "logits/rejected": -1.8176138401031494, "logps/chosen": -556.3699951171875, "logps/rejected": -536.9313354492188, "loss": 0.5713, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": -0.939971923828125, "rewards/margins": 0.3758017420768738, "rewards/rejected": -1.3157737255096436, "step": 1970 }, { "epoch": 2.1337422899322527, "grad_norm": 19.75, "learning_rate": 8.674568965517241e-07, "logits/chosen": -1.7579801082611084, "logits/rejected": -1.792454719543457, "logps/chosen": -506.5731506347656, "logps/rejected": -492.03741455078125, "loss": 0.5807, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.9504076242446899, "rewards/margins": 0.3443571925163269, "rewards/rejected": -1.294764757156372, "step": 1980 }, { "epoch": 2.144527958475176, "grad_norm": 20.625, "learning_rate": 8.566810344827586e-07, "logits/chosen": -1.8132164478302002, "logits/rejected": -1.8298476934432983, "logps/chosen": -543.1507568359375, "logps/rejected": -532.009033203125, "loss": 0.5807, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -0.953994631767273, "rewards/margins": 0.35666000843048096, "rewards/rejected": -1.310654640197754, "step": 1990 }, { "epoch": 2.1553136270180997, "grad_norm": 21.375, "learning_rate": 8.459051724137931e-07, "logits/chosen": -1.8091182708740234, "logits/rejected": -1.8285820484161377, "logps/chosen": -507.40240478515625, "logps/rejected": -491.46307373046875, "loss": 0.5944, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.9950447082519531, "rewards/margins": 0.3224475085735321, "rewards/rejected": -1.3174922466278076, "step": 2000 }, { "epoch": 2.1553136270180997, "eval_logits/chosen": -1.8191330432891846, "eval_logits/rejected": -1.8835722208023071, "eval_logps/chosen": -571.5982666015625, "eval_logps/rejected": -520.4741821289062, "eval_loss": 0.6388756036758423, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -1.1686609983444214, "eval_rewards/margins": 0.20452477037906647, "eval_rewards/rejected": -1.3731858730316162, "eval_runtime": 12.8754, "eval_samples_per_second": 7.767, "eval_steps_per_second": 1.01, "step": 2000 }, { "epoch": 2.166099295561023, "grad_norm": 22.875, "learning_rate": 8.351293103448275e-07, "logits/chosen": -1.734623670578003, "logits/rejected": -1.738992691040039, "logps/chosen": -557.7481689453125, "logps/rejected": -540.2991943359375, "loss": 0.6125, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.022360920906067, "rewards/margins": 0.27458736300468445, "rewards/rejected": -1.2969481945037842, "step": 2010 }, { "epoch": 2.1768849641039467, "grad_norm": 22.875, "learning_rate": 8.24353448275862e-07, "logits/chosen": -1.8071972131729126, "logits/rejected": -1.8338969945907593, "logps/chosen": -535.3201293945312, "logps/rejected": -517.41796875, "loss": 0.5833, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -1.0038800239562988, "rewards/margins": 0.34552377462387085, "rewards/rejected": -1.3494038581848145, "step": 2020 }, { "epoch": 2.1876706326468707, "grad_norm": 19.375, "learning_rate": 8.135775862068966e-07, "logits/chosen": -1.8387718200683594, "logits/rejected": -1.84176766872406, "logps/chosen": -546.7960205078125, "logps/rejected": -534.9925537109375, "loss": 0.5817, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -1.0042190551757812, "rewards/margins": 0.3391868472099304, "rewards/rejected": -1.3434059619903564, "step": 2030 }, { "epoch": 2.198456301189794, "grad_norm": 21.75, "learning_rate": 8.028017241379311e-07, "logits/chosen": -1.840580701828003, "logits/rejected": -1.8485066890716553, "logps/chosen": -529.6387939453125, "logps/rejected": -517.8297119140625, "loss": 0.6042, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0427838563919067, "rewards/margins": 0.29375189542770386, "rewards/rejected": -1.336535930633545, "step": 2040 }, { "epoch": 2.2092419697327177, "grad_norm": 21.625, "learning_rate": 7.920258620689656e-07, "logits/chosen": -1.7776069641113281, "logits/rejected": -1.7974965572357178, "logps/chosen": -557.3050537109375, "logps/rejected": -541.1644897460938, "loss": 0.5813, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0097219944000244, "rewards/margins": 0.3600006699562073, "rewards/rejected": -1.369722843170166, "step": 2050 }, { "epoch": 2.2092419697327177, "eval_logits/chosen": -1.8171120882034302, "eval_logits/rejected": -1.8817297220230103, "eval_logps/chosen": -571.6415405273438, "eval_logps/rejected": -520.3262939453125, "eval_loss": 0.6439670324325562, "eval_rewards/accuracies": 0.6153846383094788, "eval_rewards/chosen": -1.1708241701126099, "eval_rewards/margins": 0.19496536254882812, "eval_rewards/rejected": -1.365789532661438, "eval_runtime": 12.8352, "eval_samples_per_second": 7.791, "eval_steps_per_second": 1.013, "step": 2050 }, { "epoch": 2.220027638275641, "grad_norm": 20.375, "learning_rate": 7.8125e-07, "logits/chosen": -1.8244011402130127, "logits/rejected": -1.8032004833221436, "logps/chosen": -533.1487426757812, "logps/rejected": -523.3587646484375, "loss": 0.5793, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -1.0152502059936523, "rewards/margins": 0.3824854791164398, "rewards/rejected": -1.397735834121704, "step": 2060 }, { "epoch": 2.2308133068185647, "grad_norm": 19.375, "learning_rate": 7.704741379310345e-07, "logits/chosen": -1.840766191482544, "logits/rejected": -1.8367950916290283, "logps/chosen": -548.8405151367188, "logps/rejected": -544.241943359375, "loss": 0.5811, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.9654202461242676, "rewards/margins": 0.3421328663825989, "rewards/rejected": -1.3075530529022217, "step": 2070 }, { "epoch": 2.241598975361488, "grad_norm": 22.625, "learning_rate": 7.59698275862069e-07, "logits/chosen": -1.7699668407440186, "logits/rejected": -1.7745425701141357, "logps/chosen": -566.4268188476562, "logps/rejected": -563.216796875, "loss": 0.5914, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9823424220085144, "rewards/margins": 0.3274309039115906, "rewards/rejected": -1.309773325920105, "step": 2080 }, { "epoch": 2.252384643904412, "grad_norm": 21.625, "learning_rate": 7.489224137931034e-07, "logits/chosen": -1.761265516281128, "logits/rejected": -1.7772560119628906, "logps/chosen": -553.53564453125, "logps/rejected": -528.5107421875, "loss": 0.5898, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.9453617930412292, "rewards/margins": 0.3203446567058563, "rewards/rejected": -1.2657064199447632, "step": 2090 }, { "epoch": 2.2631703124473357, "grad_norm": 18.875, "learning_rate": 7.38146551724138e-07, "logits/chosen": -1.769878625869751, "logits/rejected": -1.7828420400619507, "logps/chosen": -575.3273315429688, "logps/rejected": -545.9244995117188, "loss": 0.5906, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -1.0056273937225342, "rewards/margins": 0.33718934655189514, "rewards/rejected": -1.342816710472107, "step": 2100 }, { "epoch": 2.2631703124473357, "eval_logits/chosen": -1.8217333555221558, "eval_logits/rejected": -1.8864943981170654, "eval_logps/chosen": -571.5655517578125, "eval_logps/rejected": -520.4244384765625, "eval_loss": 0.6396628618240356, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -1.1670244932174683, "eval_rewards/margins": 0.2036767601966858, "eval_rewards/rejected": -1.3707013130187988, "eval_runtime": 12.8554, "eval_samples_per_second": 7.779, "eval_steps_per_second": 1.011, "step": 2100 }, { "epoch": 2.273955980990259, "grad_norm": 23.25, "learning_rate": 7.273706896551725e-07, "logits/chosen": -1.816007375717163, "logits/rejected": -1.8105392456054688, "logps/chosen": -547.5933837890625, "logps/rejected": -546.60888671875, "loss": 0.5834, "rewards/accuracies": 0.6890624761581421, "rewards/chosen": -0.9704920649528503, "rewards/margins": 0.33989059925079346, "rewards/rejected": -1.310382604598999, "step": 2110 }, { "epoch": 2.2847416495331827, "grad_norm": 22.25, "learning_rate": 7.16594827586207e-07, "logits/chosen": -1.8106517791748047, "logits/rejected": -1.7941129207611084, "logps/chosen": -539.07568359375, "logps/rejected": -527.458740234375, "loss": 0.5906, "rewards/accuracies": 0.7109375, "rewards/chosen": -1.0233216285705566, "rewards/margins": 0.33753374218940735, "rewards/rejected": -1.3608553409576416, "step": 2120 }, { "epoch": 2.295527318076106, "grad_norm": 21.375, "learning_rate": 7.058189655172414e-07, "logits/chosen": -1.8167377710342407, "logits/rejected": -1.8257455825805664, "logps/chosen": -563.2874755859375, "logps/rejected": -553.4884643554688, "loss": 0.5733, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": -0.9766351580619812, "rewards/margins": 0.4000927805900574, "rewards/rejected": -1.3767280578613281, "step": 2130 }, { "epoch": 2.30631298661903, "grad_norm": 21.0, "learning_rate": 6.950431034482759e-07, "logits/chosen": -1.786746621131897, "logits/rejected": -1.7818912267684937, "logps/chosen": -545.81640625, "logps/rejected": -537.4017333984375, "loss": 0.5852, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.9767181277275085, "rewards/margins": 0.3548073172569275, "rewards/rejected": -1.331525444984436, "step": 2140 }, { "epoch": 2.3170986551619537, "grad_norm": 20.125, "learning_rate": 6.842672413793104e-07, "logits/chosen": -1.8164615631103516, "logits/rejected": -1.8195968866348267, "logps/chosen": -524.2017822265625, "logps/rejected": -513.7794799804688, "loss": 0.5909, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": -0.9479745626449585, "rewards/margins": 0.31994181871414185, "rewards/rejected": -1.2679163217544556, "step": 2150 }, { "epoch": 2.3170986551619537, "eval_logits/chosen": -1.819827914237976, "eval_logits/rejected": -1.8852392435073853, "eval_logps/chosen": -571.2940063476562, "eval_logps/rejected": -520.1119995117188, "eval_loss": 0.6401126384735107, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -1.1534450054168701, "eval_rewards/margins": 0.20163044333457947, "eval_rewards/rejected": -1.3550753593444824, "eval_runtime": 12.8257, "eval_samples_per_second": 7.797, "eval_steps_per_second": 1.014, "step": 2150 }, { "epoch": 2.327884323704877, "grad_norm": 20.0, "learning_rate": 6.734913793103448e-07, "logits/chosen": -1.7366435527801514, "logits/rejected": -1.7245728969573975, "logps/chosen": -527.2462768554688, "logps/rejected": -515.2891235351562, "loss": 0.5881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.951647937297821, "rewards/margins": 0.32886141538619995, "rewards/rejected": -1.2805094718933105, "step": 2160 }, { "epoch": 2.3386699922478007, "grad_norm": 20.0, "learning_rate": 6.627155172413793e-07, "logits/chosen": -1.75906240940094, "logits/rejected": -1.772367238998413, "logps/chosen": -567.78759765625, "logps/rejected": -553.91162109375, "loss": 0.5751, "rewards/accuracies": 0.7359374761581421, "rewards/chosen": -0.9418633580207825, "rewards/margins": 0.3597813546657562, "rewards/rejected": -1.3016446828842163, "step": 2170 }, { "epoch": 2.349455660790724, "grad_norm": 22.625, "learning_rate": 6.519396551724138e-07, "logits/chosen": -1.8051865100860596, "logits/rejected": -1.8138115406036377, "logps/chosen": -551.5945434570312, "logps/rejected": -533.2476806640625, "loss": 0.5676, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": -0.9801983833312988, "rewards/margins": 0.39750877022743225, "rewards/rejected": -1.3777072429656982, "step": 2180 }, { "epoch": 2.3602413293336477, "grad_norm": 22.5, "learning_rate": 6.411637931034483e-07, "logits/chosen": -1.8182309865951538, "logits/rejected": -1.8295986652374268, "logps/chosen": -559.96435546875, "logps/rejected": -546.2293701171875, "loss": 0.5932, "rewards/accuracies": 0.6859375238418579, "rewards/chosen": -0.997664749622345, "rewards/margins": 0.3369225859642029, "rewards/rejected": -1.3345873355865479, "step": 2190 }, { "epoch": 2.3710269978765717, "grad_norm": 18.375, "learning_rate": 6.303879310344828e-07, "logits/chosen": -1.7372815608978271, "logits/rejected": -1.7619171142578125, "logps/chosen": -533.6831665039062, "logps/rejected": -516.8760375976562, "loss": 0.5755, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -0.933220386505127, "rewards/margins": 0.3638560175895691, "rewards/rejected": -1.2970764636993408, "step": 2200 }, { "epoch": 2.3710269978765717, "eval_logits/chosen": -1.815535068511963, "eval_logits/rejected": -1.880157470703125, "eval_logps/chosen": -571.4099731445312, "eval_logps/rejected": -520.0239868164062, "eval_loss": 0.6433662176132202, "eval_rewards/accuracies": 0.6346153616905212, "eval_rewards/chosen": -1.1592445373535156, "eval_rewards/margins": 0.1914299577474594, "eval_rewards/rejected": -1.3506745100021362, "eval_runtime": 12.8533, "eval_samples_per_second": 7.78, "eval_steps_per_second": 1.011, "step": 2200 }, { "epoch": 2.381812666419495, "grad_norm": 20.0, "learning_rate": 6.196120689655173e-07, "logits/chosen": -1.8025963306427002, "logits/rejected": -1.7924509048461914, "logps/chosen": -551.2867431640625, "logps/rejected": -542.8524169921875, "loss": 0.5904, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9992715716362, "rewards/margins": 0.32568755745887756, "rewards/rejected": -1.3249592781066895, "step": 2210 }, { "epoch": 2.3925983349624187, "grad_norm": 21.5, "learning_rate": 6.088362068965517e-07, "logits/chosen": -1.812631368637085, "logits/rejected": -1.8184080123901367, "logps/chosen": -559.1375732421875, "logps/rejected": -542.5402221679688, "loss": 0.5766, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.9521558880805969, "rewards/margins": 0.3602093458175659, "rewards/rejected": -1.3123652935028076, "step": 2220 }, { "epoch": 2.403384003505342, "grad_norm": 23.25, "learning_rate": 5.980603448275862e-07, "logits/chosen": -1.8015377521514893, "logits/rejected": -1.808902382850647, "logps/chosen": -552.8629150390625, "logps/rejected": -537.6948852539062, "loss": 0.5718, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.932092547416687, "rewards/margins": 0.37704476714134216, "rewards/rejected": -1.3091373443603516, "step": 2230 }, { "epoch": 2.4141696720482657, "grad_norm": 21.25, "learning_rate": 5.872844827586207e-07, "logits/chosen": -1.8348724842071533, "logits/rejected": -1.861788034439087, "logps/chosen": -537.8258056640625, "logps/rejected": -525.0621948242188, "loss": 0.5866, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0409669876098633, "rewards/margins": 0.33658367395401, "rewards/rejected": -1.3775508403778076, "step": 2240 }, { "epoch": 2.4249553405911897, "grad_norm": 23.5, "learning_rate": 5.765086206896552e-07, "logits/chosen": -1.760894775390625, "logits/rejected": -1.772892951965332, "logps/chosen": -585.08544921875, "logps/rejected": -567.151123046875, "loss": 0.5984, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9715090990066528, "rewards/margins": 0.30948516726493835, "rewards/rejected": -1.280994176864624, "step": 2250 }, { "epoch": 2.4249553405911897, "eval_logits/chosen": -1.8186331987380981, "eval_logits/rejected": -1.8828355073928833, "eval_logps/chosen": -571.3914794921875, "eval_logps/rejected": -520.0492553710938, "eval_loss": 0.6430271863937378, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -1.1583155393600464, "eval_rewards/margins": 0.19362236559391022, "eval_rewards/rejected": -1.351938009262085, "eval_runtime": 12.8415, "eval_samples_per_second": 7.787, "eval_steps_per_second": 1.012, "step": 2250 }, { "epoch": 2.435741009134113, "grad_norm": 20.75, "learning_rate": 5.657327586206896e-07, "logits/chosen": -1.8033708333969116, "logits/rejected": -1.7999566793441772, "logps/chosen": -550.396728515625, "logps/rejected": -535.7592163085938, "loss": 0.6, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0161781311035156, "rewards/margins": 0.3219712972640991, "rewards/rejected": -1.3381493091583252, "step": 2260 }, { "epoch": 2.4465266776770367, "grad_norm": 22.125, "learning_rate": 5.549568965517241e-07, "logits/chosen": -1.8223384618759155, "logits/rejected": -1.831163763999939, "logps/chosen": -535.5626220703125, "logps/rejected": -530.4205322265625, "loss": 0.5904, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0250645875930786, "rewards/margins": 0.32182878255844116, "rewards/rejected": -1.346893310546875, "step": 2270 }, { "epoch": 2.45731234621996, "grad_norm": 19.5, "learning_rate": 5.441810344827586e-07, "logits/chosen": -1.793067216873169, "logits/rejected": -1.8330637216567993, "logps/chosen": -552.1552124023438, "logps/rejected": -533.558349609375, "loss": 0.5834, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0002939701080322, "rewards/margins": 0.34119078516960144, "rewards/rejected": -1.341484785079956, "step": 2280 }, { "epoch": 2.4680980147628837, "grad_norm": 21.125, "learning_rate": 5.33405172413793e-07, "logits/chosen": -1.7839548587799072, "logits/rejected": -1.7871309518814087, "logps/chosen": -559.6815185546875, "logps/rejected": -536.8385009765625, "loss": 0.6067, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -1.0037622451782227, "rewards/margins": 0.2857220768928528, "rewards/rejected": -1.2894842624664307, "step": 2290 }, { "epoch": 2.478883683305807, "grad_norm": 18.5, "learning_rate": 5.226293103448276e-07, "logits/chosen": -1.8063087463378906, "logits/rejected": -1.8256887197494507, "logps/chosen": -540.8095703125, "logps/rejected": -519.8458251953125, "loss": 0.5698, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9602404832839966, "rewards/margins": 0.3675882816314697, "rewards/rejected": -1.3278287649154663, "step": 2300 }, { "epoch": 2.478883683305807, "eval_logits/chosen": -1.820350170135498, "eval_logits/rejected": -1.8840982913970947, "eval_logps/chosen": -571.4730224609375, "eval_logps/rejected": -520.2723999023438, "eval_loss": 0.6414951086044312, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -1.16239595413208, "eval_rewards/margins": 0.20070229470729828, "eval_rewards/rejected": -1.3630982637405396, "eval_runtime": 12.8591, "eval_samples_per_second": 7.777, "eval_steps_per_second": 1.011, "step": 2300 }, { "epoch": 2.489669351848731, "grad_norm": 19.875, "learning_rate": 5.118534482758621e-07, "logits/chosen": -1.8190813064575195, "logits/rejected": -1.823015809059143, "logps/chosen": -526.2591552734375, "logps/rejected": -516.8885498046875, "loss": 0.5822, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9837486147880554, "rewards/margins": 0.3439069986343384, "rewards/rejected": -1.327655553817749, "step": 2310 }, { "epoch": 2.5004550203916547, "grad_norm": 20.75, "learning_rate": 5.010775862068965e-07, "logits/chosen": -1.8792375326156616, "logits/rejected": -1.8921849727630615, "logps/chosen": -548.6787719726562, "logps/rejected": -541.21435546875, "loss": 0.5713, "rewards/accuracies": 0.739062488079071, "rewards/chosen": -1.0175832509994507, "rewards/margins": 0.37862181663513184, "rewards/rejected": -1.396205186843872, "step": 2320 }, { "epoch": 2.511240688934578, "grad_norm": 22.375, "learning_rate": 4.903017241379311e-07, "logits/chosen": -1.7593591213226318, "logits/rejected": -1.772139549255371, "logps/chosen": -526.41650390625, "logps/rejected": -535.5494384765625, "loss": 0.5849, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.9334033727645874, "rewards/margins": 0.32959261536598206, "rewards/rejected": -1.2629960775375366, "step": 2330 }, { "epoch": 2.5220263574775017, "grad_norm": 20.125, "learning_rate": 4.795258620689656e-07, "logits/chosen": -1.7565523386001587, "logits/rejected": -1.7509338855743408, "logps/chosen": -560.8541870117188, "logps/rejected": -542.8634643554688, "loss": 0.5863, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.9780709147453308, "rewards/margins": 0.33096015453338623, "rewards/rejected": -1.3090311288833618, "step": 2340 }, { "epoch": 2.532812026020425, "grad_norm": 20.375, "learning_rate": 4.6875e-07, "logits/chosen": -1.8536609411239624, "logits/rejected": -1.890777349472046, "logps/chosen": -559.2564697265625, "logps/rejected": -546.4683837890625, "loss": 0.5739, "rewards/accuracies": 0.734375, "rewards/chosen": -1.012340784072876, "rewards/margins": 0.37036722898483276, "rewards/rejected": -1.382707953453064, "step": 2350 }, { "epoch": 2.532812026020425, "eval_logits/chosen": -1.8193501234054565, "eval_logits/rejected": -1.8834855556488037, "eval_logps/chosen": -571.5364379882812, "eval_logps/rejected": -520.3049926757812, "eval_loss": 0.6403661966323853, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -1.16556715965271, "eval_rewards/margins": 0.19915804266929626, "eval_rewards/rejected": -1.3647252321243286, "eval_runtime": 12.8276, "eval_samples_per_second": 7.796, "eval_steps_per_second": 1.013, "step": 2350 }, { "epoch": 2.543597694563349, "grad_norm": 21.0, "learning_rate": 4.5797413793103446e-07, "logits/chosen": -1.8258733749389648, "logits/rejected": -1.8301451206207275, "logps/chosen": -536.6988525390625, "logps/rejected": -524.0502319335938, "loss": 0.5685, "rewards/accuracies": 0.729687511920929, "rewards/chosen": -0.9772807359695435, "rewards/margins": 0.3694283366203308, "rewards/rejected": -1.3467090129852295, "step": 2360 }, { "epoch": 2.5543833631062727, "grad_norm": 22.375, "learning_rate": 4.4719827586206897e-07, "logits/chosen": -1.8070919513702393, "logits/rejected": -1.7797702550888062, "logps/chosen": -574.8311767578125, "logps/rejected": -568.0702514648438, "loss": 0.575, "rewards/accuracies": 0.7484375238418579, "rewards/chosen": -0.9659520983695984, "rewards/margins": 0.3521731197834015, "rewards/rejected": -1.3181252479553223, "step": 2370 }, { "epoch": 2.565169031649196, "grad_norm": 20.25, "learning_rate": 4.364224137931034e-07, "logits/chosen": -1.8053117990493774, "logits/rejected": -1.8097355365753174, "logps/chosen": -553.0218505859375, "logps/rejected": -546.5211181640625, "loss": 0.5767, "rewards/accuracies": 0.745312511920929, "rewards/chosen": -0.9683555364608765, "rewards/margins": 0.35642895102500916, "rewards/rejected": -1.324784517288208, "step": 2380 }, { "epoch": 2.5759547001921197, "grad_norm": 19.375, "learning_rate": 4.2564655172413793e-07, "logits/chosen": -1.805479645729065, "logits/rejected": -1.838136911392212, "logps/chosen": -561.8768920898438, "logps/rejected": -540.8943481445312, "loss": 0.5959, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0204946994781494, "rewards/margins": 0.3259061276912689, "rewards/rejected": -1.3464009761810303, "step": 2390 }, { "epoch": 2.586740368735043, "grad_norm": 21.625, "learning_rate": 4.148706896551724e-07, "logits/chosen": -1.8230018615722656, "logits/rejected": -1.8241417407989502, "logps/chosen": -530.9923706054688, "logps/rejected": -517.3659057617188, "loss": 0.5808, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.9844582676887512, "rewards/margins": 0.3568059206008911, "rewards/rejected": -1.341264247894287, "step": 2400 }, { "epoch": 2.586740368735043, "eval_logits/chosen": -1.8191014528274536, "eval_logits/rejected": -1.8839582204818726, "eval_logps/chosen": -571.337646484375, "eval_logps/rejected": -520.2830200195312, "eval_loss": 0.6378647089004517, "eval_rewards/accuracies": 0.6346153616905212, "eval_rewards/chosen": -1.15562903881073, "eval_rewards/margins": 0.20799769461154938, "eval_rewards/rejected": -1.3636267185211182, "eval_runtime": 12.8404, "eval_samples_per_second": 7.788, "eval_steps_per_second": 1.012, "step": 2400 }, { "epoch": 2.5975260372779667, "grad_norm": 20.375, "learning_rate": 4.0409482758620694e-07, "logits/chosen": -1.8546650409698486, "logits/rejected": -1.8732141256332397, "logps/chosen": -525.1890869140625, "logps/rejected": -511.1866760253906, "loss": 0.5827, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -1.0207396745681763, "rewards/margins": 0.34293699264526367, "rewards/rejected": -1.36367666721344, "step": 2410 }, { "epoch": 2.6083117058208902, "grad_norm": 21.375, "learning_rate": 3.9331896551724145e-07, "logits/chosen": -1.7957582473754883, "logits/rejected": -1.8112951517105103, "logps/chosen": -539.9984130859375, "logps/rejected": -546.8135375976562, "loss": 0.5695, "rewards/accuracies": 0.745312511920929, "rewards/chosen": -0.9470235109329224, "rewards/margins": 0.41715818643569946, "rewards/rejected": -1.364181637763977, "step": 2420 }, { "epoch": 2.619097374363814, "grad_norm": 20.25, "learning_rate": 3.825431034482759e-07, "logits/chosen": -1.8335113525390625, "logits/rejected": -1.8350120782852173, "logps/chosen": -557.8736572265625, "logps/rejected": -546.7265014648438, "loss": 0.5721, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.944877028465271, "rewards/margins": 0.3734719753265381, "rewards/rejected": -1.318349003791809, "step": 2430 }, { "epoch": 2.6298830429067377, "grad_norm": 20.875, "learning_rate": 3.7176724137931035e-07, "logits/chosen": -1.8570324182510376, "logits/rejected": -1.875575065612793, "logps/chosen": -540.6766967773438, "logps/rejected": -514.4322509765625, "loss": 0.5742, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": -1.010191559791565, "rewards/margins": 0.363949716091156, "rewards/rejected": -1.3741410970687866, "step": 2440 }, { "epoch": 2.640668711449661, "grad_norm": 20.375, "learning_rate": 3.6099137931034486e-07, "logits/chosen": -1.8072179555892944, "logits/rejected": -1.8312402963638306, "logps/chosen": -570.2026977539062, "logps/rejected": -541.2198486328125, "loss": 0.5743, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.9453884959220886, "rewards/margins": 0.36282581090927124, "rewards/rejected": -1.3082144260406494, "step": 2450 }, { "epoch": 2.640668711449661, "eval_logits/chosen": -1.8189289569854736, "eval_logits/rejected": -1.882326602935791, "eval_logps/chosen": -571.3778076171875, "eval_logps/rejected": -520.1837158203125, "eval_loss": 0.6420935988426208, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -1.1576350927352905, "eval_rewards/margins": 0.2010272890329361, "eval_rewards/rejected": -1.3586623668670654, "eval_runtime": 12.8126, "eval_samples_per_second": 7.805, "eval_steps_per_second": 1.015, "step": 2450 }, { "epoch": 2.6514543799925847, "grad_norm": 22.0, "learning_rate": 3.502155172413793e-07, "logits/chosen": -1.8071362972259521, "logits/rejected": -1.8451793193817139, "logps/chosen": -558.2564086914062, "logps/rejected": -535.6451416015625, "loss": 0.5825, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -1.0327677726745605, "rewards/margins": 0.3471210300922394, "rewards/rejected": -1.379888892173767, "step": 2460 }, { "epoch": 2.6622400485355087, "grad_norm": 21.25, "learning_rate": 3.394396551724138e-07, "logits/chosen": -1.7823054790496826, "logits/rejected": -1.800473928451538, "logps/chosen": -574.7456665039062, "logps/rejected": -547.697265625, "loss": 0.5809, "rewards/accuracies": 0.7109375, "rewards/chosen": -1.008268117904663, "rewards/margins": 0.3359683156013489, "rewards/rejected": -1.3442364931106567, "step": 2470 }, { "epoch": 2.673025717078432, "grad_norm": 19.0, "learning_rate": 3.2866379310344827e-07, "logits/chosen": -1.8167625665664673, "logits/rejected": -1.8581584692001343, "logps/chosen": -546.1006469726562, "logps/rejected": -524.2261962890625, "loss": 0.5784, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.9644547700881958, "rewards/margins": 0.3466854691505432, "rewards/rejected": -1.3111401796340942, "step": 2480 }, { "epoch": 2.6838113856213557, "grad_norm": 21.125, "learning_rate": 3.1788793103448277e-07, "logits/chosen": -1.8069427013397217, "logits/rejected": -1.826890230178833, "logps/chosen": -549.4500122070312, "logps/rejected": -528.8165283203125, "loss": 0.5792, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0348377227783203, "rewards/margins": 0.3798985183238983, "rewards/rejected": -1.414736270904541, "step": 2490 }, { "epoch": 2.694597054164279, "grad_norm": 20.75, "learning_rate": 3.071120689655173e-07, "logits/chosen": -1.8492558002471924, "logits/rejected": -1.9055678844451904, "logps/chosen": -531.1982421875, "logps/rejected": -509.7709045410156, "loss": 0.5947, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.9890526533126831, "rewards/margins": 0.31767064332962036, "rewards/rejected": -1.3067232370376587, "step": 2500 }, { "epoch": 2.694597054164279, "eval_logits/chosen": -1.8213615417480469, "eval_logits/rejected": -1.8862541913986206, "eval_logps/chosen": -571.5040283203125, "eval_logps/rejected": -520.3077392578125, "eval_loss": 0.6428215503692627, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": -1.163952350616455, "eval_rewards/margins": 0.2009093016386032, "eval_rewards/rejected": -1.3648617267608643, "eval_runtime": 12.8298, "eval_samples_per_second": 7.794, "eval_steps_per_second": 1.013, "step": 2500 }, { "epoch": 2.7053827227072027, "grad_norm": 20.5, "learning_rate": 2.9633620689655173e-07, "logits/chosen": -1.8131103515625, "logits/rejected": -1.8387792110443115, "logps/chosen": -550.4927978515625, "logps/rejected": -543.66162109375, "loss": 0.5763, "rewards/accuracies": 0.723437488079071, "rewards/chosen": -0.9916373491287231, "rewards/margins": 0.38830363750457764, "rewards/rejected": -1.3799409866333008, "step": 2510 }, { "epoch": 2.716168391250126, "grad_norm": 23.375, "learning_rate": 2.855603448275862e-07, "logits/chosen": -1.8427207469940186, "logits/rejected": -1.8411712646484375, "logps/chosen": -552.4677734375, "logps/rejected": -533.5294799804688, "loss": 0.5717, "rewards/accuracies": 0.6890624761581421, "rewards/chosen": -1.0246634483337402, "rewards/margins": 0.3778466582298279, "rewards/rejected": -1.402510166168213, "step": 2520 }, { "epoch": 2.7269540597930497, "grad_norm": 21.125, "learning_rate": 2.747844827586207e-07, "logits/chosen": -1.7936853170394897, "logits/rejected": -1.7962579727172852, "logps/chosen": -556.09521484375, "logps/rejected": -547.2576904296875, "loss": 0.5785, "rewards/accuracies": 0.71875, "rewards/chosen": -1.012605905532837, "rewards/margins": 0.35908856987953186, "rewards/rejected": -1.3716943264007568, "step": 2530 }, { "epoch": 2.7377397283359737, "grad_norm": 20.375, "learning_rate": 2.6400862068965514e-07, "logits/chosen": -1.7601362466812134, "logits/rejected": -1.7817468643188477, "logps/chosen": -525.7601928710938, "logps/rejected": -519.701171875, "loss": 0.5731, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": -0.9400823712348938, "rewards/margins": 0.3647005558013916, "rewards/rejected": -1.3047831058502197, "step": 2540 }, { "epoch": 2.748525396878897, "grad_norm": 19.875, "learning_rate": 2.532327586206897e-07, "logits/chosen": -1.8128465414047241, "logits/rejected": -1.8134959936141968, "logps/chosen": -549.0126342773438, "logps/rejected": -539.8706665039062, "loss": 0.5836, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -1.0344898700714111, "rewards/margins": 0.35030892491340637, "rewards/rejected": -1.3847988843917847, "step": 2550 }, { "epoch": 2.748525396878897, "eval_logits/chosen": -1.821547269821167, "eval_logits/rejected": -1.8859292268753052, "eval_logps/chosen": -571.573974609375, "eval_logps/rejected": -520.504150390625, "eval_loss": 0.6378675103187561, "eval_rewards/accuracies": 0.6634615659713745, "eval_rewards/chosen": -1.167443037033081, "eval_rewards/margins": 0.20724283158779144, "eval_rewards/rejected": -1.3746860027313232, "eval_runtime": 12.8565, "eval_samples_per_second": 7.778, "eval_steps_per_second": 1.011, "step": 2550 }, { "epoch": 2.7593110654218207, "grad_norm": 24.875, "learning_rate": 2.4245689655172415e-07, "logits/chosen": -1.8031337261199951, "logits/rejected": -1.7945318222045898, "logps/chosen": -555.7886962890625, "logps/rejected": -544.8328857421875, "loss": 0.594, "rewards/accuracies": 0.723437488079071, "rewards/chosen": -0.9998787045478821, "rewards/margins": 0.3179456293582916, "rewards/rejected": -1.3178244829177856, "step": 2560 }, { "epoch": 2.770096733964744, "grad_norm": 22.0, "learning_rate": 2.3168103448275863e-07, "logits/chosen": -1.7943198680877686, "logits/rejected": -1.8033891916275024, "logps/chosen": -543.3651733398438, "logps/rejected": -532.5569458007812, "loss": 0.5897, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.997797966003418, "rewards/margins": 0.326891154050827, "rewards/rejected": -1.3246891498565674, "step": 2570 }, { "epoch": 2.780882402507668, "grad_norm": 21.375, "learning_rate": 2.209051724137931e-07, "logits/chosen": -1.8031666278839111, "logits/rejected": -1.8120911121368408, "logps/chosen": -544.0496826171875, "logps/rejected": -541.0577392578125, "loss": 0.5774, "rewards/accuracies": 0.723437488079071, "rewards/chosen": -0.9857531785964966, "rewards/margins": 0.35895711183547974, "rewards/rejected": -1.3447102308273315, "step": 2580 }, { "epoch": 2.7916680710505917, "grad_norm": 19.75, "learning_rate": 2.1012931034482756e-07, "logits/chosen": -1.8335663080215454, "logits/rejected": -1.8373979330062866, "logps/chosen": -548.4537963867188, "logps/rejected": -540.0748291015625, "loss": 0.5892, "rewards/accuracies": 0.7109375, "rewards/chosen": -1.0308494567871094, "rewards/margins": 0.3219280242919922, "rewards/rejected": -1.3527776002883911, "step": 2590 }, { "epoch": 2.802453739593515, "grad_norm": 20.875, "learning_rate": 1.993534482758621e-07, "logits/chosen": -1.8183438777923584, "logits/rejected": -1.81350576877594, "logps/chosen": -543.9989624023438, "logps/rejected": -523.77490234375, "loss": 0.5911, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.045743703842163, "rewards/margins": 0.32820892333984375, "rewards/rejected": -1.3739526271820068, "step": 2600 }, { "epoch": 2.802453739593515, "eval_logits/chosen": -1.8199844360351562, "eval_logits/rejected": -1.8842284679412842, "eval_logps/chosen": -571.6898193359375, "eval_logps/rejected": -520.4623413085938, "eval_loss": 0.6425648331642151, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": -1.1732372045516968, "eval_rewards/margins": 0.19935639202594757, "eval_rewards/rejected": -1.3725935220718384, "eval_runtime": 12.8299, "eval_samples_per_second": 7.794, "eval_steps_per_second": 1.013, "step": 2600 }, { "epoch": 2.8132394081364387, "grad_norm": 22.875, "learning_rate": 1.8857758620689658e-07, "logits/chosen": -1.7655651569366455, "logits/rejected": -1.7627290487289429, "logps/chosen": -573.8090209960938, "logps/rejected": -579.2223510742188, "loss": 0.588, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.9776386022567749, "rewards/margins": 0.37593865394592285, "rewards/rejected": -1.3535772562026978, "step": 2610 }, { "epoch": 2.824025076679362, "grad_norm": 20.625, "learning_rate": 1.7780172413793103e-07, "logits/chosen": -1.8132917881011963, "logits/rejected": -1.833608865737915, "logps/chosen": -539.3812866210938, "logps/rejected": -532.1210327148438, "loss": 0.5694, "rewards/accuracies": 0.734375, "rewards/chosen": -0.993198037147522, "rewards/margins": 0.3927215039730072, "rewards/rejected": -1.3859195709228516, "step": 2620 }, { "epoch": 2.8348107452222857, "grad_norm": 21.375, "learning_rate": 1.670258620689655e-07, "logits/chosen": -1.8571910858154297, "logits/rejected": -1.8688180446624756, "logps/chosen": -533.7222900390625, "logps/rejected": -527.7282104492188, "loss": 0.5942, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0575587749481201, "rewards/margins": 0.3217419385910034, "rewards/rejected": -1.379300832748413, "step": 2630 }, { "epoch": 2.845596413765209, "grad_norm": 22.125, "learning_rate": 1.5625e-07, "logits/chosen": -1.801645278930664, "logits/rejected": -1.8315293788909912, "logps/chosen": -531.2138061523438, "logps/rejected": -502.26953125, "loss": 0.5914, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.9891794323921204, "rewards/margins": 0.3226587176322937, "rewards/rejected": -1.3118380308151245, "step": 2640 }, { "epoch": 2.856382082308133, "grad_norm": 19.0, "learning_rate": 1.454741379310345e-07, "logits/chosen": -1.8319976329803467, "logits/rejected": -1.8414185047149658, "logps/chosen": -547.1237182617188, "logps/rejected": -527.4207763671875, "loss": 0.5862, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -1.0397682189941406, "rewards/margins": 0.3290941119194031, "rewards/rejected": -1.3688621520996094, "step": 2650 }, { "epoch": 2.856382082308133, "eval_logits/chosen": -1.8245118856430054, "eval_logits/rejected": -1.8896102905273438, "eval_logps/chosen": -571.779052734375, "eval_logps/rejected": -520.575439453125, "eval_loss": 0.6429222822189331, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -1.1776981353759766, "eval_rewards/margins": 0.20054802298545837, "eval_rewards/rejected": -1.3782461881637573, "eval_runtime": 12.8292, "eval_samples_per_second": 7.795, "eval_steps_per_second": 1.013, "step": 2650 }, { "epoch": 2.8671677508510567, "grad_norm": 20.75, "learning_rate": 1.3469827586206897e-07, "logits/chosen": -1.8030239343643188, "logits/rejected": -1.8243894577026367, "logps/chosen": -543.3869018554688, "logps/rejected": -528.6986083984375, "loss": 0.582, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": -0.964773952960968, "rewards/margins": 0.3342544436454773, "rewards/rejected": -1.2990283966064453, "step": 2660 }, { "epoch": 2.87795341939398, "grad_norm": 21.0, "learning_rate": 1.2392241379310345e-07, "logits/chosen": -1.7584861516952515, "logits/rejected": -1.7730573415756226, "logps/chosen": -561.669677734375, "logps/rejected": -548.2999267578125, "loss": 0.6069, "rewards/accuracies": 0.671875, "rewards/chosen": -1.0046155452728271, "rewards/margins": 0.30270934104919434, "rewards/rejected": -1.3073248863220215, "step": 2670 }, { "epoch": 2.8887390879369037, "grad_norm": 20.25, "learning_rate": 1.1314655172413793e-07, "logits/chosen": -1.7781982421875, "logits/rejected": -1.7862926721572876, "logps/chosen": -554.6754150390625, "logps/rejected": -538.4164428710938, "loss": 0.5843, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -1.047992467880249, "rewards/margins": 0.3425953984260559, "rewards/rejected": -1.3905879259109497, "step": 2680 }, { "epoch": 2.8995247564798277, "grad_norm": 21.25, "learning_rate": 1.0237068965517242e-07, "logits/chosen": -1.8572750091552734, "logits/rejected": -1.8793262243270874, "logps/chosen": -549.9866943359375, "logps/rejected": -524.3643188476562, "loss": 0.5601, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -1.0121921300888062, "rewards/margins": 0.3896443247795105, "rewards/rejected": -1.401836633682251, "step": 2690 }, { "epoch": 2.910310425022751, "grad_norm": 20.0, "learning_rate": 9.15948275862069e-08, "logits/chosen": -1.80484938621521, "logits/rejected": -1.80769944190979, "logps/chosen": -524.7546997070312, "logps/rejected": -520.0099487304688, "loss": 0.5821, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.9889782071113586, "rewards/margins": 0.34463515877723694, "rewards/rejected": -1.333613395690918, "step": 2700 }, { "epoch": 2.910310425022751, "eval_logits/chosen": -1.8193469047546387, "eval_logits/rejected": -1.8841547966003418, "eval_logps/chosen": -571.4869384765625, "eval_logps/rejected": -520.3854370117188, "eval_loss": 0.6395397782325745, "eval_rewards/accuracies": 0.6730769276618958, "eval_rewards/chosen": -1.163095235824585, "eval_rewards/margins": 0.2056487500667572, "eval_rewards/rejected": -1.368743896484375, "eval_runtime": 12.8307, "eval_samples_per_second": 7.794, "eval_steps_per_second": 1.013, "step": 2700 }, { "epoch": 2.9210960935656747, "grad_norm": 22.625, "learning_rate": 8.081896551724138e-08, "logits/chosen": -1.8104597330093384, "logits/rejected": -1.8475234508514404, "logps/chosen": -556.8909912109375, "logps/rejected": -538.2855224609375, "loss": 0.5742, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9970060586929321, "rewards/margins": 0.3584115207195282, "rewards/rejected": -1.3554174900054932, "step": 2710 }, { "epoch": 2.931881762108598, "grad_norm": 20.125, "learning_rate": 7.004310344827586e-08, "logits/chosen": -1.8974506855010986, "logits/rejected": -1.9189106225967407, "logps/chosen": -536.3151245117188, "logps/rejected": -511.659912109375, "loss": 0.5842, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -1.0638625621795654, "rewards/margins": 0.34429508447647095, "rewards/rejected": -1.4081575870513916, "step": 2720 }, { "epoch": 2.9426674306515217, "grad_norm": 23.0, "learning_rate": 5.9267241379310345e-08, "logits/chosen": -1.7627315521240234, "logits/rejected": -1.7557799816131592, "logps/chosen": -555.1419677734375, "logps/rejected": -536.8969116210938, "loss": 0.5864, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.9729493856430054, "rewards/margins": 0.33462923765182495, "rewards/rejected": -1.3075785636901855, "step": 2730 }, { "epoch": 2.953453099194445, "grad_norm": 19.375, "learning_rate": 4.849137931034483e-08, "logits/chosen": -1.7903178930282593, "logits/rejected": -1.7958800792694092, "logps/chosen": -540.8969116210938, "logps/rejected": -532.142578125, "loss": 0.5838, "rewards/accuracies": 0.723437488079071, "rewards/chosen": -0.9786058664321899, "rewards/margins": 0.3315025866031647, "rewards/rejected": -1.3101084232330322, "step": 2740 }, { "epoch": 2.9642387677373687, "grad_norm": 21.875, "learning_rate": 3.771551724137931e-08, "logits/chosen": -1.7704594135284424, "logits/rejected": -1.7803184986114502, "logps/chosen": -546.58349609375, "logps/rejected": -545.2351684570312, "loss": 0.5639, "rewards/accuracies": 0.71875, "rewards/chosen": -1.017333984375, "rewards/margins": 0.38938578963279724, "rewards/rejected": -1.4067199230194092, "step": 2750 }, { "epoch": 2.9642387677373687, "eval_logits/chosen": -1.8192270994186401, "eval_logits/rejected": -1.8834381103515625, "eval_logps/chosen": -571.4424438476562, "eval_logps/rejected": -520.3687744140625, "eval_loss": 0.6384233236312866, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -1.1608668565750122, "eval_rewards/margins": 0.20704683661460876, "eval_rewards/rejected": -1.3679137229919434, "eval_runtime": 12.8408, "eval_samples_per_second": 7.788, "eval_steps_per_second": 1.012, "step": 2750 }, { "epoch": 2.9750244362802927, "grad_norm": 20.625, "learning_rate": 2.6939655172413792e-08, "logits/chosen": -1.8296130895614624, "logits/rejected": -1.8367893695831299, "logps/chosen": -521.9843139648438, "logps/rejected": -498.4405822753906, "loss": 0.597, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -1.0047184228897095, "rewards/margins": 0.30621933937072754, "rewards/rejected": -1.3109376430511475, "step": 2760 }, { "epoch": 2.985810104823216, "grad_norm": 21.375, "learning_rate": 1.6163793103448278e-08, "logits/chosen": -1.8125426769256592, "logits/rejected": -1.837774634361267, "logps/chosen": -573.8707275390625, "logps/rejected": -565.1752319335938, "loss": 0.5891, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -1.0296361446380615, "rewards/margins": 0.3325849175453186, "rewards/rejected": -1.3622210025787354, "step": 2770 }, { "epoch": 2.9965957733661397, "grad_norm": 20.5, "learning_rate": 5.387931034482759e-09, "logits/chosen": -1.765448808670044, "logits/rejected": -1.7794584035873413, "logps/chosen": -562.4325561523438, "logps/rejected": -534.7550659179688, "loss": 0.5723, "rewards/accuracies": 0.739062488079071, "rewards/chosen": -0.9770288467407227, "rewards/margins": 0.3728591799736023, "rewards/rejected": -1.3498878479003906, "step": 2780 } ], "logging_steps": 10, "max_steps": 2784, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }