{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 928, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01078566854292359, "grad_norm": 20.25, "learning_rate": 2.990301724137931e-06, "logits/chosen": -1.2338563203811646, "logits/rejected": -1.2257438898086548, "logps/chosen": -490.282470703125, "logps/rejected": -482.52545166015625, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": 0.0017994878580793738, "rewards/margins": 0.006464972160756588, "rewards/rejected": -0.004665483720600605, "step": 10 }, { "epoch": 0.02157133708584718, "grad_norm": 22.0, "learning_rate": 2.979525862068966e-06, "logits/chosen": -1.2723968029022217, "logits/rejected": -1.2785792350769043, "logps/chosen": -520.5911254882812, "logps/rejected": -514.4813842773438, "loss": 0.6856, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": 0.0043027508072555065, "rewards/margins": 0.0164328720420599, "rewards/rejected": -0.01213012170046568, "step": 20 }, { "epoch": 0.03235700562877077, "grad_norm": 19.125, "learning_rate": 2.96875e-06, "logits/chosen": -1.2637840509414673, "logits/rejected": -1.246565818786621, "logps/chosen": -507.4403381347656, "logps/rejected": -501.22479248046875, "loss": 0.6789, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.0017711544642224908, "rewards/margins": 0.030538281425833702, "rewards/rejected": -0.032309435307979584, "step": 30 }, { "epoch": 0.04314267417169436, "grad_norm": 20.75, "learning_rate": 2.9579741379310345e-06, "logits/chosen": -1.2590439319610596, "logits/rejected": -1.2330172061920166, "logps/chosen": -485.248291015625, "logps/rejected": -482.63458251953125, "loss": 0.6765, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.00601751497015357, "rewards/margins": 0.03583287447690964, "rewards/rejected": -0.041850391775369644, "step": 40 }, { "epoch": 0.05392834271461795, "grad_norm": 20.25, "learning_rate": 2.947198275862069e-06, "logits/chosen": -1.3426216840744019, "logits/rejected": -1.3334815502166748, "logps/chosen": -532.8280029296875, "logps/rejected": -514.1970825195312, "loss": 0.6684, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.00931160431355238, "rewards/margins": 0.053057052195072174, "rewards/rejected": -0.06236865371465683, "step": 50 }, { "epoch": 0.05392834271461795, "eval_logits/chosen": -1.3231499195098877, "eval_logits/rejected": -1.3642845153808594, "eval_logps/chosen": -549.2651977539062, "eval_logps/rejected": -494.54583740234375, "eval_loss": 0.6826924681663513, "eval_rewards/accuracies": 0.6153846383094788, "eval_rewards/chosen": -0.05200628936290741, "eval_rewards/margins": 0.02476159855723381, "eval_rewards/rejected": -0.07676788419485092, "eval_runtime": 13.2928, "eval_samples_per_second": 7.523, "eval_steps_per_second": 0.978, "step": 50 }, { "epoch": 0.06471401125754155, "grad_norm": 20.25, "learning_rate": 2.9364224137931035e-06, "logits/chosen": -1.3449946641921997, "logits/rejected": -1.343201756477356, "logps/chosen": -548.1566772460938, "logps/rejected": -531.6286010742188, "loss": 0.6705, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.028984328731894493, "rewards/margins": 0.04919930547475815, "rewards/rejected": -0.07818363606929779, "step": 60 }, { "epoch": 0.07549967980046514, "grad_norm": 20.875, "learning_rate": 2.925646551724138e-06, "logits/chosen": -1.3576488494873047, "logits/rejected": -1.3385895490646362, "logps/chosen": -516.7993774414062, "logps/rejected": -502.6502990722656, "loss": 0.6624, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03774970397353172, "rewards/margins": 0.0672496110200882, "rewards/rejected": -0.10499931871891022, "step": 70 }, { "epoch": 0.08628534834338872, "grad_norm": 19.5, "learning_rate": 2.9148706896551725e-06, "logits/chosen": -1.4393236637115479, "logits/rejected": -1.4375989437103271, "logps/chosen": -534.2485961914062, "logps/rejected": -516.8245849609375, "loss": 0.6547, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.05567573383450508, "rewards/margins": 0.08416923880577087, "rewards/rejected": -0.13984496891498566, "step": 80 }, { "epoch": 0.09707101688631231, "grad_norm": 20.625, "learning_rate": 2.904094827586207e-06, "logits/chosen": -1.4080358743667603, "logits/rejected": -1.399465799331665, "logps/chosen": -503.4671936035156, "logps/rejected": -488.74383544921875, "loss": 0.6565, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.07379738986492157, "rewards/margins": 0.08224000781774521, "rewards/rejected": -0.15603742003440857, "step": 90 }, { "epoch": 0.1078566854292359, "grad_norm": 20.375, "learning_rate": 2.8933189655172415e-06, "logits/chosen": -1.3748817443847656, "logits/rejected": -1.366629958152771, "logps/chosen": -518.0892944335938, "logps/rejected": -499.7432556152344, "loss": 0.6541, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.09199288487434387, "rewards/margins": 0.08948174864053726, "rewards/rejected": -0.18147462606430054, "step": 100 }, { "epoch": 0.1078566854292359, "eval_logits/chosen": -1.4332343339920044, "eval_logits/rejected": -1.479236364364624, "eval_logps/chosen": -551.7933349609375, "eval_logps/rejected": -497.4646911621094, "eval_loss": 0.6763917803764343, "eval_rewards/accuracies": 0.5769230723381042, "eval_rewards/chosen": -0.17841331660747528, "eval_rewards/margins": 0.04429765045642853, "eval_rewards/rejected": -0.222710981965065, "eval_runtime": 12.8741, "eval_samples_per_second": 7.768, "eval_steps_per_second": 1.01, "step": 100 }, { "epoch": 0.1186423539721595, "grad_norm": 19.25, "learning_rate": 2.8825431034482758e-06, "logits/chosen": -1.436632513999939, "logits/rejected": -1.4507520198822021, "logps/chosen": -508.725830078125, "logps/rejected": -500.9781799316406, "loss": 0.6421, "rewards/accuracies": 0.729687511920929, "rewards/chosen": -0.11871524900197983, "rewards/margins": 0.11672022193670273, "rewards/rejected": -0.23543548583984375, "step": 110 }, { "epoch": 0.1294280225150831, "grad_norm": 19.375, "learning_rate": 2.8717672413793105e-06, "logits/chosen": -1.4648942947387695, "logits/rejected": -1.474923849105835, "logps/chosen": -522.5699462890625, "logps/rejected": -521.1087036132812, "loss": 0.635, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.14662253856658936, "rewards/margins": 0.13384078443050385, "rewards/rejected": -0.2804633677005768, "step": 120 }, { "epoch": 0.14021369105800668, "grad_norm": 21.0, "learning_rate": 2.860991379310345e-06, "logits/chosen": -1.5117241144180298, "logits/rejected": -1.5187674760818481, "logps/chosen": -526.3902587890625, "logps/rejected": -518.88525390625, "loss": 0.6413, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1973995864391327, "rewards/margins": 0.12558409571647644, "rewards/rejected": -0.32298368215560913, "step": 130 }, { "epoch": 0.15099935960093028, "grad_norm": 19.75, "learning_rate": 2.8502155172413795e-06, "logits/chosen": -1.5119014978408813, "logits/rejected": -1.5130690336227417, "logps/chosen": -533.3895263671875, "logps/rejected": -527.9937744140625, "loss": 0.6401, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.22728869318962097, "rewards/margins": 0.13429734110832214, "rewards/rejected": -0.3615860342979431, "step": 140 }, { "epoch": 0.16178502814385384, "grad_norm": 19.375, "learning_rate": 2.839439655172414e-06, "logits/chosen": -1.5527749061584473, "logits/rejected": -1.5703589916229248, "logps/chosen": -515.7926025390625, "logps/rejected": -503.88360595703125, "loss": 0.6397, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2661042809486389, "rewards/margins": 0.13428668677806854, "rewards/rejected": -0.40039095282554626, "step": 150 }, { "epoch": 0.16178502814385384, "eval_logits/chosen": -1.5442339181900024, "eval_logits/rejected": -1.5949413776397705, "eval_logps/chosen": -555.982666015625, "eval_logps/rejected": -502.18585205078125, "eval_loss": 0.6722157001495361, "eval_rewards/accuracies": 0.5865384340286255, "eval_rewards/chosen": -0.38787999749183655, "eval_rewards/margins": 0.07088876515626907, "eval_rewards/rejected": -0.4587687849998474, "eval_runtime": 12.8342, "eval_samples_per_second": 7.792, "eval_steps_per_second": 1.013, "step": 150 }, { "epoch": 0.17257069668677744, "grad_norm": 20.0, "learning_rate": 2.8286637931034485e-06, "logits/chosen": -1.5724695920944214, "logits/rejected": -1.5730245113372803, "logps/chosen": -506.13671875, "logps/rejected": -495.46142578125, "loss": 0.6699, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.29719287157058716, "rewards/margins": 0.10155584663152695, "rewards/rejected": -0.3987486958503723, "step": 160 }, { "epoch": 0.18335636522970103, "grad_norm": 21.125, "learning_rate": 2.817887931034483e-06, "logits/chosen": -1.5566643476486206, "logits/rejected": -1.555086612701416, "logps/chosen": -557.1043701171875, "logps/rejected": -539.7008666992188, "loss": 0.6322, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.32251477241516113, "rewards/margins": 0.1617726981639862, "rewards/rejected": -0.48428741097450256, "step": 170 }, { "epoch": 0.19414203377262462, "grad_norm": 20.5, "learning_rate": 2.807112068965517e-06, "logits/chosen": -1.5289297103881836, "logits/rejected": -1.5290358066558838, "logps/chosen": -527.3707275390625, "logps/rejected": -511.8185119628906, "loss": 0.6347, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.3478879928588867, "rewards/margins": 0.1548273265361786, "rewards/rejected": -0.5027152895927429, "step": 180 }, { "epoch": 0.20492770231554822, "grad_norm": 21.0, "learning_rate": 2.796336206896552e-06, "logits/chosen": -1.5106637477874756, "logits/rejected": -1.4951450824737549, "logps/chosen": -535.6061401367188, "logps/rejected": -528.7681884765625, "loss": 0.623, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.35832709074020386, "rewards/margins": 0.20923912525177002, "rewards/rejected": -0.5675662755966187, "step": 190 }, { "epoch": 0.2157133708584718, "grad_norm": 19.75, "learning_rate": 2.785560344827586e-06, "logits/chosen": -1.6122379302978516, "logits/rejected": -1.6149158477783203, "logps/chosen": -556.2003173828125, "logps/rejected": -536.0861206054688, "loss": 0.6237, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -0.40547609329223633, "rewards/margins": 0.18066053092479706, "rewards/rejected": -0.5861365795135498, "step": 200 }, { "epoch": 0.2157133708584718, "eval_logits/chosen": -1.6185290813446045, "eval_logits/rejected": -1.671698808670044, "eval_logps/chosen": -559.1907348632812, "eval_logps/rejected": -505.73992919921875, "eval_loss": 0.6703996062278748, "eval_rewards/accuracies": 0.5769230723381042, "eval_rewards/chosen": -0.5482814311981201, "eval_rewards/margins": 0.08819277584552765, "eval_rewards/rejected": -0.6364741921424866, "eval_runtime": 12.8368, "eval_samples_per_second": 7.79, "eval_steps_per_second": 1.013, "step": 200 }, { "epoch": 0.2264990394013954, "grad_norm": 20.0, "learning_rate": 2.774784482758621e-06, "logits/chosen": -1.602872610092163, "logits/rejected": -1.6062767505645752, "logps/chosen": -494.86566162109375, "logps/rejected": -486.106689453125, "loss": 0.6313, "rewards/accuracies": 0.676562488079071, "rewards/chosen": -0.41412362456321716, "rewards/margins": 0.16503319144248962, "rewards/rejected": -0.5791568160057068, "step": 210 }, { "epoch": 0.237284707944319, "grad_norm": 22.875, "learning_rate": 2.764008620689655e-06, "logits/chosen": -1.6498100757598877, "logits/rejected": -1.6630780696868896, "logps/chosen": -540.1375732421875, "logps/rejected": -533.9187622070312, "loss": 0.6307, "rewards/accuracies": 0.6546875238418579, "rewards/chosen": -0.4561203420162201, "rewards/margins": 0.18029221892356873, "rewards/rejected": -0.636412501335144, "step": 220 }, { "epoch": 0.2480703764872426, "grad_norm": 19.5, "learning_rate": 2.75323275862069e-06, "logits/chosen": -1.667067527770996, "logits/rejected": -1.681429147720337, "logps/chosen": -525.0696411132812, "logps/rejected": -508.50067138671875, "loss": 0.6261, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.4919908940792084, "rewards/margins": 0.19147078692913055, "rewards/rejected": -0.6834616661071777, "step": 230 }, { "epoch": 0.2588560450301662, "grad_norm": 22.375, "learning_rate": 2.742456896551724e-06, "logits/chosen": -1.646775484085083, "logits/rejected": -1.6376352310180664, "logps/chosen": -517.5604248046875, "logps/rejected": -504.1005859375, "loss": 0.6228, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.4839399755001068, "rewards/margins": 0.1933339387178421, "rewards/rejected": -0.6772739291191101, "step": 240 }, { "epoch": 0.26964171357308975, "grad_norm": 20.125, "learning_rate": 2.7316810344827584e-06, "logits/chosen": -1.630406379699707, "logits/rejected": -1.6203149557113647, "logps/chosen": -511.6064453125, "logps/rejected": -504.46112060546875, "loss": 0.6224, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.5028296709060669, "rewards/margins": 0.19740687310695648, "rewards/rejected": -0.700236439704895, "step": 250 }, { "epoch": 0.26964171357308975, "eval_logits/chosen": -1.671502947807312, "eval_logits/rejected": -1.727898359298706, "eval_logps/chosen": -561.3909912109375, "eval_logps/rejected": -507.87896728515625, "eval_loss": 0.6769301891326904, "eval_rewards/accuracies": 0.567307710647583, "eval_rewards/chosen": -0.6582961678504944, "eval_rewards/margins": 0.08512917906045914, "eval_rewards/rejected": -0.7434254288673401, "eval_runtime": 12.8313, "eval_samples_per_second": 7.793, "eval_steps_per_second": 1.013, "step": 250 }, { "epoch": 0.28042738211601337, "grad_norm": 22.75, "learning_rate": 2.720905172413793e-06, "logits/chosen": -1.6629798412322998, "logits/rejected": -1.664310097694397, "logps/chosen": -564.4110717773438, "logps/rejected": -550.1747436523438, "loss": 0.6344, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.51663738489151, "rewards/margins": 0.17350056767463684, "rewards/rejected": -0.6901379823684692, "step": 260 }, { "epoch": 0.29121305065893693, "grad_norm": 20.5, "learning_rate": 2.7101293103448275e-06, "logits/chosen": -1.6739919185638428, "logits/rejected": -1.6669447422027588, "logps/chosen": -562.1046752929688, "logps/rejected": -553.902099609375, "loss": 0.6256, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.5245037078857422, "rewards/margins": 0.19063076376914978, "rewards/rejected": -0.7151345014572144, "step": 270 }, { "epoch": 0.30199871920186055, "grad_norm": 19.875, "learning_rate": 2.699353448275862e-06, "logits/chosen": -1.6442272663116455, "logits/rejected": -1.6505457162857056, "logps/chosen": -529.9203491210938, "logps/rejected": -513.3055419921875, "loss": 0.6378, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5445322394371033, "rewards/margins": 0.16621330380439758, "rewards/rejected": -0.710745632648468, "step": 280 }, { "epoch": 0.3127843877447841, "grad_norm": 21.125, "learning_rate": 2.6885775862068965e-06, "logits/chosen": -1.6472547054290771, "logits/rejected": -1.6532646417617798, "logps/chosen": -527.1898193359375, "logps/rejected": -516.4867553710938, "loss": 0.6132, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.49309319257736206, "rewards/margins": 0.21812161803245544, "rewards/rejected": -0.7112148404121399, "step": 290 }, { "epoch": 0.3235700562877077, "grad_norm": 19.0, "learning_rate": 2.677801724137931e-06, "logits/chosen": -1.6490532159805298, "logits/rejected": -1.6721302270889282, "logps/chosen": -567.3697509765625, "logps/rejected": -550.6924438476562, "loss": 0.6216, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5456413626670837, "rewards/margins": 0.2058580368757248, "rewards/rejected": -0.7514994144439697, "step": 300 }, { "epoch": 0.3235700562877077, "eval_logits/chosen": -1.6934152841567993, "eval_logits/rejected": -1.7509509325027466, "eval_logps/chosen": -562.0770874023438, "eval_logps/rejected": -508.9422302246094, "eval_loss": 0.6661080121994019, "eval_rewards/accuracies": 0.5865384340286255, "eval_rewards/chosen": -0.692603349685669, "eval_rewards/margins": 0.10398232936859131, "eval_rewards/rejected": -0.7965856194496155, "eval_runtime": 12.8789, "eval_samples_per_second": 7.765, "eval_steps_per_second": 1.009, "step": 300 }, { "epoch": 0.3343557248306313, "grad_norm": 21.875, "learning_rate": 2.6670258620689655e-06, "logits/chosen": -1.7095540761947632, "logits/rejected": -1.7263736724853516, "logps/chosen": -530.0485229492188, "logps/rejected": -530.2100830078125, "loss": 0.6314, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.6013873815536499, "rewards/margins": 0.17918090522289276, "rewards/rejected": -0.7805682420730591, "step": 310 }, { "epoch": 0.3451413933735549, "grad_norm": 20.375, "learning_rate": 2.6562499999999998e-06, "logits/chosen": -1.7012121677398682, "logits/rejected": -1.6956939697265625, "logps/chosen": -520.4534301757812, "logps/rejected": -511.6394958496094, "loss": 0.6502, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.6033864617347717, "rewards/margins": 0.14426395297050476, "rewards/rejected": -0.7476503252983093, "step": 320 }, { "epoch": 0.3559270619164785, "grad_norm": 20.625, "learning_rate": 2.6454741379310345e-06, "logits/chosen": -1.6911704540252686, "logits/rejected": -1.6552823781967163, "logps/chosen": -537.2453002929688, "logps/rejected": -534.7839965820312, "loss": 0.6255, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.568359375, "rewards/margins": 0.22105315327644348, "rewards/rejected": -0.7894124388694763, "step": 330 }, { "epoch": 0.36671273045940206, "grad_norm": 21.5, "learning_rate": 2.6346982758620688e-06, "logits/chosen": -1.664223313331604, "logits/rejected": -1.6563825607299805, "logps/chosen": -565.0128784179688, "logps/rejected": -551.3712158203125, "loss": 0.6117, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.5529137849807739, "rewards/margins": 0.23182833194732666, "rewards/rejected": -0.7847420573234558, "step": 340 }, { "epoch": 0.3774983990023257, "grad_norm": 21.75, "learning_rate": 2.6239224137931035e-06, "logits/chosen": -1.6517295837402344, "logits/rejected": -1.6636890172958374, "logps/chosen": -499.03717041015625, "logps/rejected": -469.7442932128906, "loss": 0.6454, "rewards/accuracies": 0.625, "rewards/chosen": -0.559511125087738, "rewards/margins": 0.14985349774360657, "rewards/rejected": -0.7093645930290222, "step": 350 }, { "epoch": 0.3774983990023257, "eval_logits/chosen": -1.7148549556732178, "eval_logits/rejected": -1.7745919227600098, "eval_logps/chosen": -562.5575561523438, "eval_logps/rejected": -509.7618713378906, "eval_loss": 0.6621462106704712, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": -0.7166208624839783, "eval_rewards/margins": 0.12094759196043015, "eval_rewards/rejected": -0.8375685811042786, "eval_runtime": 12.8579, "eval_samples_per_second": 7.777, "eval_steps_per_second": 1.011, "step": 350 }, { "epoch": 0.38828406754524925, "grad_norm": 18.625, "learning_rate": 2.613146551724138e-06, "logits/chosen": -1.6570751667022705, "logits/rejected": -1.673710584640503, "logps/chosen": -550.8926391601562, "logps/rejected": -530.0281982421875, "loss": 0.635, "rewards/accuracies": 0.682812511920929, "rewards/chosen": -0.5474696755409241, "rewards/margins": 0.180132657289505, "rewards/rejected": -0.7276023626327515, "step": 360 }, { "epoch": 0.39906973608817287, "grad_norm": 22.125, "learning_rate": 2.6023706896551725e-06, "logits/chosen": -1.7166540622711182, "logits/rejected": -1.7119417190551758, "logps/chosen": -535.0701904296875, "logps/rejected": -521.47509765625, "loss": 0.6346, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.6037973761558533, "rewards/margins": 0.18038420379161835, "rewards/rejected": -0.7841815948486328, "step": 370 }, { "epoch": 0.40985540463109643, "grad_norm": 21.5, "learning_rate": 2.591594827586207e-06, "logits/chosen": -1.711259126663208, "logits/rejected": -1.7223602533340454, "logps/chosen": -554.1033935546875, "logps/rejected": -536.9190673828125, "loss": 0.6215, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.5765694379806519, "rewards/margins": 0.21333327889442444, "rewards/rejected": -0.7899028062820435, "step": 380 }, { "epoch": 0.42064107317402, "grad_norm": 20.125, "learning_rate": 2.580818965517241e-06, "logits/chosen": -1.674808144569397, "logits/rejected": -1.7261720895767212, "logps/chosen": -534.8923950195312, "logps/rejected": -509.17437744140625, "loss": 0.6234, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.5851498246192932, "rewards/margins": 0.20684213936328888, "rewards/rejected": -0.7919918298721313, "step": 390 }, { "epoch": 0.4314267417169436, "grad_norm": 21.375, "learning_rate": 2.5700431034482762e-06, "logits/chosen": -1.7043625116348267, "logits/rejected": -1.721663236618042, "logps/chosen": -547.33935546875, "logps/rejected": -535.8040771484375, "loss": 0.6314, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.6050974130630493, "rewards/margins": 0.18475469946861267, "rewards/rejected": -0.7898520827293396, "step": 400 }, { "epoch": 0.4314267417169436, "eval_logits/chosen": -1.7199591398239136, "eval_logits/rejected": -1.7800374031066895, "eval_logps/chosen": -562.4886474609375, "eval_logps/rejected": -509.6477355957031, "eval_loss": 0.6607488989830017, "eval_rewards/accuracies": 0.6153846383094788, "eval_rewards/chosen": -0.7131773829460144, "eval_rewards/margins": 0.11868361383676529, "eval_rewards/rejected": -0.831861138343811, "eval_runtime": 12.8809, "eval_samples_per_second": 7.763, "eval_steps_per_second": 1.009, "step": 400 }, { "epoch": 0.4422124102598672, "grad_norm": 20.25, "learning_rate": 2.5592672413793105e-06, "logits/chosen": -1.7056515216827393, "logits/rejected": -1.7032356262207031, "logps/chosen": -544.6566162109375, "logps/rejected": -526.3981323242188, "loss": 0.6346, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.6110819578170776, "rewards/margins": 0.17850695550441742, "rewards/rejected": -0.7895889282226562, "step": 410 }, { "epoch": 0.4529980788027908, "grad_norm": 21.125, "learning_rate": 2.5484913793103452e-06, "logits/chosen": -1.7077720165252686, "logits/rejected": -1.7262470722198486, "logps/chosen": -538.9916381835938, "logps/rejected": -514.1024780273438, "loss": 0.6319, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.6130255460739136, "rewards/margins": 0.18954649567604065, "rewards/rejected": -0.8025720715522766, "step": 420 }, { "epoch": 0.46378374734571437, "grad_norm": 21.125, "learning_rate": 2.5377155172413795e-06, "logits/chosen": -1.6869211196899414, "logits/rejected": -1.7010374069213867, "logps/chosen": -519.823486328125, "logps/rejected": -514.0619506835938, "loss": 0.6187, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": -0.6134659647941589, "rewards/margins": 0.21337684988975525, "rewards/rejected": -0.8268427848815918, "step": 430 }, { "epoch": 0.474569415888638, "grad_norm": 20.75, "learning_rate": 2.526939655172414e-06, "logits/chosen": -1.7328685522079468, "logits/rejected": -1.7307264804840088, "logps/chosen": -519.9312744140625, "logps/rejected": -513.1961669921875, "loss": 0.6278, "rewards/accuracies": 0.676562488079071, "rewards/chosen": -0.6580926179885864, "rewards/margins": 0.20139017701148987, "rewards/rejected": -0.8594827651977539, "step": 440 }, { "epoch": 0.48535508443156156, "grad_norm": 21.0, "learning_rate": 2.5161637931034486e-06, "logits/chosen": -1.7335723638534546, "logits/rejected": -1.7415554523468018, "logps/chosen": -526.5194702148438, "logps/rejected": -518.3421020507812, "loss": 0.6219, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.6824163198471069, "rewards/margins": 0.2106122523546219, "rewards/rejected": -0.8930285573005676, "step": 450 }, { "epoch": 0.48535508443156156, "eval_logits/chosen": -1.7433769702911377, "eval_logits/rejected": -1.8047083616256714, "eval_logps/chosen": -564.2274169921875, "eval_logps/rejected": -511.63067626953125, "eval_loss": 0.6593914031982422, "eval_rewards/accuracies": 0.5769230723381042, "eval_rewards/chosen": -0.8001139760017395, "eval_rewards/margins": 0.13089530169963837, "eval_rewards/rejected": -0.9310091733932495, "eval_runtime": 12.8159, "eval_samples_per_second": 7.803, "eval_steps_per_second": 1.014, "step": 450 }, { "epoch": 0.4961407529744852, "grad_norm": 20.875, "learning_rate": 2.505387931034483e-06, "logits/chosen": -1.7393690347671509, "logits/rejected": -1.7588695287704468, "logps/chosen": -552.0422973632812, "logps/rejected": -530.3653564453125, "loss": 0.6202, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.686679482460022, "rewards/margins": 0.22524280846118927, "rewards/rejected": -0.9119223356246948, "step": 460 }, { "epoch": 0.5069264215174087, "grad_norm": 20.25, "learning_rate": 2.4946120689655176e-06, "logits/chosen": -1.7419532537460327, "logits/rejected": -1.7394039630889893, "logps/chosen": -526.1238403320312, "logps/rejected": -506.7391052246094, "loss": 0.6301, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.6740504503250122, "rewards/margins": 0.18307238817214966, "rewards/rejected": -0.8571227788925171, "step": 470 }, { "epoch": 0.5177120900603324, "grad_norm": 22.375, "learning_rate": 2.483836206896552e-06, "logits/chosen": -1.6943247318267822, "logits/rejected": -1.6783298254013062, "logps/chosen": -568.21435546875, "logps/rejected": -553.17822265625, "loss": 0.6339, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.6776462197303772, "rewards/margins": 0.1882401704788208, "rewards/rejected": -0.865886390209198, "step": 480 }, { "epoch": 0.5284977586032559, "grad_norm": 21.125, "learning_rate": 2.473060344827586e-06, "logits/chosen": -1.7065378427505493, "logits/rejected": -1.7249234914779663, "logps/chosen": -529.4561767578125, "logps/rejected": -506.04071044921875, "loss": 0.6378, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.6808103322982788, "rewards/margins": 0.17550134658813477, "rewards/rejected": -0.8563116788864136, "step": 490 }, { "epoch": 0.5392834271461795, "grad_norm": 23.75, "learning_rate": 2.462284482758621e-06, "logits/chosen": -1.6840463876724243, "logits/rejected": -1.70773184299469, "logps/chosen": -530.1900634765625, "logps/rejected": -512.8983764648438, "loss": 0.6382, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6517452001571655, "rewards/margins": 0.17097657918930054, "rewards/rejected": -0.8227217793464661, "step": 500 }, { "epoch": 0.5392834271461795, "eval_logits/chosen": -1.7378157377243042, "eval_logits/rejected": -1.7989575862884521, "eval_logps/chosen": -563.543212890625, "eval_logps/rejected": -510.96929931640625, "eval_loss": 0.6553998589515686, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -0.7659082412719727, "eval_rewards/margins": 0.1320342719554901, "eval_rewards/rejected": -0.8979425430297852, "eval_runtime": 12.8216, "eval_samples_per_second": 7.799, "eval_steps_per_second": 1.014, "step": 500 }, { "epoch": 0.5500690956891031, "grad_norm": 23.375, "learning_rate": 2.451508620689655e-06, "logits/chosen": -1.7287929058074951, "logits/rejected": -1.747097373008728, "logps/chosen": -568.22900390625, "logps/rejected": -537.8626098632812, "loss": 0.6234, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.6510963439941406, "rewards/margins": 0.20002727210521698, "rewards/rejected": -0.8511236310005188, "step": 510 }, { "epoch": 0.5608547642320267, "grad_norm": 21.375, "learning_rate": 2.44073275862069e-06, "logits/chosen": -1.7096725702285767, "logits/rejected": -1.713513970375061, "logps/chosen": -578.6484375, "logps/rejected": -552.06884765625, "loss": 0.618, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": -0.6893950700759888, "rewards/margins": 0.22265811264514923, "rewards/rejected": -0.9120532274246216, "step": 520 }, { "epoch": 0.5716404327749502, "grad_norm": 24.375, "learning_rate": 2.429956896551724e-06, "logits/chosen": -1.7773420810699463, "logits/rejected": -1.7806062698364258, "logps/chosen": -561.3190307617188, "logps/rejected": -557.246826171875, "loss": 0.6274, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.6820749640464783, "rewards/margins": 0.23882977664470673, "rewards/rejected": -0.9209047555923462, "step": 530 }, { "epoch": 0.5824261013178739, "grad_norm": 21.625, "learning_rate": 2.419181034482759e-06, "logits/chosen": -1.7619960308074951, "logits/rejected": -1.7642993927001953, "logps/chosen": -520.9690551757812, "logps/rejected": -509.5274353027344, "loss": 0.6309, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7083293795585632, "rewards/margins": 0.1926591694355011, "rewards/rejected": -0.9009885787963867, "step": 540 }, { "epoch": 0.5932117698607975, "grad_norm": 18.75, "learning_rate": 2.408405172413793e-06, "logits/chosen": -1.7466360330581665, "logits/rejected": -1.746272325515747, "logps/chosen": -513.3010864257812, "logps/rejected": -490.46624755859375, "loss": 0.6377, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7287647128105164, "rewards/margins": 0.16981419920921326, "rewards/rejected": -0.8985790014266968, "step": 550 }, { "epoch": 0.5932117698607975, "eval_logits/chosen": -1.7363488674163818, "eval_logits/rejected": -1.7969601154327393, "eval_logps/chosen": -564.2831420898438, "eval_logps/rejected": -511.74945068359375, "eval_loss": 0.6561428904533386, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -0.80290687084198, "eval_rewards/margins": 0.13403938710689545, "eval_rewards/rejected": -0.9369462728500366, "eval_runtime": 12.8373, "eval_samples_per_second": 7.79, "eval_steps_per_second": 1.013, "step": 550 }, { "epoch": 0.6039974384037211, "grad_norm": 21.75, "learning_rate": 2.3976293103448275e-06, "logits/chosen": -1.7120717763900757, "logits/rejected": -1.717246651649475, "logps/chosen": -544.954833984375, "logps/rejected": -519.8233032226562, "loss": 0.6182, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.6614271402359009, "rewards/margins": 0.22156758606433868, "rewards/rejected": -0.8829947710037231, "step": 560 }, { "epoch": 0.6147831069466446, "grad_norm": 22.625, "learning_rate": 2.386853448275862e-06, "logits/chosen": -1.6905927658081055, "logits/rejected": -1.709628701210022, "logps/chosen": -529.6423950195312, "logps/rejected": -513.8201904296875, "loss": 0.626, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.6795625686645508, "rewards/margins": 0.20236878097057343, "rewards/rejected": -0.8819311857223511, "step": 570 }, { "epoch": 0.6255687754895682, "grad_norm": 21.375, "learning_rate": 2.3760775862068965e-06, "logits/chosen": -1.7218097448349, "logits/rejected": -1.7233638763427734, "logps/chosen": -566.1005249023438, "logps/rejected": -550.3419799804688, "loss": 0.6288, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7431906461715698, "rewards/margins": 0.20129895210266113, "rewards/rejected": -0.9444894790649414, "step": 580 }, { "epoch": 0.6363544440324919, "grad_norm": 21.625, "learning_rate": 2.365301724137931e-06, "logits/chosen": -1.718971848487854, "logits/rejected": -1.7108612060546875, "logps/chosen": -552.8258056640625, "logps/rejected": -534.0645141601562, "loss": 0.6376, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.7128755450248718, "rewards/margins": 0.18357399106025696, "rewards/rejected": -0.8964495658874512, "step": 590 }, { "epoch": 0.6471401125754154, "grad_norm": 21.75, "learning_rate": 2.3545258620689655e-06, "logits/chosen": -1.6846719980239868, "logits/rejected": -1.6692850589752197, "logps/chosen": -575.1969604492188, "logps/rejected": -551.0662841796875, "loss": 0.6428, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7172713279724121, "rewards/margins": 0.16418126225471497, "rewards/rejected": -0.8814526796340942, "step": 600 }, { "epoch": 0.6471401125754154, "eval_logits/chosen": -1.7446637153625488, "eval_logits/rejected": -1.805701494216919, "eval_logps/chosen": -564.8399658203125, "eval_logps/rejected": -512.3929443359375, "eval_loss": 0.6552779674530029, "eval_rewards/accuracies": 0.6153846383094788, "eval_rewards/chosen": -0.8307470083236694, "eval_rewards/margins": 0.13837800920009613, "eval_rewards/rejected": -0.9691251516342163, "eval_runtime": 12.8832, "eval_samples_per_second": 7.762, "eval_steps_per_second": 1.009, "step": 600 }, { "epoch": 0.657925781118339, "grad_norm": 19.625, "learning_rate": 2.3437500000000002e-06, "logits/chosen": -1.7127196788787842, "logits/rejected": -1.722394585609436, "logps/chosen": -534.408447265625, "logps/rejected": -524.6024169921875, "loss": 0.6382, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.7230153679847717, "rewards/margins": 0.17467036843299866, "rewards/rejected": -0.897685706615448, "step": 610 }, { "epoch": 0.6687114496612626, "grad_norm": 19.25, "learning_rate": 2.3329741379310345e-06, "logits/chosen": -1.731837511062622, "logits/rejected": -1.7623554468154907, "logps/chosen": -528.728515625, "logps/rejected": -500.19500732421875, "loss": 0.6294, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.695734441280365, "rewards/margins": 0.19728422164916992, "rewards/rejected": -0.8930186033248901, "step": 620 }, { "epoch": 0.6794971182041862, "grad_norm": 22.875, "learning_rate": 2.322198275862069e-06, "logits/chosen": -1.6935323476791382, "logits/rejected": -1.7253952026367188, "logps/chosen": -557.6375122070312, "logps/rejected": -530.9302368164062, "loss": 0.6425, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.7235587239265442, "rewards/margins": 0.1692447066307068, "rewards/rejected": -0.8928033709526062, "step": 630 }, { "epoch": 0.6902827867471097, "grad_norm": 22.75, "learning_rate": 2.3114224137931035e-06, "logits/chosen": -1.7822033166885376, "logits/rejected": -1.7954456806182861, "logps/chosen": -548.27294921875, "logps/rejected": -529.1452026367188, "loss": 0.6357, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.733366847038269, "rewards/margins": 0.18996267020702362, "rewards/rejected": -0.9233294725418091, "step": 640 }, { "epoch": 0.7010684552900334, "grad_norm": 23.625, "learning_rate": 2.300646551724138e-06, "logits/chosen": -1.7504284381866455, "logits/rejected": -1.7501709461212158, "logps/chosen": -563.0472412109375, "logps/rejected": -547.3125, "loss": 0.6376, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7303886413574219, "rewards/margins": 0.17333553731441498, "rewards/rejected": -0.903724193572998, "step": 650 }, { "epoch": 0.7010684552900334, "eval_logits/chosen": -1.7363975048065186, "eval_logits/rejected": -1.7971389293670654, "eval_logps/chosen": -564.5283813476562, "eval_logps/rejected": -512.2813110351562, "eval_loss": 0.6501194834709167, "eval_rewards/accuracies": 0.6442307829856873, "eval_rewards/chosen": -0.8151662349700928, "eval_rewards/margins": 0.1483728587627411, "eval_rewards/rejected": -0.9635391235351562, "eval_runtime": 12.8651, "eval_samples_per_second": 7.773, "eval_steps_per_second": 1.01, "step": 650 }, { "epoch": 0.711854123832957, "grad_norm": 21.0, "learning_rate": 2.2898706896551725e-06, "logits/chosen": -1.6629375219345093, "logits/rejected": -1.686265230178833, "logps/chosen": -579.2559814453125, "logps/rejected": -550.0233764648438, "loss": 0.6339, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7081414461135864, "rewards/margins": 0.19901154935359955, "rewards/rejected": -0.9071530103683472, "step": 660 }, { "epoch": 0.7226397923758805, "grad_norm": 24.25, "learning_rate": 2.279094827586207e-06, "logits/chosen": -1.772698998451233, "logits/rejected": -1.8019428253173828, "logps/chosen": -505.48577880859375, "logps/rejected": -490.10662841796875, "loss": 0.6425, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.7496585249900818, "rewards/margins": 0.17492921650409698, "rewards/rejected": -0.9245878458023071, "step": 670 }, { "epoch": 0.7334254609188041, "grad_norm": 23.125, "learning_rate": 2.2683189655172415e-06, "logits/chosen": -1.7226699590682983, "logits/rejected": -1.7439358234405518, "logps/chosen": -551.814208984375, "logps/rejected": -530.10888671875, "loss": 0.6147, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.7403059601783752, "rewards/margins": 0.22587330639362335, "rewards/rejected": -0.966179370880127, "step": 680 }, { "epoch": 0.7442111294617277, "grad_norm": 19.625, "learning_rate": 2.257543103448276e-06, "logits/chosen": -1.7711864709854126, "logits/rejected": -1.7668606042861938, "logps/chosen": -540.2945556640625, "logps/rejected": -520.48828125, "loss": 0.6112, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.7231726050376892, "rewards/margins": 0.24855723977088928, "rewards/rejected": -0.9717298746109009, "step": 690 }, { "epoch": 0.7549967980046514, "grad_norm": 22.375, "learning_rate": 2.24676724137931e-06, "logits/chosen": -1.7296479940414429, "logits/rejected": -1.7336111068725586, "logps/chosen": -525.4596557617188, "logps/rejected": -499.53826904296875, "loss": 0.644, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.7867623567581177, "rewards/margins": 0.18225276470184326, "rewards/rejected": -0.9690152406692505, "step": 700 }, { "epoch": 0.7549967980046514, "eval_logits/chosen": -1.7320126295089722, "eval_logits/rejected": -1.7923235893249512, "eval_logps/chosen": -565.2008056640625, "eval_logps/rejected": -512.877197265625, "eval_loss": 0.6524822115898132, "eval_rewards/accuracies": 0.6153846383094788, "eval_rewards/chosen": -0.8487869501113892, "eval_rewards/margins": 0.1445501744747162, "eval_rewards/rejected": -0.9933372139930725, "eval_runtime": 12.8132, "eval_samples_per_second": 7.804, "eval_steps_per_second": 1.015, "step": 700 }, { "epoch": 0.7657824665475749, "grad_norm": 36.25, "learning_rate": 2.235991379310345e-06, "logits/chosen": -1.6764347553253174, "logits/rejected": -1.6931612491607666, "logps/chosen": -555.44091796875, "logps/rejected": -529.9513549804688, "loss": 0.6177, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.7847446203231812, "rewards/margins": 0.23240776360034943, "rewards/rejected": -1.0171524286270142, "step": 710 }, { "epoch": 0.7765681350904985, "grad_norm": 21.375, "learning_rate": 2.225215517241379e-06, "logits/chosen": -1.7237228155136108, "logits/rejected": -1.7398170232772827, "logps/chosen": -542.1912231445312, "logps/rejected": -518.4722900390625, "loss": 0.6354, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.7876893281936646, "rewards/margins": 0.1978759765625, "rewards/rejected": -0.9855653643608093, "step": 720 }, { "epoch": 0.7873538036334221, "grad_norm": 23.0, "learning_rate": 2.214439655172414e-06, "logits/chosen": -1.752096176147461, "logits/rejected": -1.7941181659698486, "logps/chosen": -548.2041015625, "logps/rejected": -513.8216552734375, "loss": 0.6555, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.7717311978340149, "rewards/margins": 0.14397189021110535, "rewards/rejected": -0.9157029986381531, "step": 730 }, { "epoch": 0.7981394721763457, "grad_norm": 19.75, "learning_rate": 2.203663793103448e-06, "logits/chosen": -1.709242820739746, "logits/rejected": -1.7145198583602905, "logps/chosen": -539.0947265625, "logps/rejected": -521.4993896484375, "loss": 0.6394, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.7486387491226196, "rewards/margins": 0.18467199802398682, "rewards/rejected": -0.933310866355896, "step": 740 }, { "epoch": 0.8089251407192692, "grad_norm": 23.875, "learning_rate": 2.192887931034483e-06, "logits/chosen": -1.632927656173706, "logits/rejected": -1.6515719890594482, "logps/chosen": -559.2658081054688, "logps/rejected": -526.0423583984375, "loss": 0.6322, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.7207778692245483, "rewards/margins": 0.19092032313346863, "rewards/rejected": -0.911698043346405, "step": 750 }, { "epoch": 0.8089251407192692, "eval_logits/chosen": -1.717968463897705, "eval_logits/rejected": -1.7769808769226074, "eval_logps/chosen": -564.5967407226562, "eval_logps/rejected": -512.3129272460938, "eval_loss": 0.6494570970535278, "eval_rewards/accuracies": 0.6634615659713745, "eval_rewards/chosen": -0.8185831308364868, "eval_rewards/margins": 0.1465369611978531, "eval_rewards/rejected": -0.9651200175285339, "eval_runtime": 12.8208, "eval_samples_per_second": 7.8, "eval_steps_per_second": 1.014, "step": 750 }, { "epoch": 0.8197108092621929, "grad_norm": 22.25, "learning_rate": 2.182112068965517e-06, "logits/chosen": -1.7085649967193604, "logits/rejected": -1.7567522525787354, "logps/chosen": -565.7103271484375, "logps/rejected": -539.7507934570312, "loss": 0.626, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7772555351257324, "rewards/margins": 0.20999836921691895, "rewards/rejected": -0.9872539639472961, "step": 760 }, { "epoch": 0.8304964778051165, "grad_norm": 22.125, "learning_rate": 2.1713362068965515e-06, "logits/chosen": -1.7073513269424438, "logits/rejected": -1.7328789234161377, "logps/chosen": -536.3829345703125, "logps/rejected": -514.23779296875, "loss": 0.6285, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.7473222613334656, "rewards/margins": 0.20041945576667786, "rewards/rejected": -0.947741687297821, "step": 770 }, { "epoch": 0.84128214634804, "grad_norm": 21.75, "learning_rate": 2.160560344827586e-06, "logits/chosen": -1.6973340511322021, "logits/rejected": -1.691319465637207, "logps/chosen": -557.1148071289062, "logps/rejected": -558.3483276367188, "loss": 0.6268, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.7272000312805176, "rewards/margins": 0.21510732173919678, "rewards/rejected": -0.9423073530197144, "step": 780 }, { "epoch": 0.8520678148909636, "grad_norm": 20.875, "learning_rate": 2.1497844827586205e-06, "logits/chosen": -1.679863691329956, "logits/rejected": -1.6904770135879517, "logps/chosen": -543.8391723632812, "logps/rejected": -531.5890502929688, "loss": 0.625, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.756318986415863, "rewards/margins": 0.20582714676856995, "rewards/rejected": -0.9621461629867554, "step": 790 }, { "epoch": 0.8628534834338872, "grad_norm": 21.875, "learning_rate": 2.139008620689655e-06, "logits/chosen": -1.7281272411346436, "logits/rejected": -1.745347261428833, "logps/chosen": -547.1482543945312, "logps/rejected": -527.178466796875, "loss": 0.6278, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7604348063468933, "rewards/margins": 0.19645504653453827, "rewards/rejected": -0.9568899273872375, "step": 800 }, { "epoch": 0.8628534834338872, "eval_logits/chosen": -1.7295496463775635, "eval_logits/rejected": -1.7896690368652344, "eval_logps/chosen": -565.3712158203125, "eval_logps/rejected": -513.1586303710938, "eval_loss": 0.6495404243469238, "eval_rewards/accuracies": 0.6730769276618958, "eval_rewards/chosen": -0.8573046922683716, "eval_rewards/margins": 0.15010415017604828, "eval_rewards/rejected": -1.0074087381362915, "eval_runtime": 12.8304, "eval_samples_per_second": 7.794, "eval_steps_per_second": 1.013, "step": 800 }, { "epoch": 0.8736391519768109, "grad_norm": 24.0, "learning_rate": 2.1282327586206895e-06, "logits/chosen": -1.6808828115463257, "logits/rejected": -1.7251535654067993, "logps/chosen": -545.0057373046875, "logps/rejected": -514.2200317382812, "loss": 0.6473, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7858768701553345, "rewards/margins": 0.16710534691810608, "rewards/rejected": -0.9529821276664734, "step": 810 }, { "epoch": 0.8844248205197344, "grad_norm": 21.625, "learning_rate": 2.117456896551724e-06, "logits/chosen": -1.7713664770126343, "logits/rejected": -1.7678531408309937, "logps/chosen": -544.8568115234375, "logps/rejected": -516.0807495117188, "loss": 0.6342, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8098779916763306, "rewards/margins": 0.199488565325737, "rewards/rejected": -1.009366512298584, "step": 820 }, { "epoch": 0.895210489062658, "grad_norm": 22.375, "learning_rate": 2.106681034482759e-06, "logits/chosen": -1.7336772680282593, "logits/rejected": -1.7515900135040283, "logps/chosen": -549.8850708007812, "logps/rejected": -530.6832275390625, "loss": 0.6074, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7838916778564453, "rewards/margins": 0.2659648060798645, "rewards/rejected": -1.049856424331665, "step": 830 }, { "epoch": 0.9059961576055816, "grad_norm": 21.375, "learning_rate": 2.0959051724137932e-06, "logits/chosen": -1.7343952655792236, "logits/rejected": -1.7316901683807373, "logps/chosen": -574.8560791015625, "logps/rejected": -536.2384033203125, "loss": 0.6342, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.7904583811759949, "rewards/margins": 0.1979624629020691, "rewards/rejected": -0.988420844078064, "step": 840 }, { "epoch": 0.9167818261485051, "grad_norm": 21.5, "learning_rate": 2.085129310344828e-06, "logits/chosen": -1.733007788658142, "logits/rejected": -1.727246880531311, "logps/chosen": -524.6848754882812, "logps/rejected": -510.7845764160156, "loss": 0.6307, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.8132136464118958, "rewards/margins": 0.20207151770591736, "rewards/rejected": -1.0152852535247803, "step": 850 }, { "epoch": 0.9167818261485051, "eval_logits/chosen": -1.724961757659912, "eval_logits/rejected": -1.7844202518463135, "eval_logps/chosen": -565.8350830078125, "eval_logps/rejected": -513.8818969726562, "eval_loss": 0.6429941058158875, "eval_rewards/accuracies": 0.6538461446762085, "eval_rewards/chosen": -0.8805010318756104, "eval_rewards/margins": 0.16307112574577332, "eval_rewards/rejected": -1.043572187423706, "eval_runtime": 12.8678, "eval_samples_per_second": 7.771, "eval_steps_per_second": 1.01, "step": 850 }, { "epoch": 0.9275674946914287, "grad_norm": 21.0, "learning_rate": 2.0743534482758622e-06, "logits/chosen": -1.678770661354065, "logits/rejected": -1.6889280080795288, "logps/chosen": -549.0493774414062, "logps/rejected": -529.0235595703125, "loss": 0.6311, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.8022142648696899, "rewards/margins": 0.21385040879249573, "rewards/rejected": -1.0160646438598633, "step": 860 }, { "epoch": 0.9383531632343524, "grad_norm": 21.25, "learning_rate": 2.063577586206897e-06, "logits/chosen": -1.677046775817871, "logits/rejected": -1.715746521949768, "logps/chosen": -558.4188842773438, "logps/rejected": -526.5531616210938, "loss": 0.6388, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7892870903015137, "rewards/margins": 0.1829388290643692, "rewards/rejected": -0.9722259640693665, "step": 870 }, { "epoch": 0.949138831777276, "grad_norm": 22.25, "learning_rate": 2.0528017241379312e-06, "logits/chosen": -1.71084463596344, "logits/rejected": -1.7499706745147705, "logps/chosen": -547.9580078125, "logps/rejected": -525.6466674804688, "loss": 0.6215, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7694340348243713, "rewards/margins": 0.2342531681060791, "rewards/rejected": -1.0036872625350952, "step": 880 }, { "epoch": 0.9599245003201995, "grad_norm": 22.5, "learning_rate": 2.0420258620689655e-06, "logits/chosen": -1.7067375183105469, "logits/rejected": -1.7360172271728516, "logps/chosen": -553.2872924804688, "logps/rejected": -533.8543090820312, "loss": 0.6409, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.7895435690879822, "rewards/margins": 0.18897457420825958, "rewards/rejected": -0.9785181879997253, "step": 890 }, { "epoch": 0.9707101688631231, "grad_norm": 22.125, "learning_rate": 2.0312500000000002e-06, "logits/chosen": -1.7305114269256592, "logits/rejected": -1.733814001083374, "logps/chosen": -544.6137084960938, "logps/rejected": -523.6353149414062, "loss": 0.6191, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.786180317401886, "rewards/margins": 0.22723452746868134, "rewards/rejected": -1.013414978981018, "step": 900 }, { "epoch": 0.9707101688631231, "eval_logits/chosen": -1.7138627767562866, "eval_logits/rejected": -1.773409128189087, "eval_logps/chosen": -565.8074340820312, "eval_logps/rejected": -513.8848266601562, "eval_loss": 0.6439433097839355, "eval_rewards/accuracies": 0.6634615659713745, "eval_rewards/chosen": -0.8791185617446899, "eval_rewards/margins": 0.16459915041923523, "eval_rewards/rejected": -1.043717622756958, "eval_runtime": 12.8601, "eval_samples_per_second": 7.776, "eval_steps_per_second": 1.011, "step": 900 }, { "epoch": 0.9814958374060467, "grad_norm": 20.5, "learning_rate": 2.0204741379310345e-06, "logits/chosen": -1.7137806415557861, "logits/rejected": -1.714971899986267, "logps/chosen": -546.196533203125, "logps/rejected": -519.9130859375, "loss": 0.6336, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7919280529022217, "rewards/margins": 0.189212828874588, "rewards/rejected": -0.9811409115791321, "step": 910 }, { "epoch": 0.9922815059489704, "grad_norm": 21.375, "learning_rate": 2.0096982758620693e-06, "logits/chosen": -1.7518551349639893, "logits/rejected": -1.789482831954956, "logps/chosen": -539.8787841796875, "logps/rejected": -518.9136962890625, "loss": 0.6408, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8202158808708191, "rewards/margins": 0.1931016743183136, "rewards/rejected": -1.0133177042007446, "step": 920 } ], "logging_steps": 10, "max_steps": 2784, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }