Files
ModelHub XC f72327573d 初始化项目,由ModelHub XC社区提供模型
Model: chenyongxi/Qwen2.5-1.5B-SFT-DPO-InfinityPreference
Source: Original Platform
2026-04-14 00:35:51 +08:00

1703 lines
58 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 50,
"global_step": 928,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01078566854292359,
"grad_norm": 20.25,
"learning_rate": 2.990301724137931e-06,
"logits/chosen": -1.2338563203811646,
"logits/rejected": -1.2257438898086548,
"logps/chosen": -490.282470703125,
"logps/rejected": -482.52545166015625,
"loss": 0.6903,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0017994878580793738,
"rewards/margins": 0.006464972160756588,
"rewards/rejected": -0.004665483720600605,
"step": 10
},
{
"epoch": 0.02157133708584718,
"grad_norm": 22.0,
"learning_rate": 2.979525862068966e-06,
"logits/chosen": -1.2723968029022217,
"logits/rejected": -1.2785792350769043,
"logps/chosen": -520.5911254882812,
"logps/rejected": -514.4813842773438,
"loss": 0.6856,
"rewards/accuracies": 0.6078125238418579,
"rewards/chosen": 0.0043027508072555065,
"rewards/margins": 0.0164328720420599,
"rewards/rejected": -0.01213012170046568,
"step": 20
},
{
"epoch": 0.03235700562877077,
"grad_norm": 19.125,
"learning_rate": 2.96875e-06,
"logits/chosen": -1.2637840509414673,
"logits/rejected": -1.246565818786621,
"logps/chosen": -507.4403381347656,
"logps/rejected": -501.22479248046875,
"loss": 0.6789,
"rewards/accuracies": 0.6468750238418579,
"rewards/chosen": -0.0017711544642224908,
"rewards/margins": 0.030538281425833702,
"rewards/rejected": -0.032309435307979584,
"step": 30
},
{
"epoch": 0.04314267417169436,
"grad_norm": 20.75,
"learning_rate": 2.9579741379310345e-06,
"logits/chosen": -1.2590439319610596,
"logits/rejected": -1.2330172061920166,
"logps/chosen": -485.248291015625,
"logps/rejected": -482.63458251953125,
"loss": 0.6765,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -0.00601751497015357,
"rewards/margins": 0.03583287447690964,
"rewards/rejected": -0.041850391775369644,
"step": 40
},
{
"epoch": 0.05392834271461795,
"grad_norm": 20.25,
"learning_rate": 2.947198275862069e-06,
"logits/chosen": -1.3426216840744019,
"logits/rejected": -1.3334815502166748,
"logps/chosen": -532.8280029296875,
"logps/rejected": -514.1970825195312,
"loss": 0.6684,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": -0.00931160431355238,
"rewards/margins": 0.053057052195072174,
"rewards/rejected": -0.06236865371465683,
"step": 50
},
{
"epoch": 0.05392834271461795,
"eval_logits/chosen": -1.3231499195098877,
"eval_logits/rejected": -1.3642845153808594,
"eval_logps/chosen": -549.2651977539062,
"eval_logps/rejected": -494.54583740234375,
"eval_loss": 0.6826924681663513,
"eval_rewards/accuracies": 0.6153846383094788,
"eval_rewards/chosen": -0.05200628936290741,
"eval_rewards/margins": 0.02476159855723381,
"eval_rewards/rejected": -0.07676788419485092,
"eval_runtime": 13.2928,
"eval_samples_per_second": 7.523,
"eval_steps_per_second": 0.978,
"step": 50
},
{
"epoch": 0.06471401125754155,
"grad_norm": 20.25,
"learning_rate": 2.9364224137931035e-06,
"logits/chosen": -1.3449946641921997,
"logits/rejected": -1.343201756477356,
"logps/chosen": -548.1566772460938,
"logps/rejected": -531.6286010742188,
"loss": 0.6705,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.028984328731894493,
"rewards/margins": 0.04919930547475815,
"rewards/rejected": -0.07818363606929779,
"step": 60
},
{
"epoch": 0.07549967980046514,
"grad_norm": 20.875,
"learning_rate": 2.925646551724138e-06,
"logits/chosen": -1.3576488494873047,
"logits/rejected": -1.3385895490646362,
"logps/chosen": -516.7993774414062,
"logps/rejected": -502.6502990722656,
"loss": 0.6624,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.03774970397353172,
"rewards/margins": 0.0672496110200882,
"rewards/rejected": -0.10499931871891022,
"step": 70
},
{
"epoch": 0.08628534834338872,
"grad_norm": 19.5,
"learning_rate": 2.9148706896551725e-06,
"logits/chosen": -1.4393236637115479,
"logits/rejected": -1.4375989437103271,
"logps/chosen": -534.2485961914062,
"logps/rejected": -516.8245849609375,
"loss": 0.6547,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.05567573383450508,
"rewards/margins": 0.08416923880577087,
"rewards/rejected": -0.13984496891498566,
"step": 80
},
{
"epoch": 0.09707101688631231,
"grad_norm": 20.625,
"learning_rate": 2.904094827586207e-06,
"logits/chosen": -1.4080358743667603,
"logits/rejected": -1.399465799331665,
"logps/chosen": -503.4671936035156,
"logps/rejected": -488.74383544921875,
"loss": 0.6565,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": -0.07379738986492157,
"rewards/margins": 0.08224000781774521,
"rewards/rejected": -0.15603742003440857,
"step": 90
},
{
"epoch": 0.1078566854292359,
"grad_norm": 20.375,
"learning_rate": 2.8933189655172415e-06,
"logits/chosen": -1.3748817443847656,
"logits/rejected": -1.366629958152771,
"logps/chosen": -518.0892944335938,
"logps/rejected": -499.7432556152344,
"loss": 0.6541,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": -0.09199288487434387,
"rewards/margins": 0.08948174864053726,
"rewards/rejected": -0.18147462606430054,
"step": 100
},
{
"epoch": 0.1078566854292359,
"eval_logits/chosen": -1.4332343339920044,
"eval_logits/rejected": -1.479236364364624,
"eval_logps/chosen": -551.7933349609375,
"eval_logps/rejected": -497.4646911621094,
"eval_loss": 0.6763917803764343,
"eval_rewards/accuracies": 0.5769230723381042,
"eval_rewards/chosen": -0.17841331660747528,
"eval_rewards/margins": 0.04429765045642853,
"eval_rewards/rejected": -0.222710981965065,
"eval_runtime": 12.8741,
"eval_samples_per_second": 7.768,
"eval_steps_per_second": 1.01,
"step": 100
},
{
"epoch": 0.1186423539721595,
"grad_norm": 19.25,
"learning_rate": 2.8825431034482758e-06,
"logits/chosen": -1.436632513999939,
"logits/rejected": -1.4507520198822021,
"logps/chosen": -508.725830078125,
"logps/rejected": -500.9781799316406,
"loss": 0.6421,
"rewards/accuracies": 0.729687511920929,
"rewards/chosen": -0.11871524900197983,
"rewards/margins": 0.11672022193670273,
"rewards/rejected": -0.23543548583984375,
"step": 110
},
{
"epoch": 0.1294280225150831,
"grad_norm": 19.375,
"learning_rate": 2.8717672413793105e-06,
"logits/chosen": -1.4648942947387695,
"logits/rejected": -1.474923849105835,
"logps/chosen": -522.5699462890625,
"logps/rejected": -521.1087036132812,
"loss": 0.635,
"rewards/accuracies": 0.7171875238418579,
"rewards/chosen": -0.14662253856658936,
"rewards/margins": 0.13384078443050385,
"rewards/rejected": -0.2804633677005768,
"step": 120
},
{
"epoch": 0.14021369105800668,
"grad_norm": 21.0,
"learning_rate": 2.860991379310345e-06,
"logits/chosen": -1.5117241144180298,
"logits/rejected": -1.5187674760818481,
"logps/chosen": -526.3902587890625,
"logps/rejected": -518.88525390625,
"loss": 0.6413,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.1973995864391327,
"rewards/margins": 0.12558409571647644,
"rewards/rejected": -0.32298368215560913,
"step": 130
},
{
"epoch": 0.15099935960093028,
"grad_norm": 19.75,
"learning_rate": 2.8502155172413795e-06,
"logits/chosen": -1.5119014978408813,
"logits/rejected": -1.5130690336227417,
"logps/chosen": -533.3895263671875,
"logps/rejected": -527.9937744140625,
"loss": 0.6401,
"rewards/accuracies": 0.6703125238418579,
"rewards/chosen": -0.22728869318962097,
"rewards/margins": 0.13429734110832214,
"rewards/rejected": -0.3615860342979431,
"step": 140
},
{
"epoch": 0.16178502814385384,
"grad_norm": 19.375,
"learning_rate": 2.839439655172414e-06,
"logits/chosen": -1.5527749061584473,
"logits/rejected": -1.5703589916229248,
"logps/chosen": -515.7926025390625,
"logps/rejected": -503.88360595703125,
"loss": 0.6397,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2661042809486389,
"rewards/margins": 0.13428668677806854,
"rewards/rejected": -0.40039095282554626,
"step": 150
},
{
"epoch": 0.16178502814385384,
"eval_logits/chosen": -1.5442339181900024,
"eval_logits/rejected": -1.5949413776397705,
"eval_logps/chosen": -555.982666015625,
"eval_logps/rejected": -502.18585205078125,
"eval_loss": 0.6722157001495361,
"eval_rewards/accuracies": 0.5865384340286255,
"eval_rewards/chosen": -0.38787999749183655,
"eval_rewards/margins": 0.07088876515626907,
"eval_rewards/rejected": -0.4587687849998474,
"eval_runtime": 12.8342,
"eval_samples_per_second": 7.792,
"eval_steps_per_second": 1.013,
"step": 150
},
{
"epoch": 0.17257069668677744,
"grad_norm": 20.0,
"learning_rate": 2.8286637931034485e-06,
"logits/chosen": -1.5724695920944214,
"logits/rejected": -1.5730245113372803,
"logps/chosen": -506.13671875,
"logps/rejected": -495.46142578125,
"loss": 0.6699,
"rewards/accuracies": 0.659375011920929,
"rewards/chosen": -0.29719287157058716,
"rewards/margins": 0.10155584663152695,
"rewards/rejected": -0.3987486958503723,
"step": 160
},
{
"epoch": 0.18335636522970103,
"grad_norm": 21.125,
"learning_rate": 2.817887931034483e-06,
"logits/chosen": -1.5566643476486206,
"logits/rejected": -1.555086612701416,
"logps/chosen": -557.1043701171875,
"logps/rejected": -539.7008666992188,
"loss": 0.6322,
"rewards/accuracies": 0.6656249761581421,
"rewards/chosen": -0.32251477241516113,
"rewards/margins": 0.1617726981639862,
"rewards/rejected": -0.48428741097450256,
"step": 170
},
{
"epoch": 0.19414203377262462,
"grad_norm": 20.5,
"learning_rate": 2.807112068965517e-06,
"logits/chosen": -1.5289297103881836,
"logits/rejected": -1.5290358066558838,
"logps/chosen": -527.3707275390625,
"logps/rejected": -511.8185119628906,
"loss": 0.6347,
"rewards/accuracies": 0.667187511920929,
"rewards/chosen": -0.3478879928588867,
"rewards/margins": 0.1548273265361786,
"rewards/rejected": -0.5027152895927429,
"step": 180
},
{
"epoch": 0.20492770231554822,
"grad_norm": 21.0,
"learning_rate": 2.796336206896552e-06,
"logits/chosen": -1.5106637477874756,
"logits/rejected": -1.4951450824737549,
"logps/chosen": -535.6061401367188,
"logps/rejected": -528.7681884765625,
"loss": 0.623,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.35832709074020386,
"rewards/margins": 0.20923912525177002,
"rewards/rejected": -0.5675662755966187,
"step": 190
},
{
"epoch": 0.2157133708584718,
"grad_norm": 19.75,
"learning_rate": 2.785560344827586e-06,
"logits/chosen": -1.6122379302978516,
"logits/rejected": -1.6149158477783203,
"logps/chosen": -556.2003173828125,
"logps/rejected": -536.0861206054688,
"loss": 0.6237,
"rewards/accuracies": 0.707812488079071,
"rewards/chosen": -0.40547609329223633,
"rewards/margins": 0.18066053092479706,
"rewards/rejected": -0.5861365795135498,
"step": 200
},
{
"epoch": 0.2157133708584718,
"eval_logits/chosen": -1.6185290813446045,
"eval_logits/rejected": -1.671698808670044,
"eval_logps/chosen": -559.1907348632812,
"eval_logps/rejected": -505.73992919921875,
"eval_loss": 0.6703996062278748,
"eval_rewards/accuracies": 0.5769230723381042,
"eval_rewards/chosen": -0.5482814311981201,
"eval_rewards/margins": 0.08819277584552765,
"eval_rewards/rejected": -0.6364741921424866,
"eval_runtime": 12.8368,
"eval_samples_per_second": 7.79,
"eval_steps_per_second": 1.013,
"step": 200
},
{
"epoch": 0.2264990394013954,
"grad_norm": 20.0,
"learning_rate": 2.774784482758621e-06,
"logits/chosen": -1.602872610092163,
"logits/rejected": -1.6062767505645752,
"logps/chosen": -494.86566162109375,
"logps/rejected": -486.106689453125,
"loss": 0.6313,
"rewards/accuracies": 0.676562488079071,
"rewards/chosen": -0.41412362456321716,
"rewards/margins": 0.16503319144248962,
"rewards/rejected": -0.5791568160057068,
"step": 210
},
{
"epoch": 0.237284707944319,
"grad_norm": 22.875,
"learning_rate": 2.764008620689655e-06,
"logits/chosen": -1.6498100757598877,
"logits/rejected": -1.6630780696868896,
"logps/chosen": -540.1375732421875,
"logps/rejected": -533.9187622070312,
"loss": 0.6307,
"rewards/accuracies": 0.6546875238418579,
"rewards/chosen": -0.4561203420162201,
"rewards/margins": 0.18029221892356873,
"rewards/rejected": -0.636412501335144,
"step": 220
},
{
"epoch": 0.2480703764872426,
"grad_norm": 19.5,
"learning_rate": 2.75323275862069e-06,
"logits/chosen": -1.667067527770996,
"logits/rejected": -1.681429147720337,
"logps/chosen": -525.0696411132812,
"logps/rejected": -508.50067138671875,
"loss": 0.6261,
"rewards/accuracies": 0.6703125238418579,
"rewards/chosen": -0.4919908940792084,
"rewards/margins": 0.19147078692913055,
"rewards/rejected": -0.6834616661071777,
"step": 230
},
{
"epoch": 0.2588560450301662,
"grad_norm": 22.375,
"learning_rate": 2.742456896551724e-06,
"logits/chosen": -1.646775484085083,
"logits/rejected": -1.6376352310180664,
"logps/chosen": -517.5604248046875,
"logps/rejected": -504.1005859375,
"loss": 0.6228,
"rewards/accuracies": 0.692187488079071,
"rewards/chosen": -0.4839399755001068,
"rewards/margins": 0.1933339387178421,
"rewards/rejected": -0.6772739291191101,
"step": 240
},
{
"epoch": 0.26964171357308975,
"grad_norm": 20.125,
"learning_rate": 2.7316810344827584e-06,
"logits/chosen": -1.630406379699707,
"logits/rejected": -1.6203149557113647,
"logps/chosen": -511.6064453125,
"logps/rejected": -504.46112060546875,
"loss": 0.6224,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": -0.5028296709060669,
"rewards/margins": 0.19740687310695648,
"rewards/rejected": -0.700236439704895,
"step": 250
},
{
"epoch": 0.26964171357308975,
"eval_logits/chosen": -1.671502947807312,
"eval_logits/rejected": -1.727898359298706,
"eval_logps/chosen": -561.3909912109375,
"eval_logps/rejected": -507.87896728515625,
"eval_loss": 0.6769301891326904,
"eval_rewards/accuracies": 0.567307710647583,
"eval_rewards/chosen": -0.6582961678504944,
"eval_rewards/margins": 0.08512917906045914,
"eval_rewards/rejected": -0.7434254288673401,
"eval_runtime": 12.8313,
"eval_samples_per_second": 7.793,
"eval_steps_per_second": 1.013,
"step": 250
},
{
"epoch": 0.28042738211601337,
"grad_norm": 22.75,
"learning_rate": 2.720905172413793e-06,
"logits/chosen": -1.6629798412322998,
"logits/rejected": -1.664310097694397,
"logps/chosen": -564.4110717773438,
"logps/rejected": -550.1747436523438,
"loss": 0.6344,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.51663738489151,
"rewards/margins": 0.17350056767463684,
"rewards/rejected": -0.6901379823684692,
"step": 260
},
{
"epoch": 0.29121305065893693,
"grad_norm": 20.5,
"learning_rate": 2.7101293103448275e-06,
"logits/chosen": -1.6739919185638428,
"logits/rejected": -1.6669447422027588,
"logps/chosen": -562.1046752929688,
"logps/rejected": -553.902099609375,
"loss": 0.6256,
"rewards/accuracies": 0.6656249761581421,
"rewards/chosen": -0.5245037078857422,
"rewards/margins": 0.19063076376914978,
"rewards/rejected": -0.7151345014572144,
"step": 270
},
{
"epoch": 0.30199871920186055,
"grad_norm": 19.875,
"learning_rate": 2.699353448275862e-06,
"logits/chosen": -1.6442272663116455,
"logits/rejected": -1.6505457162857056,
"logps/chosen": -529.9203491210938,
"logps/rejected": -513.3055419921875,
"loss": 0.6378,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.5445322394371033,
"rewards/margins": 0.16621330380439758,
"rewards/rejected": -0.710745632648468,
"step": 280
},
{
"epoch": 0.3127843877447841,
"grad_norm": 21.125,
"learning_rate": 2.6885775862068965e-06,
"logits/chosen": -1.6472547054290771,
"logits/rejected": -1.6532646417617798,
"logps/chosen": -527.1898193359375,
"logps/rejected": -516.4867553710938,
"loss": 0.6132,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.49309319257736206,
"rewards/margins": 0.21812161803245544,
"rewards/rejected": -0.7112148404121399,
"step": 290
},
{
"epoch": 0.3235700562877077,
"grad_norm": 19.0,
"learning_rate": 2.677801724137931e-06,
"logits/chosen": -1.6490532159805298,
"logits/rejected": -1.6721302270889282,
"logps/chosen": -567.3697509765625,
"logps/rejected": -550.6924438476562,
"loss": 0.6216,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.5456413626670837,
"rewards/margins": 0.2058580368757248,
"rewards/rejected": -0.7514994144439697,
"step": 300
},
{
"epoch": 0.3235700562877077,
"eval_logits/chosen": -1.6934152841567993,
"eval_logits/rejected": -1.7509509325027466,
"eval_logps/chosen": -562.0770874023438,
"eval_logps/rejected": -508.9422302246094,
"eval_loss": 0.6661080121994019,
"eval_rewards/accuracies": 0.5865384340286255,
"eval_rewards/chosen": -0.692603349685669,
"eval_rewards/margins": 0.10398232936859131,
"eval_rewards/rejected": -0.7965856194496155,
"eval_runtime": 12.8789,
"eval_samples_per_second": 7.765,
"eval_steps_per_second": 1.009,
"step": 300
},
{
"epoch": 0.3343557248306313,
"grad_norm": 21.875,
"learning_rate": 2.6670258620689655e-06,
"logits/chosen": -1.7095540761947632,
"logits/rejected": -1.7263736724853516,
"logps/chosen": -530.0485229492188,
"logps/rejected": -530.2100830078125,
"loss": 0.6314,
"rewards/accuracies": 0.6468750238418579,
"rewards/chosen": -0.6013873815536499,
"rewards/margins": 0.17918090522289276,
"rewards/rejected": -0.7805682420730591,
"step": 310
},
{
"epoch": 0.3451413933735549,
"grad_norm": 20.375,
"learning_rate": 2.6562499999999998e-06,
"logits/chosen": -1.7012121677398682,
"logits/rejected": -1.6956939697265625,
"logps/chosen": -520.4534301757812,
"logps/rejected": -511.6394958496094,
"loss": 0.6502,
"rewards/accuracies": 0.629687488079071,
"rewards/chosen": -0.6033864617347717,
"rewards/margins": 0.14426395297050476,
"rewards/rejected": -0.7476503252983093,
"step": 320
},
{
"epoch": 0.3559270619164785,
"grad_norm": 20.625,
"learning_rate": 2.6454741379310345e-06,
"logits/chosen": -1.6911704540252686,
"logits/rejected": -1.6552823781967163,
"logps/chosen": -537.2453002929688,
"logps/rejected": -534.7839965820312,
"loss": 0.6255,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -0.568359375,
"rewards/margins": 0.22105315327644348,
"rewards/rejected": -0.7894124388694763,
"step": 330
},
{
"epoch": 0.36671273045940206,
"grad_norm": 21.5,
"learning_rate": 2.6346982758620688e-06,
"logits/chosen": -1.664223313331604,
"logits/rejected": -1.6563825607299805,
"logps/chosen": -565.0128784179688,
"logps/rejected": -551.3712158203125,
"loss": 0.6117,
"rewards/accuracies": 0.692187488079071,
"rewards/chosen": -0.5529137849807739,
"rewards/margins": 0.23182833194732666,
"rewards/rejected": -0.7847420573234558,
"step": 340
},
{
"epoch": 0.3774983990023257,
"grad_norm": 21.75,
"learning_rate": 2.6239224137931035e-06,
"logits/chosen": -1.6517295837402344,
"logits/rejected": -1.6636890172958374,
"logps/chosen": -499.03717041015625,
"logps/rejected": -469.7442932128906,
"loss": 0.6454,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.559511125087738,
"rewards/margins": 0.14985349774360657,
"rewards/rejected": -0.7093645930290222,
"step": 350
},
{
"epoch": 0.3774983990023257,
"eval_logits/chosen": -1.7148549556732178,
"eval_logits/rejected": -1.7745919227600098,
"eval_logps/chosen": -562.5575561523438,
"eval_logps/rejected": -509.7618713378906,
"eval_loss": 0.6621462106704712,
"eval_rewards/accuracies": 0.625,
"eval_rewards/chosen": -0.7166208624839783,
"eval_rewards/margins": 0.12094759196043015,
"eval_rewards/rejected": -0.8375685811042786,
"eval_runtime": 12.8579,
"eval_samples_per_second": 7.777,
"eval_steps_per_second": 1.011,
"step": 350
},
{
"epoch": 0.38828406754524925,
"grad_norm": 18.625,
"learning_rate": 2.613146551724138e-06,
"logits/chosen": -1.6570751667022705,
"logits/rejected": -1.673710584640503,
"logps/chosen": -550.8926391601562,
"logps/rejected": -530.0281982421875,
"loss": 0.635,
"rewards/accuracies": 0.682812511920929,
"rewards/chosen": -0.5474696755409241,
"rewards/margins": 0.180132657289505,
"rewards/rejected": -0.7276023626327515,
"step": 360
},
{
"epoch": 0.39906973608817287,
"grad_norm": 22.125,
"learning_rate": 2.6023706896551725e-06,
"logits/chosen": -1.7166540622711182,
"logits/rejected": -1.7119417190551758,
"logps/chosen": -535.0701904296875,
"logps/rejected": -521.47509765625,
"loss": 0.6346,
"rewards/accuracies": 0.6421874761581421,
"rewards/chosen": -0.6037973761558533,
"rewards/margins": 0.18038420379161835,
"rewards/rejected": -0.7841815948486328,
"step": 370
},
{
"epoch": 0.40985540463109643,
"grad_norm": 21.5,
"learning_rate": 2.591594827586207e-06,
"logits/chosen": -1.711259126663208,
"logits/rejected": -1.7223602533340454,
"logps/chosen": -554.1033935546875,
"logps/rejected": -536.9190673828125,
"loss": 0.6215,
"rewards/accuracies": 0.667187511920929,
"rewards/chosen": -0.5765694379806519,
"rewards/margins": 0.21333327889442444,
"rewards/rejected": -0.7899028062820435,
"step": 380
},
{
"epoch": 0.42064107317402,
"grad_norm": 20.125,
"learning_rate": 2.580818965517241e-06,
"logits/chosen": -1.674808144569397,
"logits/rejected": -1.7261720895767212,
"logps/chosen": -534.8923950195312,
"logps/rejected": -509.17437744140625,
"loss": 0.6234,
"rewards/accuracies": 0.645312488079071,
"rewards/chosen": -0.5851498246192932,
"rewards/margins": 0.20684213936328888,
"rewards/rejected": -0.7919918298721313,
"step": 390
},
{
"epoch": 0.4314267417169436,
"grad_norm": 21.375,
"learning_rate": 2.5700431034482762e-06,
"logits/chosen": -1.7043625116348267,
"logits/rejected": -1.721663236618042,
"logps/chosen": -547.33935546875,
"logps/rejected": -535.8040771484375,
"loss": 0.6314,
"rewards/accuracies": 0.645312488079071,
"rewards/chosen": -0.6050974130630493,
"rewards/margins": 0.18475469946861267,
"rewards/rejected": -0.7898520827293396,
"step": 400
},
{
"epoch": 0.4314267417169436,
"eval_logits/chosen": -1.7199591398239136,
"eval_logits/rejected": -1.7800374031066895,
"eval_logps/chosen": -562.4886474609375,
"eval_logps/rejected": -509.6477355957031,
"eval_loss": 0.6607488989830017,
"eval_rewards/accuracies": 0.6153846383094788,
"eval_rewards/chosen": -0.7131773829460144,
"eval_rewards/margins": 0.11868361383676529,
"eval_rewards/rejected": -0.831861138343811,
"eval_runtime": 12.8809,
"eval_samples_per_second": 7.763,
"eval_steps_per_second": 1.009,
"step": 400
},
{
"epoch": 0.4422124102598672,
"grad_norm": 20.25,
"learning_rate": 2.5592672413793105e-06,
"logits/chosen": -1.7056515216827393,
"logits/rejected": -1.7032356262207031,
"logps/chosen": -544.6566162109375,
"logps/rejected": -526.3981323242188,
"loss": 0.6346,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -0.6110819578170776,
"rewards/margins": 0.17850695550441742,
"rewards/rejected": -0.7895889282226562,
"step": 410
},
{
"epoch": 0.4529980788027908,
"grad_norm": 21.125,
"learning_rate": 2.5484913793103452e-06,
"logits/chosen": -1.7077720165252686,
"logits/rejected": -1.7262470722198486,
"logps/chosen": -538.9916381835938,
"logps/rejected": -514.1024780273438,
"loss": 0.6319,
"rewards/accuracies": 0.6390625238418579,
"rewards/chosen": -0.6130255460739136,
"rewards/margins": 0.18954649567604065,
"rewards/rejected": -0.8025720715522766,
"step": 420
},
{
"epoch": 0.46378374734571437,
"grad_norm": 21.125,
"learning_rate": 2.5377155172413795e-06,
"logits/chosen": -1.6869211196899414,
"logits/rejected": -1.7010374069213867,
"logps/chosen": -519.823486328125,
"logps/rejected": -514.0619506835938,
"loss": 0.6187,
"rewards/accuracies": 0.6734374761581421,
"rewards/chosen": -0.6134659647941589,
"rewards/margins": 0.21337684988975525,
"rewards/rejected": -0.8268427848815918,
"step": 430
},
{
"epoch": 0.474569415888638,
"grad_norm": 20.75,
"learning_rate": 2.526939655172414e-06,
"logits/chosen": -1.7328685522079468,
"logits/rejected": -1.7307264804840088,
"logps/chosen": -519.9312744140625,
"logps/rejected": -513.1961669921875,
"loss": 0.6278,
"rewards/accuracies": 0.676562488079071,
"rewards/chosen": -0.6580926179885864,
"rewards/margins": 0.20139017701148987,
"rewards/rejected": -0.8594827651977539,
"step": 440
},
{
"epoch": 0.48535508443156156,
"grad_norm": 21.0,
"learning_rate": 2.5161637931034486e-06,
"logits/chosen": -1.7335723638534546,
"logits/rejected": -1.7415554523468018,
"logps/chosen": -526.5194702148438,
"logps/rejected": -518.3421020507812,
"loss": 0.6219,
"rewards/accuracies": 0.659375011920929,
"rewards/chosen": -0.6824163198471069,
"rewards/margins": 0.2106122523546219,
"rewards/rejected": -0.8930285573005676,
"step": 450
},
{
"epoch": 0.48535508443156156,
"eval_logits/chosen": -1.7433769702911377,
"eval_logits/rejected": -1.8047083616256714,
"eval_logps/chosen": -564.2274169921875,
"eval_logps/rejected": -511.63067626953125,
"eval_loss": 0.6593914031982422,
"eval_rewards/accuracies": 0.5769230723381042,
"eval_rewards/chosen": -0.8001139760017395,
"eval_rewards/margins": 0.13089530169963837,
"eval_rewards/rejected": -0.9310091733932495,
"eval_runtime": 12.8159,
"eval_samples_per_second": 7.803,
"eval_steps_per_second": 1.014,
"step": 450
},
{
"epoch": 0.4961407529744852,
"grad_norm": 20.875,
"learning_rate": 2.505387931034483e-06,
"logits/chosen": -1.7393690347671509,
"logits/rejected": -1.7588695287704468,
"logps/chosen": -552.0422973632812,
"logps/rejected": -530.3653564453125,
"loss": 0.6202,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.686679482460022,
"rewards/margins": 0.22524280846118927,
"rewards/rejected": -0.9119223356246948,
"step": 460
},
{
"epoch": 0.5069264215174087,
"grad_norm": 20.25,
"learning_rate": 2.4946120689655176e-06,
"logits/chosen": -1.7419532537460327,
"logits/rejected": -1.7394039630889893,
"logps/chosen": -526.1238403320312,
"logps/rejected": -506.7391052246094,
"loss": 0.6301,
"rewards/accuracies": 0.653124988079071,
"rewards/chosen": -0.6740504503250122,
"rewards/margins": 0.18307238817214966,
"rewards/rejected": -0.8571227788925171,
"step": 470
},
{
"epoch": 0.5177120900603324,
"grad_norm": 22.375,
"learning_rate": 2.483836206896552e-06,
"logits/chosen": -1.6943247318267822,
"logits/rejected": -1.6783298254013062,
"logps/chosen": -568.21435546875,
"logps/rejected": -553.17822265625,
"loss": 0.6339,
"rewards/accuracies": 0.6390625238418579,
"rewards/chosen": -0.6776462197303772,
"rewards/margins": 0.1882401704788208,
"rewards/rejected": -0.865886390209198,
"step": 480
},
{
"epoch": 0.5284977586032559,
"grad_norm": 21.125,
"learning_rate": 2.473060344827586e-06,
"logits/chosen": -1.7065378427505493,
"logits/rejected": -1.7249234914779663,
"logps/chosen": -529.4561767578125,
"logps/rejected": -506.04071044921875,
"loss": 0.6378,
"rewards/accuracies": 0.645312488079071,
"rewards/chosen": -0.6808103322982788,
"rewards/margins": 0.17550134658813477,
"rewards/rejected": -0.8563116788864136,
"step": 490
},
{
"epoch": 0.5392834271461795,
"grad_norm": 23.75,
"learning_rate": 2.462284482758621e-06,
"logits/chosen": -1.6840463876724243,
"logits/rejected": -1.70773184299469,
"logps/chosen": -530.1900634765625,
"logps/rejected": -512.8983764648438,
"loss": 0.6382,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.6517452001571655,
"rewards/margins": 0.17097657918930054,
"rewards/rejected": -0.8227217793464661,
"step": 500
},
{
"epoch": 0.5392834271461795,
"eval_logits/chosen": -1.7378157377243042,
"eval_logits/rejected": -1.7989575862884521,
"eval_logps/chosen": -563.543212890625,
"eval_logps/rejected": -510.96929931640625,
"eval_loss": 0.6553998589515686,
"eval_rewards/accuracies": 0.6442307829856873,
"eval_rewards/chosen": -0.7659082412719727,
"eval_rewards/margins": 0.1320342719554901,
"eval_rewards/rejected": -0.8979425430297852,
"eval_runtime": 12.8216,
"eval_samples_per_second": 7.799,
"eval_steps_per_second": 1.014,
"step": 500
},
{
"epoch": 0.5500690956891031,
"grad_norm": 23.375,
"learning_rate": 2.451508620689655e-06,
"logits/chosen": -1.7287929058074951,
"logits/rejected": -1.747097373008728,
"logps/chosen": -568.22900390625,
"logps/rejected": -537.8626098632812,
"loss": 0.6234,
"rewards/accuracies": 0.660937488079071,
"rewards/chosen": -0.6510963439941406,
"rewards/margins": 0.20002727210521698,
"rewards/rejected": -0.8511236310005188,
"step": 510
},
{
"epoch": 0.5608547642320267,
"grad_norm": 21.375,
"learning_rate": 2.44073275862069e-06,
"logits/chosen": -1.7096725702285767,
"logits/rejected": -1.713513970375061,
"logps/chosen": -578.6484375,
"logps/rejected": -552.06884765625,
"loss": 0.618,
"rewards/accuracies": 0.6734374761581421,
"rewards/chosen": -0.6893950700759888,
"rewards/margins": 0.22265811264514923,
"rewards/rejected": -0.9120532274246216,
"step": 520
},
{
"epoch": 0.5716404327749502,
"grad_norm": 24.375,
"learning_rate": 2.429956896551724e-06,
"logits/chosen": -1.7773420810699463,
"logits/rejected": -1.7806062698364258,
"logps/chosen": -561.3190307617188,
"logps/rejected": -557.246826171875,
"loss": 0.6274,
"rewards/accuracies": 0.660937488079071,
"rewards/chosen": -0.6820749640464783,
"rewards/margins": 0.23882977664470673,
"rewards/rejected": -0.9209047555923462,
"step": 530
},
{
"epoch": 0.5824261013178739,
"grad_norm": 21.625,
"learning_rate": 2.419181034482759e-06,
"logits/chosen": -1.7619960308074951,
"logits/rejected": -1.7642993927001953,
"logps/chosen": -520.9690551757812,
"logps/rejected": -509.5274353027344,
"loss": 0.6309,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.7083293795585632,
"rewards/margins": 0.1926591694355011,
"rewards/rejected": -0.9009885787963867,
"step": 540
},
{
"epoch": 0.5932117698607975,
"grad_norm": 18.75,
"learning_rate": 2.408405172413793e-06,
"logits/chosen": -1.7466360330581665,
"logits/rejected": -1.746272325515747,
"logps/chosen": -513.3010864257812,
"logps/rejected": -490.46624755859375,
"loss": 0.6377,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.7287647128105164,
"rewards/margins": 0.16981419920921326,
"rewards/rejected": -0.8985790014266968,
"step": 550
},
{
"epoch": 0.5932117698607975,
"eval_logits/chosen": -1.7363488674163818,
"eval_logits/rejected": -1.7969601154327393,
"eval_logps/chosen": -564.2831420898438,
"eval_logps/rejected": -511.74945068359375,
"eval_loss": 0.6561428904533386,
"eval_rewards/accuracies": 0.6442307829856873,
"eval_rewards/chosen": -0.80290687084198,
"eval_rewards/margins": 0.13403938710689545,
"eval_rewards/rejected": -0.9369462728500366,
"eval_runtime": 12.8373,
"eval_samples_per_second": 7.79,
"eval_steps_per_second": 1.013,
"step": 550
},
{
"epoch": 0.6039974384037211,
"grad_norm": 21.75,
"learning_rate": 2.3976293103448275e-06,
"logits/chosen": -1.7120717763900757,
"logits/rejected": -1.717246651649475,
"logps/chosen": -544.954833984375,
"logps/rejected": -519.8233032226562,
"loss": 0.6182,
"rewards/accuracies": 0.6703125238418579,
"rewards/chosen": -0.6614271402359009,
"rewards/margins": 0.22156758606433868,
"rewards/rejected": -0.8829947710037231,
"step": 560
},
{
"epoch": 0.6147831069466446,
"grad_norm": 22.625,
"learning_rate": 2.386853448275862e-06,
"logits/chosen": -1.6905927658081055,
"logits/rejected": -1.709628701210022,
"logps/chosen": -529.6423950195312,
"logps/rejected": -513.8201904296875,
"loss": 0.626,
"rewards/accuracies": 0.692187488079071,
"rewards/chosen": -0.6795625686645508,
"rewards/margins": 0.20236878097057343,
"rewards/rejected": -0.8819311857223511,
"step": 570
},
{
"epoch": 0.6255687754895682,
"grad_norm": 21.375,
"learning_rate": 2.3760775862068965e-06,
"logits/chosen": -1.7218097448349,
"logits/rejected": -1.7233638763427734,
"logps/chosen": -566.1005249023438,
"logps/rejected": -550.3419799804688,
"loss": 0.6288,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.7431906461715698,
"rewards/margins": 0.20129895210266113,
"rewards/rejected": -0.9444894790649414,
"step": 580
},
{
"epoch": 0.6363544440324919,
"grad_norm": 21.625,
"learning_rate": 2.365301724137931e-06,
"logits/chosen": -1.718971848487854,
"logits/rejected": -1.7108612060546875,
"logps/chosen": -552.8258056640625,
"logps/rejected": -534.0645141601562,
"loss": 0.6376,
"rewards/accuracies": 0.6421874761581421,
"rewards/chosen": -0.7128755450248718,
"rewards/margins": 0.18357399106025696,
"rewards/rejected": -0.8964495658874512,
"step": 590
},
{
"epoch": 0.6471401125754154,
"grad_norm": 21.75,
"learning_rate": 2.3545258620689655e-06,
"logits/chosen": -1.6846719980239868,
"logits/rejected": -1.6692850589752197,
"logps/chosen": -575.1969604492188,
"logps/rejected": -551.0662841796875,
"loss": 0.6428,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.7172713279724121,
"rewards/margins": 0.16418126225471497,
"rewards/rejected": -0.8814526796340942,
"step": 600
},
{
"epoch": 0.6471401125754154,
"eval_logits/chosen": -1.7446637153625488,
"eval_logits/rejected": -1.805701494216919,
"eval_logps/chosen": -564.8399658203125,
"eval_logps/rejected": -512.3929443359375,
"eval_loss": 0.6552779674530029,
"eval_rewards/accuracies": 0.6153846383094788,
"eval_rewards/chosen": -0.8307470083236694,
"eval_rewards/margins": 0.13837800920009613,
"eval_rewards/rejected": -0.9691251516342163,
"eval_runtime": 12.8832,
"eval_samples_per_second": 7.762,
"eval_steps_per_second": 1.009,
"step": 600
},
{
"epoch": 0.657925781118339,
"grad_norm": 19.625,
"learning_rate": 2.3437500000000002e-06,
"logits/chosen": -1.7127196788787842,
"logits/rejected": -1.722394585609436,
"logps/chosen": -534.408447265625,
"logps/rejected": -524.6024169921875,
"loss": 0.6382,
"rewards/accuracies": 0.6656249761581421,
"rewards/chosen": -0.7230153679847717,
"rewards/margins": 0.17467036843299866,
"rewards/rejected": -0.897685706615448,
"step": 610
},
{
"epoch": 0.6687114496612626,
"grad_norm": 19.25,
"learning_rate": 2.3329741379310345e-06,
"logits/chosen": -1.731837511062622,
"logits/rejected": -1.7623554468154907,
"logps/chosen": -528.728515625,
"logps/rejected": -500.19500732421875,
"loss": 0.6294,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.695734441280365,
"rewards/margins": 0.19728422164916992,
"rewards/rejected": -0.8930186033248901,
"step": 620
},
{
"epoch": 0.6794971182041862,
"grad_norm": 22.875,
"learning_rate": 2.322198275862069e-06,
"logits/chosen": -1.6935323476791382,
"logits/rejected": -1.7253952026367188,
"logps/chosen": -557.6375122070312,
"logps/rejected": -530.9302368164062,
"loss": 0.6425,
"rewards/accuracies": 0.6421874761581421,
"rewards/chosen": -0.7235587239265442,
"rewards/margins": 0.1692447066307068,
"rewards/rejected": -0.8928033709526062,
"step": 630
},
{
"epoch": 0.6902827867471097,
"grad_norm": 22.75,
"learning_rate": 2.3114224137931035e-06,
"logits/chosen": -1.7822033166885376,
"logits/rejected": -1.7954456806182861,
"logps/chosen": -548.27294921875,
"logps/rejected": -529.1452026367188,
"loss": 0.6357,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.733366847038269,
"rewards/margins": 0.18996267020702362,
"rewards/rejected": -0.9233294725418091,
"step": 640
},
{
"epoch": 0.7010684552900334,
"grad_norm": 23.625,
"learning_rate": 2.300646551724138e-06,
"logits/chosen": -1.7504284381866455,
"logits/rejected": -1.7501709461212158,
"logps/chosen": -563.0472412109375,
"logps/rejected": -547.3125,
"loss": 0.6376,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7303886413574219,
"rewards/margins": 0.17333553731441498,
"rewards/rejected": -0.903724193572998,
"step": 650
},
{
"epoch": 0.7010684552900334,
"eval_logits/chosen": -1.7363975048065186,
"eval_logits/rejected": -1.7971389293670654,
"eval_logps/chosen": -564.5283813476562,
"eval_logps/rejected": -512.2813110351562,
"eval_loss": 0.6501194834709167,
"eval_rewards/accuracies": 0.6442307829856873,
"eval_rewards/chosen": -0.8151662349700928,
"eval_rewards/margins": 0.1483728587627411,
"eval_rewards/rejected": -0.9635391235351562,
"eval_runtime": 12.8651,
"eval_samples_per_second": 7.773,
"eval_steps_per_second": 1.01,
"step": 650
},
{
"epoch": 0.711854123832957,
"grad_norm": 21.0,
"learning_rate": 2.2898706896551725e-06,
"logits/chosen": -1.6629375219345093,
"logits/rejected": -1.686265230178833,
"logps/chosen": -579.2559814453125,
"logps/rejected": -550.0233764648438,
"loss": 0.6339,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.7081414461135864,
"rewards/margins": 0.19901154935359955,
"rewards/rejected": -0.9071530103683472,
"step": 660
},
{
"epoch": 0.7226397923758805,
"grad_norm": 24.25,
"learning_rate": 2.279094827586207e-06,
"logits/chosen": -1.772698998451233,
"logits/rejected": -1.8019428253173828,
"logps/chosen": -505.48577880859375,
"logps/rejected": -490.10662841796875,
"loss": 0.6425,
"rewards/accuracies": 0.6078125238418579,
"rewards/chosen": -0.7496585249900818,
"rewards/margins": 0.17492921650409698,
"rewards/rejected": -0.9245878458023071,
"step": 670
},
{
"epoch": 0.7334254609188041,
"grad_norm": 23.125,
"learning_rate": 2.2683189655172415e-06,
"logits/chosen": -1.7226699590682983,
"logits/rejected": -1.7439358234405518,
"logps/chosen": -551.814208984375,
"logps/rejected": -530.10888671875,
"loss": 0.6147,
"rewards/accuracies": 0.6703125238418579,
"rewards/chosen": -0.7403059601783752,
"rewards/margins": 0.22587330639362335,
"rewards/rejected": -0.966179370880127,
"step": 680
},
{
"epoch": 0.7442111294617277,
"grad_norm": 19.625,
"learning_rate": 2.257543103448276e-06,
"logits/chosen": -1.7711864709854126,
"logits/rejected": -1.7668606042861938,
"logps/chosen": -540.2945556640625,
"logps/rejected": -520.48828125,
"loss": 0.6112,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": -0.7231726050376892,
"rewards/margins": 0.24855723977088928,
"rewards/rejected": -0.9717298746109009,
"step": 690
},
{
"epoch": 0.7549967980046514,
"grad_norm": 22.375,
"learning_rate": 2.24676724137931e-06,
"logits/chosen": -1.7296479940414429,
"logits/rejected": -1.7336111068725586,
"logps/chosen": -525.4596557617188,
"logps/rejected": -499.53826904296875,
"loss": 0.644,
"rewards/accuracies": 0.6578124761581421,
"rewards/chosen": -0.7867623567581177,
"rewards/margins": 0.18225276470184326,
"rewards/rejected": -0.9690152406692505,
"step": 700
},
{
"epoch": 0.7549967980046514,
"eval_logits/chosen": -1.7320126295089722,
"eval_logits/rejected": -1.7923235893249512,
"eval_logps/chosen": -565.2008056640625,
"eval_logps/rejected": -512.877197265625,
"eval_loss": 0.6524822115898132,
"eval_rewards/accuracies": 0.6153846383094788,
"eval_rewards/chosen": -0.8487869501113892,
"eval_rewards/margins": 0.1445501744747162,
"eval_rewards/rejected": -0.9933372139930725,
"eval_runtime": 12.8132,
"eval_samples_per_second": 7.804,
"eval_steps_per_second": 1.015,
"step": 700
},
{
"epoch": 0.7657824665475749,
"grad_norm": 36.25,
"learning_rate": 2.235991379310345e-06,
"logits/chosen": -1.6764347553253174,
"logits/rejected": -1.6931612491607666,
"logps/chosen": -555.44091796875,
"logps/rejected": -529.9513549804688,
"loss": 0.6177,
"rewards/accuracies": 0.667187511920929,
"rewards/chosen": -0.7847446203231812,
"rewards/margins": 0.23240776360034943,
"rewards/rejected": -1.0171524286270142,
"step": 710
},
{
"epoch": 0.7765681350904985,
"grad_norm": 21.375,
"learning_rate": 2.225215517241379e-06,
"logits/chosen": -1.7237228155136108,
"logits/rejected": -1.7398170232772827,
"logps/chosen": -542.1912231445312,
"logps/rejected": -518.4722900390625,
"loss": 0.6354,
"rewards/accuracies": 0.6421874761581421,
"rewards/chosen": -0.7876893281936646,
"rewards/margins": 0.1978759765625,
"rewards/rejected": -0.9855653643608093,
"step": 720
},
{
"epoch": 0.7873538036334221,
"grad_norm": 23.0,
"learning_rate": 2.214439655172414e-06,
"logits/chosen": -1.752096176147461,
"logits/rejected": -1.7941181659698486,
"logps/chosen": -548.2041015625,
"logps/rejected": -513.8216552734375,
"loss": 0.6555,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -0.7717311978340149,
"rewards/margins": 0.14397189021110535,
"rewards/rejected": -0.9157029986381531,
"step": 730
},
{
"epoch": 0.7981394721763457,
"grad_norm": 19.75,
"learning_rate": 2.203663793103448e-06,
"logits/chosen": -1.709242820739746,
"logits/rejected": -1.7145198583602905,
"logps/chosen": -539.0947265625,
"logps/rejected": -521.4993896484375,
"loss": 0.6394,
"rewards/accuracies": 0.6468750238418579,
"rewards/chosen": -0.7486387491226196,
"rewards/margins": 0.18467199802398682,
"rewards/rejected": -0.933310866355896,
"step": 740
},
{
"epoch": 0.8089251407192692,
"grad_norm": 23.875,
"learning_rate": 2.192887931034483e-06,
"logits/chosen": -1.632927656173706,
"logits/rejected": -1.6515719890594482,
"logps/chosen": -559.2658081054688,
"logps/rejected": -526.0423583984375,
"loss": 0.6322,
"rewards/accuracies": 0.653124988079071,
"rewards/chosen": -0.7207778692245483,
"rewards/margins": 0.19092032313346863,
"rewards/rejected": -0.911698043346405,
"step": 750
},
{
"epoch": 0.8089251407192692,
"eval_logits/chosen": -1.717968463897705,
"eval_logits/rejected": -1.7769808769226074,
"eval_logps/chosen": -564.5967407226562,
"eval_logps/rejected": -512.3129272460938,
"eval_loss": 0.6494570970535278,
"eval_rewards/accuracies": 0.6634615659713745,
"eval_rewards/chosen": -0.8185831308364868,
"eval_rewards/margins": 0.1465369611978531,
"eval_rewards/rejected": -0.9651200175285339,
"eval_runtime": 12.8208,
"eval_samples_per_second": 7.8,
"eval_steps_per_second": 1.014,
"step": 750
},
{
"epoch": 0.8197108092621929,
"grad_norm": 22.25,
"learning_rate": 2.182112068965517e-06,
"logits/chosen": -1.7085649967193604,
"logits/rejected": -1.7567522525787354,
"logps/chosen": -565.7103271484375,
"logps/rejected": -539.7507934570312,
"loss": 0.626,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.7772555351257324,
"rewards/margins": 0.20999836921691895,
"rewards/rejected": -0.9872539639472961,
"step": 760
},
{
"epoch": 0.8304964778051165,
"grad_norm": 22.125,
"learning_rate": 2.1713362068965515e-06,
"logits/chosen": -1.7073513269424438,
"logits/rejected": -1.7328789234161377,
"logps/chosen": -536.3829345703125,
"logps/rejected": -514.23779296875,
"loss": 0.6285,
"rewards/accuracies": 0.6578124761581421,
"rewards/chosen": -0.7473222613334656,
"rewards/margins": 0.20041945576667786,
"rewards/rejected": -0.947741687297821,
"step": 770
},
{
"epoch": 0.84128214634804,
"grad_norm": 21.75,
"learning_rate": 2.160560344827586e-06,
"logits/chosen": -1.6973340511322021,
"logits/rejected": -1.691319465637207,
"logps/chosen": -557.1148071289062,
"logps/rejected": -558.3483276367188,
"loss": 0.6268,
"rewards/accuracies": 0.6578124761581421,
"rewards/chosen": -0.7272000312805176,
"rewards/margins": 0.21510732173919678,
"rewards/rejected": -0.9423073530197144,
"step": 780
},
{
"epoch": 0.8520678148909636,
"grad_norm": 20.875,
"learning_rate": 2.1497844827586205e-06,
"logits/chosen": -1.679863691329956,
"logits/rejected": -1.6904770135879517,
"logps/chosen": -543.8391723632812,
"logps/rejected": -531.5890502929688,
"loss": 0.625,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -0.756318986415863,
"rewards/margins": 0.20582714676856995,
"rewards/rejected": -0.9621461629867554,
"step": 790
},
{
"epoch": 0.8628534834338872,
"grad_norm": 21.875,
"learning_rate": 2.139008620689655e-06,
"logits/chosen": -1.7281272411346436,
"logits/rejected": -1.745347261428833,
"logps/chosen": -547.1482543945312,
"logps/rejected": -527.178466796875,
"loss": 0.6278,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.7604348063468933,
"rewards/margins": 0.19645504653453827,
"rewards/rejected": -0.9568899273872375,
"step": 800
},
{
"epoch": 0.8628534834338872,
"eval_logits/chosen": -1.7295496463775635,
"eval_logits/rejected": -1.7896690368652344,
"eval_logps/chosen": -565.3712158203125,
"eval_logps/rejected": -513.1586303710938,
"eval_loss": 0.6495404243469238,
"eval_rewards/accuracies": 0.6730769276618958,
"eval_rewards/chosen": -0.8573046922683716,
"eval_rewards/margins": 0.15010415017604828,
"eval_rewards/rejected": -1.0074087381362915,
"eval_runtime": 12.8304,
"eval_samples_per_second": 7.794,
"eval_steps_per_second": 1.013,
"step": 800
},
{
"epoch": 0.8736391519768109,
"grad_norm": 24.0,
"learning_rate": 2.1282327586206895e-06,
"logits/chosen": -1.6808828115463257,
"logits/rejected": -1.7251535654067993,
"logps/chosen": -545.0057373046875,
"logps/rejected": -514.2200317382812,
"loss": 0.6473,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7858768701553345,
"rewards/margins": 0.16710534691810608,
"rewards/rejected": -0.9529821276664734,
"step": 810
},
{
"epoch": 0.8844248205197344,
"grad_norm": 21.625,
"learning_rate": 2.117456896551724e-06,
"logits/chosen": -1.7713664770126343,
"logits/rejected": -1.7678531408309937,
"logps/chosen": -544.8568115234375,
"logps/rejected": -516.0807495117188,
"loss": 0.6342,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.8098779916763306,
"rewards/margins": 0.199488565325737,
"rewards/rejected": -1.009366512298584,
"step": 820
},
{
"epoch": 0.895210489062658,
"grad_norm": 22.375,
"learning_rate": 2.106681034482759e-06,
"logits/chosen": -1.7336772680282593,
"logits/rejected": -1.7515900135040283,
"logps/chosen": -549.8850708007812,
"logps/rejected": -530.6832275390625,
"loss": 0.6074,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.7838916778564453,
"rewards/margins": 0.2659648060798645,
"rewards/rejected": -1.049856424331665,
"step": 830
},
{
"epoch": 0.9059961576055816,
"grad_norm": 21.375,
"learning_rate": 2.0959051724137932e-06,
"logits/chosen": -1.7343952655792236,
"logits/rejected": -1.7316901683807373,
"logps/chosen": -574.8560791015625,
"logps/rejected": -536.2384033203125,
"loss": 0.6342,
"rewards/accuracies": 0.651562511920929,
"rewards/chosen": -0.7904583811759949,
"rewards/margins": 0.1979624629020691,
"rewards/rejected": -0.988420844078064,
"step": 840
},
{
"epoch": 0.9167818261485051,
"grad_norm": 21.5,
"learning_rate": 2.085129310344828e-06,
"logits/chosen": -1.733007788658142,
"logits/rejected": -1.727246880531311,
"logps/chosen": -524.6848754882812,
"logps/rejected": -510.7845764160156,
"loss": 0.6307,
"rewards/accuracies": 0.6421874761581421,
"rewards/chosen": -0.8132136464118958,
"rewards/margins": 0.20207151770591736,
"rewards/rejected": -1.0152852535247803,
"step": 850
},
{
"epoch": 0.9167818261485051,
"eval_logits/chosen": -1.724961757659912,
"eval_logits/rejected": -1.7844202518463135,
"eval_logps/chosen": -565.8350830078125,
"eval_logps/rejected": -513.8818969726562,
"eval_loss": 0.6429941058158875,
"eval_rewards/accuracies": 0.6538461446762085,
"eval_rewards/chosen": -0.8805010318756104,
"eval_rewards/margins": 0.16307112574577332,
"eval_rewards/rejected": -1.043572187423706,
"eval_runtime": 12.8678,
"eval_samples_per_second": 7.771,
"eval_steps_per_second": 1.01,
"step": 850
},
{
"epoch": 0.9275674946914287,
"grad_norm": 21.0,
"learning_rate": 2.0743534482758622e-06,
"logits/chosen": -1.678770661354065,
"logits/rejected": -1.6889280080795288,
"logps/chosen": -549.0493774414062,
"logps/rejected": -529.0235595703125,
"loss": 0.6311,
"rewards/accuracies": 0.6578124761581421,
"rewards/chosen": -0.8022142648696899,
"rewards/margins": 0.21385040879249573,
"rewards/rejected": -1.0160646438598633,
"step": 860
},
{
"epoch": 0.9383531632343524,
"grad_norm": 21.25,
"learning_rate": 2.063577586206897e-06,
"logits/chosen": -1.677046775817871,
"logits/rejected": -1.715746521949768,
"logps/chosen": -558.4188842773438,
"logps/rejected": -526.5531616210938,
"loss": 0.6388,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7892870903015137,
"rewards/margins": 0.1829388290643692,
"rewards/rejected": -0.9722259640693665,
"step": 870
},
{
"epoch": 0.949138831777276,
"grad_norm": 22.25,
"learning_rate": 2.0528017241379312e-06,
"logits/chosen": -1.71084463596344,
"logits/rejected": -1.7499706745147705,
"logps/chosen": -547.9580078125,
"logps/rejected": -525.6466674804688,
"loss": 0.6215,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7694340348243713,
"rewards/margins": 0.2342531681060791,
"rewards/rejected": -1.0036872625350952,
"step": 880
},
{
"epoch": 0.9599245003201995,
"grad_norm": 22.5,
"learning_rate": 2.0420258620689655e-06,
"logits/chosen": -1.7067375183105469,
"logits/rejected": -1.7360172271728516,
"logps/chosen": -553.2872924804688,
"logps/rejected": -533.8543090820312,
"loss": 0.6409,
"rewards/accuracies": 0.6390625238418579,
"rewards/chosen": -0.7895435690879822,
"rewards/margins": 0.18897457420825958,
"rewards/rejected": -0.9785181879997253,
"step": 890
},
{
"epoch": 0.9707101688631231,
"grad_norm": 22.125,
"learning_rate": 2.0312500000000002e-06,
"logits/chosen": -1.7305114269256592,
"logits/rejected": -1.733814001083374,
"logps/chosen": -544.6137084960938,
"logps/rejected": -523.6353149414062,
"loss": 0.6191,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.786180317401886,
"rewards/margins": 0.22723452746868134,
"rewards/rejected": -1.013414978981018,
"step": 900
},
{
"epoch": 0.9707101688631231,
"eval_logits/chosen": -1.7138627767562866,
"eval_logits/rejected": -1.773409128189087,
"eval_logps/chosen": -565.8074340820312,
"eval_logps/rejected": -513.8848266601562,
"eval_loss": 0.6439433097839355,
"eval_rewards/accuracies": 0.6634615659713745,
"eval_rewards/chosen": -0.8791185617446899,
"eval_rewards/margins": 0.16459915041923523,
"eval_rewards/rejected": -1.043717622756958,
"eval_runtime": 12.8601,
"eval_samples_per_second": 7.776,
"eval_steps_per_second": 1.011,
"step": 900
},
{
"epoch": 0.9814958374060467,
"grad_norm": 20.5,
"learning_rate": 2.0204741379310345e-06,
"logits/chosen": -1.7137806415557861,
"logits/rejected": -1.714971899986267,
"logps/chosen": -546.196533203125,
"logps/rejected": -519.9130859375,
"loss": 0.6336,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7919280529022217,
"rewards/margins": 0.189212828874588,
"rewards/rejected": -0.9811409115791321,
"step": 910
},
{
"epoch": 0.9922815059489704,
"grad_norm": 21.375,
"learning_rate": 2.0096982758620693e-06,
"logits/chosen": -1.7518551349639893,
"logits/rejected": -1.789482831954956,
"logps/chosen": -539.8787841796875,
"logps/rejected": -518.9136962890625,
"loss": 0.6408,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.8202158808708191,
"rewards/margins": 0.1931016743183136,
"rewards/rejected": -1.0133177042007446,
"step": 920
}
],
"logging_steps": 10,
"max_steps": 2784,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}