Files
zephyr-8b-dpo-full/trainer_state.json

779 lines
26 KiB
JSON
Raw Permalink Normal View History

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984301412872841,
"eval_steps": 500,
"global_step": 477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0020931449502878076,
"grad_norm": 4.5917708857834985,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -0.8526347279548645,
"logits/rejected": -0.7768423557281494,
"logps/chosen": -363.13519287109375,
"logps/rejected": -364.9631042480469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.020931449502878074,
"grad_norm": 4.404674449441554,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -0.7482305765151978,
"logits/rejected": -0.7081854343414307,
"logps/chosen": -311.2024841308594,
"logps/rejected": -284.1365966796875,
"loss": 0.6931,
"rewards/accuracies": 0.4375,
"rewards/chosen": 2.1014602680224925e-05,
"rewards/margins": 8.458160300506279e-05,
"rewards/rejected": -6.356705853249878e-05,
"step": 10
},
{
"epoch": 0.04186289900575615,
"grad_norm": 4.2921416180433765,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -0.7403894066810608,
"logits/rejected": -0.6793709993362427,
"logps/chosen": -324.6893005371094,
"logps/rejected": -290.2327575683594,
"loss": 0.693,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.0014148516347631812,
"rewards/margins": 0.0002438486844766885,
"rewards/rejected": 0.0011710028629750013,
"step": 20
},
{
"epoch": 0.06279434850863422,
"grad_norm": 4.218970991450984,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -0.7561457753181458,
"logits/rejected": -0.7098526954650879,
"logps/chosen": -295.118408203125,
"logps/rejected": -255.83407592773438,
"loss": 0.6917,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.008535891771316528,
"rewards/margins": 0.0028298485558480024,
"rewards/rejected": 0.005706042982637882,
"step": 30
},
{
"epoch": 0.0837257980115123,
"grad_norm": 3.614368719783126,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -0.7299980521202087,
"logits/rejected": -0.6768942475318909,
"logps/chosen": -267.0862121582031,
"logps/rejected": -267.53863525390625,
"loss": 0.6885,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.020161841064691544,
"rewards/margins": 0.008152564987540245,
"rewards/rejected": 0.012009273283183575,
"step": 40
},
{
"epoch": 0.10465724751439037,
"grad_norm": 3.6040620075273546,
"learning_rate": 4.999731868769026e-07,
"logits/chosen": -0.7151128053665161,
"logits/rejected": -0.6647322177886963,
"logps/chosen": -296.5942077636719,
"logps/rejected": -277.5081787109375,
"loss": 0.6833,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.0398605577647686,
"rewards/margins": 0.02836265228688717,
"rewards/rejected": 0.011497899889945984,
"step": 50
},
{
"epoch": 0.12558869701726844,
"grad_norm": 3.5922190973220163,
"learning_rate": 4.990353313429303e-07,
"logits/chosen": -0.7289865016937256,
"logits/rejected": -0.6785635352134705,
"logps/chosen": -262.1878967285156,
"logps/rejected": -253.5371856689453,
"loss": 0.6783,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.0495939627289772,
"rewards/margins": 0.042832277715206146,
"rewards/rejected": 0.0067616915330290794,
"step": 60
},
{
"epoch": 0.14652014652014653,
"grad_norm": 3.6371057840364927,
"learning_rate": 4.967625656594781e-07,
"logits/chosen": -0.6846636533737183,
"logits/rejected": -0.6486319303512573,
"logps/chosen": -304.8815002441406,
"logps/rejected": -293.3005065917969,
"loss": 0.6683,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.014333389699459076,
"rewards/margins": 0.050421230494976044,
"rewards/rejected": -0.036087844520807266,
"step": 70
},
{
"epoch": 0.1674515960230246,
"grad_norm": 4.641698206330642,
"learning_rate": 4.93167072587771e-07,
"logits/chosen": -0.7803142070770264,
"logits/rejected": -0.6576212048530579,
"logps/chosen": -338.9702453613281,
"logps/rejected": -270.46124267578125,
"loss": 0.6684,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.041751302778720856,
"rewards/margins": 0.06293781846761703,
"rewards/rejected": -0.10468912124633789,
"step": 80
},
{
"epoch": 0.18838304552590268,
"grad_norm": 5.809449166665223,
"learning_rate": 4.882681251368548e-07,
"logits/chosen": -0.730857253074646,
"logits/rejected": -0.6790161728858948,
"logps/chosen": -270.4856872558594,
"logps/rejected": -280.201171875,
"loss": 0.6533,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.05735975503921509,
"rewards/margins": 0.0867304801940918,
"rewards/rejected": -0.14409023523330688,
"step": 90
},
{
"epoch": 0.20931449502878074,
"grad_norm": 6.629060471777615,
"learning_rate": 4.820919832540181e-07,
"logits/chosen": -0.819484531879425,
"logits/rejected": -0.7444473505020142,
"logps/chosen": -320.7297058105469,
"logps/rejected": -315.49786376953125,
"loss": 0.6452,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.106062151491642,
"rewards/margins": 0.15797743201255798,
"rewards/rejected": -0.2640395760536194,
"step": 100
},
{
"epoch": 0.2302459445316588,
"grad_norm": 8.885427777088127,
"learning_rate": 4.7467175306295647e-07,
"logits/chosen": -0.7485495805740356,
"logits/rejected": -0.6900595426559448,
"logps/chosen": -313.8240966796875,
"logps/rejected": -310.7196960449219,
"loss": 0.6442,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.1286977082490921,
"rewards/margins": 0.1550484299659729,
"rewards/rejected": -0.2837461233139038,
"step": 110
},
{
"epoch": 0.25117739403453687,
"grad_norm": 10.779857080720818,
"learning_rate": 4.6604720940421207e-07,
"logits/chosen": -0.6856316328048706,
"logits/rejected": -0.6849483251571655,
"logps/chosen": -303.8964538574219,
"logps/rejected": -321.4309387207031,
"loss": 0.6199,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.2916944622993469,
"rewards/margins": 0.21943287551403046,
"rewards/rejected": -0.5111273527145386,
"step": 120
},
{
"epoch": 0.272108843537415,
"grad_norm": 11.982309184139016,
"learning_rate": 4.5626458262912735e-07,
"logits/chosen": -0.6801525950431824,
"logits/rejected": -0.6393054723739624,
"logps/chosen": -319.20672607421875,
"logps/rejected": -333.1614685058594,
"loss": 0.609,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.27636662125587463,
"rewards/margins": 0.2492021769285202,
"rewards/rejected": -0.5255688428878784,
"step": 130
},
{
"epoch": 0.29304029304029305,
"grad_norm": 19.151178395769573,
"learning_rate": 4.453763107901675e-07,
"logits/chosen": -0.7184507846832275,
"logits/rejected": -0.6374621987342834,
"logps/chosen": -356.2397766113281,
"logps/rejected": -337.32354736328125,
"loss": 0.6109,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.3082619607448578,
"rewards/margins": 0.31162941455841064,
"rewards/rejected": -0.6198914051055908,
"step": 140
},
{
"epoch": 0.3139717425431711,
"grad_norm": 13.824909994267095,
"learning_rate": 4.3344075855595097e-07,
"logits/chosen": -0.6901696920394897,
"logits/rejected": -0.6279430389404297,
"logps/chosen": -353.95184326171875,
"logps/rejected": -346.9781494140625,
"loss": 0.6132,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6322077512741089,
"rewards/margins": 0.2923499643802643,
"rewards/rejected": -0.9245575666427612,
"step": 150
},
{
"epoch": 0.3349031920460492,
"grad_norm": 13.009971235384292,
"learning_rate": 4.2052190435769554e-07,
"logits/chosen": -0.7091597318649292,
"logits/rejected": -0.6455188989639282,
"logps/chosen": -340.92657470703125,
"logps/rejected": -347.0225830078125,
"loss": 0.603,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.5523598790168762,
"rewards/margins": 0.33642885088920593,
"rewards/rejected": -0.8887887001037598,
"step": 160
},
{
"epoch": 0.35583464154892724,
"grad_norm": 13.829339790258013,
"learning_rate": 4.0668899744407567e-07,
"logits/chosen": -0.6223039627075195,
"logits/rejected": -0.5784906148910522,
"logps/chosen": -351.1839904785156,
"logps/rejected": -358.60479736328125,
"loss": 0.5953,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.8050142526626587,
"rewards/margins": 0.3386740982532501,
"rewards/rejected": -1.143688440322876,
"step": 170
},
{
"epoch": 0.37676609105180536,
"grad_norm": 20.849086588528724,
"learning_rate": 3.920161866827889e-07,
"logits/chosen": -0.6424199342727661,
"logits/rejected": -0.5930343270301819,
"logps/chosen": -358.6197204589844,
"logps/rejected": -367.137451171875,
"loss": 0.5849,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.8172851800918579,
"rewards/margins": 0.3052961528301239,
"rewards/rejected": -1.1225812435150146,
"step": 180
},
{
"epoch": 0.3976975405546834,
"grad_norm": 17.753231429350524,
"learning_rate": 3.765821230985757e-07,
"logits/chosen": -0.6292937994003296,
"logits/rejected": -0.615179717540741,
"logps/chosen": -343.19952392578125,
"logps/rejected": -375.33929443359375,
"loss": 0.5817,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.68468177318573,
"rewards/margins": 0.34494417905807495,
"rewards/rejected": -1.0296258926391602,
"step": 190
},
{
"epoch": 0.4186289900575615,
"grad_norm": 28.151101969379706,
"learning_rate": 3.604695382782159e-07,
"logits/chosen": -0.5903419256210327,
"logits/rejected": -0.5930633544921875,
"logps/chosen": -360.65179443359375,
"logps/rejected": -412.225830078125,
"loss": 0.5821,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.0000778436660767,
"rewards/margins": 0.33244088292121887,
"rewards/rejected": -1.3325188159942627,
"step": 200
},
{
"epoch": 0.43956043956043955,
"grad_norm": 17.118814416105273,
"learning_rate": 3.4376480090239047e-07,
"logits/chosen": -0.6688283085823059,
"logits/rejected": -0.5644041895866394,
"logps/chosen": -433.589599609375,
"logps/rejected": -432.74993896484375,
"loss": 0.5853,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.17396080493927,
"rewards/margins": 0.4916624426841736,
"rewards/rejected": -1.6656233072280884,
"step": 210
},
{
"epoch": 0.4604918890633176,
"grad_norm": 23.06196255102623,
"learning_rate": 3.265574537815398e-07,
"logits/chosen": -0.5818850994110107,
"logits/rejected": -0.5700303316116333,
"logps/chosen": -351.98638916015625,
"logps/rejected": -410.9193420410156,
"loss": 0.5704,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.0643993616104126,
"rewards/margins": 0.5418257713317871,
"rewards/rejected": -1.6062252521514893,
"step": 220
},
{
"epoch": 0.48142333856619574,
"grad_norm": 21.04733996523729,
"learning_rate": 3.0893973387735683e-07,
"logits/chosen": -0.6803761720657349,
"logits/rejected": -0.6139528751373291,
"logps/chosen": -355.9638671875,
"logps/rejected": -392.5525207519531,
"loss": 0.5753,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.8678590059280396,
"rewards/margins": 0.6038464307785034,
"rewards/rejected": -1.4717055559158325,
"step": 230
},
{
"epoch": 0.5023547880690737,
"grad_norm": 18.458075241435953,
"learning_rate": 2.910060778827554e-07,
"logits/chosen": -0.6669884920120239,
"logits/rejected": -0.5953234434127808,
"logps/chosen": -363.3609313964844,
"logps/rejected": -395.0166320800781,
"loss": 0.5445,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6804043054580688,
"rewards/margins": 0.501660168170929,
"rewards/rejected": -1.1820645332336426,
"step": 240
},
{
"epoch": 0.5232862375719518,
"grad_norm": 24.001388696637466,
"learning_rate": 2.7285261601056697e-07,
"logits/chosen": -0.6821622252464294,
"logits/rejected": -0.5736308693885803,
"logps/chosen": -392.59375,
"logps/rejected": -420.7662658691406,
"loss": 0.5592,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.037414789199829,
"rewards/margins": 0.6777431964874268,
"rewards/rejected": -1.7151581048965454,
"step": 250
},
{
"epoch": 0.54421768707483,
"grad_norm": 20.80336944317341,
"learning_rate": 2.5457665670441937e-07,
"logits/chosen": -0.6666806936264038,
"logits/rejected": -0.6545027494430542,
"logps/chosen": -379.15423583984375,
"logps/rejected": -414.10302734375,
"loss": 0.549,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1116763353347778,
"rewards/margins": 0.48681873083114624,
"rewards/rejected": -1.5984950065612793,
"step": 260
},
{
"epoch": 0.565149136577708,
"grad_norm": 18.999186941850777,
"learning_rate": 2.3627616503391812e-07,
"logits/chosen": -0.6578361988067627,
"logits/rejected": -0.6166576147079468,
"logps/chosen": -415.3619079589844,
"logps/rejected": -453.8072814941406,
"loss": 0.5596,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0673983097076416,
"rewards/margins": 0.6960801482200623,
"rewards/rejected": -1.7634785175323486,
"step": 270
},
{
"epoch": 0.5860805860805861,
"grad_norm": 25.011257430836675,
"learning_rate": 2.1804923757009882e-07,
"logits/chosen": -0.5656932592391968,
"logits/rejected": -0.5232654809951782,
"logps/chosen": -409.4795837402344,
"logps/rejected": -441.3401794433594,
"loss": 0.5636,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4023996591567993,
"rewards/margins": 0.5914163589477539,
"rewards/rejected": -1.9938161373138428,
"step": 280
},
{
"epoch": 0.6070120355834642,
"grad_norm": 16.649025738470474,
"learning_rate": 1.9999357655598891e-07,
"logits/chosen": -0.617931067943573,
"logits/rejected": -0.5761314034461975,
"logps/chosen": -406.4532775878906,
"logps/rejected": -464.67822265625,
"loss": 0.5591,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4842880964279175,
"rewards/margins": 0.5774600505828857,
"rewards/rejected": -2.0617482662200928,
"step": 290
},
{
"epoch": 0.6279434850863422,
"grad_norm": 16.730004932577106,
"learning_rate": 1.8220596619089573e-07,
"logits/chosen": -0.6562352180480957,
"logits/rejected": -0.5790780186653137,
"logps/chosen": -451.573486328125,
"logps/rejected": -457.2960510253906,
"loss": 0.5394,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1794800758361816,
"rewards/margins": 0.5321540832519531,
"rewards/rejected": -1.7116340398788452,
"step": 300
},
{
"epoch": 0.6488749345892203,
"grad_norm": 20.824128290150536,
"learning_rate": 1.647817538357072e-07,
"logits/chosen": -0.6351410150527954,
"logits/rejected": -0.5707007050514221,
"logps/chosen": -443.58978271484375,
"logps/rejected": -464.4762268066406,
"loss": 0.5379,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.2887599468231201,
"rewards/margins": 0.7534160614013672,
"rewards/rejected": -2.042175769805908,
"step": 310
},
{
"epoch": 0.6698063840920984,
"grad_norm": 21.443811248225554,
"learning_rate": 1.478143389201113e-07,
"logits/chosen": -0.652029275894165,
"logits/rejected": -0.5766469240188599,
"logps/chosen": -430.428466796875,
"logps/rejected": -456.797607421875,
"loss": 0.5387,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6468877792358398,
"rewards/margins": 0.5889655351638794,
"rewards/rejected": -2.235853433609009,
"step": 320
},
{
"epoch": 0.6907378335949764,
"grad_norm": 18.054876727682743,
"learning_rate": 1.3139467229135998e-07,
"logits/chosen": -0.6749883890151978,
"logits/rejected": -0.6647608876228333,
"logps/chosen": -418.3038635253906,
"logps/rejected": -472.7947692871094,
"loss": 0.5367,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.3353520631790161,
"rewards/margins": 0.6326448917388916,
"rewards/rejected": -1.9679968357086182,
"step": 330
},
{
"epoch": 0.7116692830978545,
"grad_norm": 26.52212132673544,
"learning_rate": 1.1561076868822755e-07,
"logits/chosen": -0.6122914552688599,
"logits/rejected": -0.5800005793571472,
"logps/chosen": -433.57635498046875,
"logps/rejected": -465.54669189453125,
"loss": 0.5428,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.294762134552002,
"rewards/margins": 0.587591290473938,
"rewards/rejected": -1.8823535442352295,
"step": 340
},
{
"epoch": 0.7326007326007326,
"grad_norm": 19.655279922879483,
"learning_rate": 1.0054723495346482e-07,
"logits/chosen": -0.6827625036239624,
"logits/rejected": -0.6138468980789185,
"logps/chosen": -397.7406921386719,
"logps/rejected": -434.5318298339844,
"loss": 0.5167,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2331862449645996,
"rewards/margins": 0.6665691137313843,
"rewards/rejected": -1.8997554779052734,
"step": 350
},
{
"epoch": 0.7535321821036107,
"grad_norm": 19.235838308842027,
"learning_rate": 8.628481651367875e-08,
"logits/chosen": -0.6631180047988892,
"logits/rejected": -0.5895651578903198,
"logps/chosen": -461.24176025390625,
"logps/rejected": -491.12176513671875,
"loss": 0.5577,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4328138828277588,
"rewards/margins": 0.6876929402351379,
"rewards/rejected": -2.120506763458252,
"step": 360
},
{
"epoch": 0.7744636316064888,
"grad_norm": 19.962323367834856,
"learning_rate": 7.289996455765748e-08,
"logits/chosen": -0.6887942552566528,
"logits/rejected": -0.6230372190475464,
"logps/chosen": -414.96270751953125,
"logps/rejected": -449.39874267578125,
"loss": 0.5411,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.4094620943069458,
"rewards/margins": 0.6524969935417175,
"rewards/rejected": -2.0619590282440186,
"step": 370
},
{
"epoch": 0.7953950811093669,
"grad_norm": 17.262620635801852,
"learning_rate": 6.046442623320145e-08,
"logits/chosen": -0.6020098924636841,
"logits/rejected": -0.6000246405601501,
"logps/chosen": -398.9418029785156,
"logps/rejected": -501.1026916503906,
"loss": 0.5387,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.440071940422058,
"rewards/margins": 0.8659427762031555,
"rewards/rejected": -2.3060147762298584,
"step": 380
},
{
"epoch": 0.8163265306122449,
"grad_norm": 23.268294087473826,
"learning_rate": 4.904486005914027e-08,
"logits/chosen": -0.7086952328681946,
"logits/rejected": -0.6402121782302856,
"logps/chosen": -475.9898376464844,
"logps/rejected": -513.5247802734375,
"loss": 0.5207,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.3465341329574585,
"rewards/margins": 0.7135976552963257,
"rewards/rejected": -2.060131788253784,
"step": 390
},
{
"epoch": 0.837257980115123,
"grad_norm": 20.66365114424933,
"learning_rate": 3.8702478614051345e-08,
"logits/chosen": -0.641961932182312,
"logits/rejected": -0.5901409983634949,
"logps/chosen": -395.93157958984375,
"logps/rejected": -442.00244140625,
"loss": 0.5353,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2355709075927734,
"rewards/margins": 0.6957671642303467,
"rewards/rejected": -1.9313379526138306,
"step": 400
},
{
"epoch": 0.858189429618001,
"grad_norm": 18.94839439294029,
"learning_rate": 2.9492720416985e-08,
"logits/chosen": -0.7429651021957397,
"logits/rejected": -0.6672912836074829,
"logps/chosen": -440.3075256347656,
"logps/rejected": -467.0520935058594,
"loss": 0.5506,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.2028748989105225,
"rewards/margins": 0.7443949580192566,
"rewards/rejected": -1.9472697973251343,
"step": 410
},
{
"epoch": 0.8791208791208791,
"grad_norm": 19.442404647939284,
"learning_rate": 2.1464952759020856e-08,
"logits/chosen": -0.597920298576355,
"logits/rejected": -0.586058497428894,
"logps/chosen": -406.3937072753906,
"logps/rejected": -479.6614685058594,
"loss": 0.5384,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3360610008239746,
"rewards/margins": 0.6579158902168274,
"rewards/rejected": -1.9939768314361572,
"step": 420
},
{
"epoch": 0.9000523286237572,
"grad_norm": 19.433771609383193,
"learning_rate": 1.4662207078575684e-08,
"logits/chosen": -0.6416221857070923,
"logits/rejected": -0.5748856663703918,
"logps/chosen": -432.43292236328125,
"logps/rejected": -473.4640197753906,
"loss": 0.5268,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.289609670639038,
"rewards/margins": 0.6828420162200928,
"rewards/rejected": -1.9724515676498413,
"step": 430
},
{
"epoch": 0.9209837781266352,
"grad_norm": 25.563733256044383,
"learning_rate": 9.12094829893642e-09,
"logits/chosen": -0.7145225405693054,
"logits/rejected": -0.6558529138565063,
"logps/chosen": -401.7915344238281,
"logps/rejected": -416.718994140625,
"loss": 0.5331,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2713123559951782,
"rewards/margins": 0.5642818212509155,
"rewards/rejected": -1.8355941772460938,
"step": 440
},
{
"epoch": 0.9419152276295133,
"grad_norm": 18.949969564615213,
"learning_rate": 4.8708793644441086e-09,
"logits/chosen": -0.5803197622299194,
"logits/rejected": -0.5434113144874573,
"logps/chosen": -410.92730712890625,
"logps/rejected": -468.50323486328125,
"loss": 0.534,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2884615659713745,
"rewards/margins": 0.7078900337219238,
"rewards/rejected": -1.9963515996932983,
"step": 450
},
{
"epoch": 0.9628466771323915,
"grad_norm": 22.7412601308732,
"learning_rate": 1.9347820230782295e-09,
"logits/chosen": -0.6422279477119446,
"logits/rejected": -0.5603567361831665,
"logps/chosen": -407.45465087890625,
"logps/rejected": -431.9442443847656,
"loss": 0.5408,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.2375915050506592,
"rewards/margins": 0.686954140663147,
"rewards/rejected": -1.9245456457138062,
"step": 460
},
{
"epoch": 0.9837781266352695,
"grad_norm": 17.505664359316036,
"learning_rate": 3.2839470889836627e-10,
"logits/chosen": -0.6509039998054504,
"logits/rejected": -0.6031205654144287,
"logps/chosen": -423.6900939941406,
"logps/rejected": -475.7867736816406,
"loss": 0.5134,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1757431030273438,
"rewards/margins": 0.6970219612121582,
"rewards/rejected": -1.8727651834487915,
"step": 470
},
{
"epoch": 0.9984301412872841,
"eval_logits/chosen": -0.5977884531021118,
"eval_logits/rejected": -0.5622259378433228,
"eval_logps/chosen": -401.6270751953125,
"eval_logps/rejected": -481.71246337890625,
"eval_loss": 0.5371974110603333,
"eval_rewards/accuracies": 0.765625,
"eval_rewards/chosen": -1.1911048889160156,
"eval_rewards/margins": 0.8110275268554688,
"eval_rewards/rejected": -2.0021324157714844,
"eval_runtime": 196.6339,
"eval_samples_per_second": 10.171,
"eval_steps_per_second": 0.163,
"step": 477
},
{
"epoch": 0.9984301412872841,
"step": 477,
"total_flos": 0.0,
"train_loss": 0.5847830807387954,
"train_runtime": 56722.3251,
"train_samples_per_second": 1.078,
"train_steps_per_second": 0.008
}
],
"logging_steps": 10,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}