Files
qwen2_7B-dis-wspo-full_E1/trainer_state.json
ModelHub XC 96f590edeb 初始化项目,由ModelHub XC社区提供模型
Model: pltops/qwen2_7B-dis-wspo-full_E1
Source: Original Platform
2026-05-18 04:22:56 +08:00

2717 lines
92 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998691270776077,
"eval_steps": 1000,
"global_step": 1910,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005234916895694281,
"grad_norm": 3.790904594664249,
"learning_rate": 5.2356020942408376e-08,
"logits/chosen": -1.074317216873169,
"logits/rejected": -1.2653461694717407,
"logps/chosen": -0.4452144503593445,
"logps/rejected": -0.6091843247413635,
"loss": 1.4151,
"rewards/accuracies": 0.3499999940395355,
"rewards/chosen": 4.849554898100905e-05,
"rewards/rejected": -3.19069076795131e-05,
"step": 10
},
{
"epoch": 0.010469833791388562,
"grad_norm": 4.012740047235236,
"learning_rate": 1.0471204188481675e-07,
"logits/chosen": -0.9944978952407837,
"logits/rejected": -1.1899915933609009,
"logps/chosen": -0.4916958212852478,
"logps/rejected": -0.6457526683807373,
"loss": 2.9024,
"rewards/accuracies": 0.5625,
"rewards/chosen": 5.545483509195037e-05,
"rewards/rejected": 4.941503357258625e-05,
"step": 20
},
{
"epoch": 0.015704750687082842,
"grad_norm": 7.854381323221543,
"learning_rate": 1.5706806282722514e-07,
"logits/chosen": -1.0470011234283447,
"logits/rejected": -1.308021068572998,
"logps/chosen": -0.4594300389289856,
"logps/rejected": -0.6046071648597717,
"loss": 1.2899,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -3.4633787436177954e-05,
"rewards/rejected": -0.00032249835203401744,
"step": 30
},
{
"epoch": 0.020939667582777124,
"grad_norm": 3.6452610558813654,
"learning_rate": 2.094240837696335e-07,
"logits/chosen": -1.1655104160308838,
"logits/rejected": -1.3452240228652954,
"logps/chosen": -0.39312028884887695,
"logps/rejected": -0.5556824207305908,
"loss": 2.2173,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -8.240896568167955e-05,
"rewards/rejected": -0.001136856502853334,
"step": 40
},
{
"epoch": 0.026174584478471406,
"grad_norm": 8.500475399649632,
"learning_rate": 2.6178010471204185e-07,
"logits/chosen": -1.1437081098556519,
"logits/rejected": -1.4434831142425537,
"logps/chosen": -0.43456870317459106,
"logps/rejected": -0.5780390501022339,
"loss": 1.2709,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.00010843189375009388,
"rewards/rejected": -0.001327984849922359,
"step": 50
},
{
"epoch": 0.031409501374165684,
"grad_norm": 20.273102817060455,
"learning_rate": 3.1413612565445027e-07,
"logits/chosen": -1.0886688232421875,
"logits/rejected": -1.2838691473007202,
"logps/chosen": -0.44190168380737305,
"logps/rejected": -0.6152902841567993,
"loss": 1.8135,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0007149001467041671,
"rewards/rejected": -0.005795066244900227,
"step": 60
},
{
"epoch": 0.036644418269859966,
"grad_norm": 43.16254505746086,
"learning_rate": 3.6649214659685864e-07,
"logits/chosen": -1.094167709350586,
"logits/rejected": -1.3319257497787476,
"logps/chosen": -0.4520147740840912,
"logps/rejected": -0.6239620447158813,
"loss": 1.7838,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.000759888265747577,
"rewards/rejected": -0.0066768391989171505,
"step": 70
},
{
"epoch": 0.04187933516555425,
"grad_norm": 11.998623689285427,
"learning_rate": 4.18848167539267e-07,
"logits/chosen": -1.2943776845932007,
"logits/rejected": -1.503999948501587,
"logps/chosen": -0.3685997724533081,
"logps/rejected": -0.5420706272125244,
"loss": 2.3502,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.0021785215940326452,
"rewards/rejected": -0.016134750097990036,
"step": 80
},
{
"epoch": 0.04711425206124853,
"grad_norm": 24.994033392121455,
"learning_rate": 4.712041884816754e-07,
"logits/chosen": -1.2916871309280396,
"logits/rejected": -1.3448667526245117,
"logps/chosen": -0.4025228023529053,
"logps/rejected": -0.5868708491325378,
"loss": 1.8316,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.007973430678248405,
"rewards/rejected": -0.02014215663075447,
"step": 90
},
{
"epoch": 0.05234916895694281,
"grad_norm": 22.171843701204743,
"learning_rate": 5.235602094240837e-07,
"logits/chosen": -1.4637973308563232,
"logits/rejected": -1.6466014385223389,
"logps/chosen": -0.32837918400764465,
"logps/rejected": -0.515870213508606,
"loss": 1.8409,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.01690755970776081,
"rewards/rejected": -0.050148021429777145,
"step": 100
},
{
"epoch": 0.057584085852637086,
"grad_norm": 41.913524204299165,
"learning_rate": 5.759162303664922e-07,
"logits/chosen": -1.4786913394927979,
"logits/rejected": -1.5899611711502075,
"logps/chosen": -0.42642760276794434,
"logps/rejected": -0.6284693479537964,
"loss": 2.2794,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.009459340013563633,
"rewards/rejected": -0.022919194772839546,
"step": 110
},
{
"epoch": 0.06281900274833137,
"grad_norm": 9.042363746985718,
"learning_rate": 6.282722513089005e-07,
"logits/chosen": -1.6049375534057617,
"logits/rejected": -1.756801962852478,
"logps/chosen": -0.47461098432540894,
"logps/rejected": -0.6772693991661072,
"loss": 0.8005,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.014842224307358265,
"rewards/rejected": -0.04914752393960953,
"step": 120
},
{
"epoch": 0.06805391964402566,
"grad_norm": 7.811632000475067,
"learning_rate": 6.806282722513089e-07,
"logits/chosen": -1.5858232975006104,
"logits/rejected": -1.7477552890777588,
"logps/chosen": -0.46297797560691833,
"logps/rejected": -0.6604179739952087,
"loss": 0.9989,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.0641578882932663,
"rewards/rejected": -0.09264906495809555,
"step": 130
},
{
"epoch": 0.07328883653971993,
"grad_norm": 15.65125219657381,
"learning_rate": 7.329842931937173e-07,
"logits/chosen": -1.5584498643875122,
"logits/rejected": -1.709670066833496,
"logps/chosen": -0.5157219171524048,
"logps/rejected": -0.7746927738189697,
"loss": 0.9063,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.09854600578546524,
"rewards/rejected": -0.13169622421264648,
"step": 140
},
{
"epoch": 0.0785237534354142,
"grad_norm": 44.24701426992428,
"learning_rate": 7.853403141361256e-07,
"logits/chosen": -1.7312242984771729,
"logits/rejected": -1.827368140220642,
"logps/chosen": -0.4762954115867615,
"logps/rejected": -0.7440527081489563,
"loss": 1.3761,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1354602426290512,
"rewards/rejected": -0.22626717388629913,
"step": 150
},
{
"epoch": 0.0837586703311085,
"grad_norm": 44.94371714749407,
"learning_rate": 8.37696335078534e-07,
"logits/chosen": -1.8007497787475586,
"logits/rejected": -1.8725961446762085,
"logps/chosen": -0.4830542504787445,
"logps/rejected": -0.7854688763618469,
"loss": 1.1893,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.07177692651748657,
"rewards/rejected": -0.15431641042232513,
"step": 160
},
{
"epoch": 0.08899358722680277,
"grad_norm": 31.115198767499926,
"learning_rate": 8.900523560209424e-07,
"logits/chosen": -1.9976160526275635,
"logits/rejected": -2.054600954055786,
"logps/chosen": -0.539434015750885,
"logps/rejected": -0.8588225245475769,
"loss": 1.3249,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.12421885877847672,
"rewards/rejected": -0.3051489591598511,
"step": 170
},
{
"epoch": 0.09422850412249706,
"grad_norm": 12.347508697538823,
"learning_rate": 9.424083769633508e-07,
"logits/chosen": -2.005187749862671,
"logits/rejected": -1.9666427373886108,
"logps/chosen": -0.5240647196769714,
"logps/rejected": -0.9816449880599976,
"loss": 1.2205,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.0668339729309082,
"rewards/rejected": -0.3174983561038971,
"step": 180
},
{
"epoch": 0.09946342101819133,
"grad_norm": 2.227548071196379,
"learning_rate": 9.947643979057591e-07,
"logits/chosen": -2.117922782897949,
"logits/rejected": -2.1063754558563232,
"logps/chosen": -0.6270676851272583,
"logps/rejected": -1.067392110824585,
"loss": 0.5393,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.05558066442608833,
"rewards/rejected": -0.2207319438457489,
"step": 190
},
{
"epoch": 0.10469833791388562,
"grad_norm": 7.953337477610359,
"learning_rate": 9.999323662872996e-07,
"logits/chosen": -2.4052085876464844,
"logits/rejected": -2.478701591491699,
"logps/chosen": -0.6346315145492554,
"logps/rejected": -1.130063772201538,
"loss": 0.4773,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.12082117795944214,
"rewards/rejected": -0.4421643316745758,
"step": 200
},
{
"epoch": 0.1099332548095799,
"grad_norm": 21.087545681911077,
"learning_rate": 9.996985942280678e-07,
"logits/chosen": -2.4521114826202393,
"logits/rejected": -2.5669069290161133,
"logps/chosen": -0.6473835110664368,
"logps/rejected": -1.1176555156707764,
"loss": 1.0686,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.11380704492330551,
"rewards/rejected": -0.4065336287021637,
"step": 210
},
{
"epoch": 0.11516817170527417,
"grad_norm": 15.632346870535166,
"learning_rate": 9.99297926897573e-07,
"logits/chosen": -2.7503812313079834,
"logits/rejected": -2.812741994857788,
"logps/chosen": -0.585160493850708,
"logps/rejected": -1.0489227771759033,
"loss": 0.2239,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.1262103021144867,
"rewards/rejected": -0.5222524404525757,
"step": 220
},
{
"epoch": 0.12040308860096846,
"grad_norm": 12.519053406194372,
"learning_rate": 9.987304981154493e-07,
"logits/chosen": -2.837965965270996,
"logits/rejected": -2.9414877891540527,
"logps/chosen": -0.722270131111145,
"logps/rejected": -1.3173713684082031,
"loss": 0.654,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.14661520719528198,
"rewards/rejected": -0.5090414881706238,
"step": 230
},
{
"epoch": 0.12563800549666274,
"grad_norm": 289.81551499279595,
"learning_rate": 9.979964973983e-07,
"logits/chosen": -2.9277539253234863,
"logits/rejected": -3.028458833694458,
"logps/chosen": -0.7722570300102234,
"logps/rejected": -1.3954808712005615,
"loss": 0.5797,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2518690526485443,
"rewards/rejected": -0.5512933135032654,
"step": 240
},
{
"epoch": 0.130872922392357,
"grad_norm": 7.898569451837497,
"learning_rate": 9.970961698964024e-07,
"logits/chosen": -2.903446912765503,
"logits/rejected": -2.9949727058410645,
"logps/chosen": -0.6554209589958191,
"logps/rejected": -1.3086931705474854,
"loss": 0.2079,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.12546400725841522,
"rewards/rejected": -0.540800929069519,
"step": 250
},
{
"epoch": 0.1361078392880513,
"grad_norm": 35.37908090460439,
"learning_rate": 9.960298163118284e-07,
"logits/chosen": -2.862393617630005,
"logits/rejected": -3.070669174194336,
"logps/chosen": -0.6949301958084106,
"logps/rejected": -1.335038185119629,
"loss": 0.2834,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.1361745148897171,
"rewards/rejected": -0.5548567771911621,
"step": 260
},
{
"epoch": 0.1413427561837456,
"grad_norm": 2.8575991741134716,
"learning_rate": 9.94797792798013e-07,
"logits/chosen": -3.01228666305542,
"logits/rejected": -3.385458469390869,
"logps/chosen": -0.6274330615997314,
"logps/rejected": -1.239761471748352,
"loss": 0.6421,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.14010125398635864,
"rewards/rejected": -0.600482702255249,
"step": 270
},
{
"epoch": 0.14657767307943986,
"grad_norm": 59.20898505824435,
"learning_rate": 9.934005108408016e-07,
"logits/chosen": -3.242931842803955,
"logits/rejected": -3.3713455200195312,
"logps/chosen": -0.7664941549301147,
"logps/rejected": -1.5041484832763672,
"loss": 0.2062,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.10625378042459488,
"rewards/rejected": -0.4088156819343567,
"step": 280
},
{
"epoch": 0.15181258997513414,
"grad_norm": 67.37145724737809,
"learning_rate": 9.918384371210175e-07,
"logits/chosen": -3.297367572784424,
"logits/rejected": -3.390179395675659,
"logps/chosen": -0.6818624138832092,
"logps/rejected": -1.3045790195465088,
"loss": 0.2825,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.14947417378425598,
"rewards/rejected": -0.5264440774917603,
"step": 290
},
{
"epoch": 0.1570475068708284,
"grad_norm": 6.991708239819734,
"learning_rate": 9.901120933585937e-07,
"logits/chosen": -2.914552688598633,
"logits/rejected": -2.9296183586120605,
"logps/chosen": -0.7332116961479187,
"logps/rejected": -1.3207851648330688,
"loss": 0.2746,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1801702082157135,
"rewards/rejected": -0.4402903616428375,
"step": 300
},
{
"epoch": 0.16228242376652272,
"grad_norm": 5.467852048052765,
"learning_rate": 9.882220561383237e-07,
"logits/chosen": -2.6258440017700195,
"logits/rejected": -2.7410550117492676,
"logps/chosen": -0.6673339605331421,
"logps/rejected": -1.259270429611206,
"loss": 0.2535,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.1613113135099411,
"rewards/rejected": -0.5873299837112427,
"step": 310
},
{
"epoch": 0.167517340662217,
"grad_norm": 3.8941661088693595,
"learning_rate": 9.861689567172849e-07,
"logits/chosen": -2.7174181938171387,
"logits/rejected": -2.859903573989868,
"logps/chosen": -0.7855610251426697,
"logps/rejected": -1.3645145893096924,
"loss": 0.571,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.11017797142267227,
"rewards/rejected": -0.33003589510917664,
"step": 320
},
{
"epoch": 0.17275225755791127,
"grad_norm": 16.181666110930603,
"learning_rate": 9.839534808140065e-07,
"logits/chosen": -2.7446448802948,
"logits/rejected": -2.9309000968933105,
"logps/chosen": -0.6837766170501709,
"logps/rejected": -1.2414804697036743,
"loss": 0.4195,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.17641454935073853,
"rewards/rejected": -0.6049523949623108,
"step": 330
},
{
"epoch": 0.17798717445360554,
"grad_norm": 6.6623417517030274,
"learning_rate": 9.815763683794431e-07,
"logits/chosen": -3.110708713531494,
"logits/rejected": -3.232154130935669,
"logps/chosen": -0.9199361801147461,
"logps/rejected": -1.6282669305801392,
"loss": 0.3528,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.16633550822734833,
"rewards/rejected": -0.5285095572471619,
"step": 340
},
{
"epoch": 0.18322209134929984,
"grad_norm": 42.75001280615641,
"learning_rate": 9.790384133498377e-07,
"logits/chosen": -3.181398868560791,
"logits/rejected": -3.380305528640747,
"logps/chosen": -0.6931721568107605,
"logps/rejected": -1.301574468612671,
"loss": 0.4167,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.238613560795784,
"rewards/rejected": -0.5942808985710144,
"step": 350
},
{
"epoch": 0.18845700824499412,
"grad_norm": 25.407488562124865,
"learning_rate": 9.763404633815536e-07,
"logits/chosen": -3.138686418533325,
"logits/rejected": -3.3759498596191406,
"logps/chosen": -0.7499098777770996,
"logps/rejected": -1.2932870388031006,
"loss": 0.1701,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.13701362907886505,
"rewards/rejected": -0.4852725863456726,
"step": 360
},
{
"epoch": 0.1936919251406884,
"grad_norm": 1.4892370693809598,
"learning_rate": 9.73483419567964e-07,
"logits/chosen": -3.4019501209259033,
"logits/rejected": -3.549142837524414,
"logps/chosen": -0.7944781184196472,
"logps/rejected": -1.424883484840393,
"loss": 0.1402,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.12177709490060806,
"rewards/rejected": -0.3804728090763092,
"step": 370
},
{
"epoch": 0.19892684203638267,
"grad_norm": 54.39502830839909,
"learning_rate": 9.70468236138494e-07,
"logits/chosen": -3.2426352500915527,
"logits/rejected": -3.4838757514953613,
"logps/chosen": -0.6787932515144348,
"logps/rejected": -1.1444861888885498,
"loss": 0.5084,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.1906341016292572,
"rewards/rejected": -0.5342355966567993,
"step": 380
},
{
"epoch": 0.20416175893207694,
"grad_norm": 4.102532676377055,
"learning_rate": 9.672959201399155e-07,
"logits/chosen": -3.1436872482299805,
"logits/rejected": -3.3000025749206543,
"logps/chosen": -0.6593716144561768,
"logps/rejected": -1.2297086715698242,
"loss": 0.2254,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.1245049387216568,
"rewards/rejected": -0.63857102394104,
"step": 390
},
{
"epoch": 0.20939667582777124,
"grad_norm": 26.264208746234342,
"learning_rate": 9.639675311000027e-07,
"logits/chosen": -2.8664770126342773,
"logits/rejected": -3.217289686203003,
"logps/chosen": -0.5279222726821899,
"logps/rejected": -0.9587762951850891,
"loss": 0.2976,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.08593594282865524,
"rewards/rejected": -0.5586038827896118,
"step": 400
},
{
"epoch": 0.21463159272346552,
"grad_norm": 38.41571021062487,
"learning_rate": 9.60484180673657e-07,
"logits/chosen": -3.0221009254455566,
"logits/rejected": -3.3375930786132812,
"logps/chosen": -0.6153509020805359,
"logps/rejected": -1.208428144454956,
"loss": 0.2007,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1186663880944252,
"rewards/rejected": -0.7561261057853699,
"step": 410
},
{
"epoch": 0.2198665096191598,
"grad_norm": 26.129325664394095,
"learning_rate": 9.568470322716246e-07,
"logits/chosen": -3.197547435760498,
"logits/rejected": -3.473541736602783,
"logps/chosen": -0.7100402116775513,
"logps/rejected": -1.3731144666671753,
"loss": 0.2129,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.15885800123214722,
"rewards/rejected": -0.7041777968406677,
"step": 420
},
{
"epoch": 0.22510142651485407,
"grad_norm": 3.3568794858215614,
"learning_rate": 9.530573006719263e-07,
"logits/chosen": -3.1833243370056152,
"logits/rejected": -3.5457568168640137,
"logps/chosen": -0.7644280195236206,
"logps/rejected": -1.4675564765930176,
"loss": 0.3047,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.10174532234668732,
"rewards/rejected": -0.43558469414711,
"step": 430
},
{
"epoch": 0.23033634341054834,
"grad_norm": 6.858790048554929,
"learning_rate": 9.491162516141307e-07,
"logits/chosen": -3.0963964462280273,
"logits/rejected": -3.347712993621826,
"logps/chosen": -0.7016817927360535,
"logps/rejected": -1.3338514566421509,
"loss": 0.3073,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.09215731918811798,
"rewards/rejected": -0.45697230100631714,
"step": 440
},
{
"epoch": 0.23557126030624265,
"grad_norm": 23.530428559730304,
"learning_rate": 9.450252013766092e-07,
"logits/chosen": -3.2469124794006348,
"logits/rejected": -3.4776294231414795,
"logps/chosen": -0.7150281667709351,
"logps/rejected": -1.317091703414917,
"loss": 0.2759,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.11948645114898682,
"rewards/rejected": -0.5289221405982971,
"step": 450
},
{
"epoch": 0.24080617720193692,
"grad_norm": 3.0816780004971567,
"learning_rate": 9.407855163369078e-07,
"logits/chosen": -3.1366970539093018,
"logits/rejected": -3.4390456676483154,
"logps/chosen": -0.7695866823196411,
"logps/rejected": -1.28377366065979,
"loss": 0.1834,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.10271289199590683,
"rewards/rejected": -0.4681883454322815,
"step": 460
},
{
"epoch": 0.2460410940976312,
"grad_norm": 0.8968075253326218,
"learning_rate": 9.3639861251539e-07,
"logits/chosen": -3.204808473587036,
"logits/rejected": -3.410945415496826,
"logps/chosen": -0.7953635454177856,
"logps/rejected": -1.4356180429458618,
"loss": 0.2817,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.11539553105831146,
"rewards/rejected": -0.5434930324554443,
"step": 470
},
{
"epoch": 0.25127601099332547,
"grad_norm": 15.250575277553056,
"learning_rate": 9.318659551022955e-07,
"logits/chosen": -3.5916152000427246,
"logits/rejected": -3.745251417160034,
"logps/chosen": -0.7927001118659973,
"logps/rejected": -1.4079248905181885,
"loss": 0.3906,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.14077363908290863,
"rewards/rejected": -0.6647804379463196,
"step": 480
},
{
"epoch": 0.25651092788901975,
"grad_norm": 8.302573476788423,
"learning_rate": 9.271890579683804e-07,
"logits/chosen": -3.6112866401672363,
"logits/rejected": -3.7867329120635986,
"logps/chosen": -0.7936150431632996,
"logps/rejected": -1.3792264461517334,
"loss": 0.155,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.1597827672958374,
"rewards/rejected": -0.7419155240058899,
"step": 490
},
{
"epoch": 0.261745844784714,
"grad_norm": 19.15043456191315,
"learning_rate": 9.223694831592952e-07,
"logits/chosen": -3.4215950965881348,
"logits/rejected": -3.5204520225524902,
"logps/chosen": -0.6265555024147034,
"logps/rejected": -1.1579577922821045,
"loss": 0.465,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.20739369094371796,
"rewards/rejected": -0.7289354205131531,
"step": 500
},
{
"epoch": 0.2669807616804083,
"grad_norm": 64.7184486122584,
"learning_rate": 9.174088403738755e-07,
"logits/chosen": -3.099766969680786,
"logits/rejected": -3.3204002380371094,
"logps/chosen": -0.690481424331665,
"logps/rejected": -1.2163944244384766,
"loss": 0.3773,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.13189923763275146,
"rewards/rejected": -0.520399808883667,
"step": 510
},
{
"epoch": 0.2722156785761026,
"grad_norm": 17.501316331807786,
"learning_rate": 9.123087864265147e-07,
"logits/chosen": -3.1762442588806152,
"logits/rejected": -3.2784526348114014,
"logps/chosen": -0.6847952008247375,
"logps/rejected": -1.1162774562835693,
"loss": 0.1267,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.06572765856981277,
"rewards/rejected": -0.351901113986969,
"step": 520
},
{
"epoch": 0.2774505954717969,
"grad_norm": 14.212473293142173,
"learning_rate": 9.070710246938016e-07,
"logits/chosen": -3.2481791973114014,
"logits/rejected": -3.489774227142334,
"logps/chosen": -0.772381603717804,
"logps/rejected": -1.4248247146606445,
"loss": 0.2427,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.09138830751180649,
"rewards/rejected": -0.4058937132358551,
"step": 530
},
{
"epoch": 0.2826855123674912,
"grad_norm": 3.7993918380305227,
"learning_rate": 9.016973045456073e-07,
"logits/chosen": -3.5233864784240723,
"logits/rejected": -3.5879790782928467,
"logps/chosen": -0.6680731773376465,
"logps/rejected": -1.2521740198135376,
"loss": 0.233,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.16955754160881042,
"rewards/rejected": -0.6247085928916931,
"step": 540
},
{
"epoch": 0.28792042926318545,
"grad_norm": 31.268425824083035,
"learning_rate": 8.961894207608087e-07,
"logits/chosen": -3.3877577781677246,
"logits/rejected": -3.6265366077423096,
"logps/chosen": -0.7721540331840515,
"logps/rejected": -1.4250587224960327,
"loss": 0.2769,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.16779252886772156,
"rewards/rejected": -0.6825228333473206,
"step": 550
},
{
"epoch": 0.2931553461588797,
"grad_norm": 7.718273564892154,
"learning_rate": 8.905492129278477e-07,
"logits/chosen": -3.29071044921875,
"logits/rejected": -3.561675548553467,
"logps/chosen": -0.8162961006164551,
"logps/rejected": -1.4463467597961426,
"loss": 0.1915,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.204677551984787,
"rewards/rejected": -0.47629499435424805,
"step": 560
},
{
"epoch": 0.298390263054574,
"grad_norm": 12.025693422800462,
"learning_rate": 8.847785648303233e-07,
"logits/chosen": -3.204369306564331,
"logits/rejected": -3.2629711627960205,
"logps/chosen": -0.7215951085090637,
"logps/rejected": -1.2496535778045654,
"loss": 0.3283,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.17979201674461365,
"rewards/rejected": -0.6653040647506714,
"step": 570
},
{
"epoch": 0.3036251799502683,
"grad_norm": 16.91677605832855,
"learning_rate": 8.788794038178232e-07,
"logits/chosen": -3.354393482208252,
"logits/rejected": -3.566246747970581,
"logps/chosen": -0.6297786831855774,
"logps/rejected": -1.2458020448684692,
"loss": 0.4011,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.1064259260892868,
"rewards/rejected": -0.6175190210342407,
"step": 580
},
{
"epoch": 0.30886009684596255,
"grad_norm": 3.0634420462864362,
"learning_rate": 8.728537001622049e-07,
"logits/chosen": -3.4152801036834717,
"logits/rejected": -3.7001967430114746,
"logps/chosen": -0.6182211637496948,
"logps/rejected": -1.2031091451644897,
"loss": 0.3326,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.12622275948524475,
"rewards/rejected": -0.6566742062568665,
"step": 590
},
{
"epoch": 0.3140950137416568,
"grad_norm": 35.90548539348865,
"learning_rate": 8.667034663995408e-07,
"logits/chosen": -3.3982882499694824,
"logits/rejected": -3.6578991413116455,
"logps/chosen": -0.6449892520904541,
"logps/rejected": -1.2093414068222046,
"loss": 0.3771,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.0947725772857666,
"rewards/rejected": -0.46876105666160583,
"step": 600
},
{
"epoch": 0.31932993063735116,
"grad_norm": 1.5338911609375667,
"learning_rate": 8.604307566579472e-07,
"logits/chosen": -3.5967631340026855,
"logits/rejected": -3.748669385910034,
"logps/chosen": -0.759170651435852,
"logps/rejected": -1.21732497215271,
"loss": 0.0886,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.12660792469978333,
"rewards/rejected": -0.35170167684555054,
"step": 610
},
{
"epoch": 0.32456484753304543,
"grad_norm": 3.030605610103173,
"learning_rate": 8.540376659715225e-07,
"logits/chosen": -3.6591286659240723,
"logits/rejected": -3.9070792198181152,
"logps/chosen": -0.6764650344848633,
"logps/rejected": -1.1898220777511597,
"loss": 0.1434,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.1459110528230667,
"rewards/rejected": -0.43774351477622986,
"step": 620
},
{
"epoch": 0.3297997644287397,
"grad_norm": 10.069918354827117,
"learning_rate": 8.47526329580623e-07,
"logits/chosen": -3.5482306480407715,
"logits/rejected": -3.7556564807891846,
"logps/chosen": -0.6558570265769958,
"logps/rejected": -1.2123215198516846,
"loss": 0.516,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.10284946113824844,
"rewards/rejected": -0.5016738772392273,
"step": 630
},
{
"epoch": 0.335034681324434,
"grad_norm": 3.5244932761338124,
"learning_rate": 8.408989222187096e-07,
"logits/chosen": -3.4110941886901855,
"logits/rejected": -3.6678364276885986,
"logps/chosen": -0.6549906730651855,
"logps/rejected": -1.246897578239441,
"loss": 0.7311,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.10875538736581802,
"rewards/rejected": -0.5850102305412292,
"step": 640
},
{
"epoch": 0.34026959822012826,
"grad_norm": 4.0414984437360175,
"learning_rate": 8.341576573860047e-07,
"logits/chosen": -3.478461503982544,
"logits/rejected": -3.702072858810425,
"logps/chosen": -0.7687514424324036,
"logps/rejected": -1.43941330909729,
"loss": 0.2588,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.16434721648693085,
"rewards/rejected": -0.5769973993301392,
"step": 650
},
{
"epoch": 0.34550451511582253,
"grad_norm": 17.089029660346316,
"learning_rate": 8.27304786610201e-07,
"logits/chosen": -3.6008193492889404,
"logits/rejected": -3.9538185596466064,
"logps/chosen": -0.6982223987579346,
"logps/rejected": -1.2972527742385864,
"loss": 0.2549,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.0916595607995987,
"rewards/rejected": -0.38794606924057007,
"step": 660
},
{
"epoch": 0.3507394320115168,
"grad_norm": 23.354286685814746,
"learning_rate": 8.203425986944696e-07,
"logits/chosen": -3.7454254627227783,
"logits/rejected": -3.954153537750244,
"logps/chosen": -0.6409385800361633,
"logps/rejected": -1.1634663343429565,
"loss": 0.1437,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.09134040772914886,
"rewards/rejected": -0.4564918577671051,
"step": 670
},
{
"epoch": 0.3559743489072111,
"grad_norm": 12.016594060247717,
"learning_rate": 8.132734189530182e-07,
"logits/chosen": -3.7062535285949707,
"logits/rejected": -3.933081865310669,
"logps/chosen": -0.5595335960388184,
"logps/rejected": -1.047524333000183,
"loss": 0.1974,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.20238538086414337,
"rewards/rejected": -0.5873401165008545,
"step": 680
},
{
"epoch": 0.36120926580290535,
"grad_norm": 13.607141196369907,
"learning_rate": 8.060996084344553e-07,
"logits/chosen": -3.6081855297088623,
"logits/rejected": -3.7106146812438965,
"logps/chosen": -0.7058667540550232,
"logps/rejected": -1.333534836769104,
"loss": 0.2337,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.20908991992473602,
"rewards/rejected": -0.5528732538223267,
"step": 690
},
{
"epoch": 0.3664441826985997,
"grad_norm": 9.343448893616527,
"learning_rate": 7.98823563133219e-07,
"logits/chosen": -3.7106995582580566,
"logits/rejected": -3.8569560050964355,
"logps/chosen": -0.5677663087844849,
"logps/rejected": -1.077682614326477,
"loss": 0.1728,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.1268927901983261,
"rewards/rejected": -0.4453812539577484,
"step": 700
},
{
"epoch": 0.37167909959429396,
"grad_norm": 1.674723781078412,
"learning_rate": 7.914477131893342e-07,
"logits/chosen": -3.6300597190856934,
"logits/rejected": -3.8654580116271973,
"logps/chosen": -0.6926698684692383,
"logps/rejected": -1.3483922481536865,
"loss": 0.2708,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.11889272928237915,
"rewards/rejected": -0.7035388350486755,
"step": 710
},
{
"epoch": 0.37691401648998824,
"grad_norm": 26.516721992313247,
"learning_rate": 7.839745220767661e-07,
"logits/chosen": -3.356396436691284,
"logits/rejected": -3.629464626312256,
"logps/chosen": -0.6213072538375854,
"logps/rejected": -1.278612494468689,
"loss": 0.4189,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.17910563945770264,
"rewards/rejected": -0.6009346842765808,
"step": 720
},
{
"epoch": 0.3821489333856825,
"grad_norm": 29.551216070368934,
"learning_rate": 7.764064857806389e-07,
"logits/chosen": -3.349151611328125,
"logits/rejected": -3.5175952911376953,
"logps/chosen": -0.6614469289779663,
"logps/rejected": -1.1986842155456543,
"loss": 0.3052,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.1393895149230957,
"rewards/rejected": -0.49458661675453186,
"step": 730
},
{
"epoch": 0.3873838502813768,
"grad_norm": 25.357071934733334,
"learning_rate": 7.68746131963598e-07,
"logits/chosen": -3.4714951515197754,
"logits/rejected": -3.677035093307495,
"logps/chosen": -0.620179295539856,
"logps/rejected": -1.2890572547912598,
"loss": 0.228,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.13783490657806396,
"rewards/rejected": -0.794097363948822,
"step": 740
},
{
"epoch": 0.39261876717707106,
"grad_norm": 25.198350240467338,
"learning_rate": 7.609960191215909e-07,
"logits/chosen": -3.4828593730926514,
"logits/rejected": -3.795466899871826,
"logps/chosen": -0.6967580318450928,
"logps/rejected": -1.377361536026001,
"loss": 0.3695,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.19926968216896057,
"rewards/rejected": -0.9696201086044312,
"step": 750
},
{
"epoch": 0.39785368407276533,
"grad_norm": 36.505889073660846,
"learning_rate": 7.531587357293505e-07,
"logits/chosen": -3.595017910003662,
"logits/rejected": -3.8477072715759277,
"logps/chosen": -0.7997711896896362,
"logps/rejected": -1.3395113945007324,
"loss": 0.1863,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.1399848610162735,
"rewards/rejected": -0.5361682176589966,
"step": 760
},
{
"epoch": 0.4030886009684596,
"grad_norm": 9.852370874435213,
"learning_rate": 7.452368993758645e-07,
"logits/chosen": -3.4401755332946777,
"logits/rejected": -3.7114880084991455,
"logps/chosen": -0.6346908211708069,
"logps/rejected": -1.3159992694854736,
"loss": 0.2026,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.16581778228282928,
"rewards/rejected": -0.7039278745651245,
"step": 770
},
{
"epoch": 0.4083235178641539,
"grad_norm": 39.35986226984939,
"learning_rate": 7.372331558901237e-07,
"logits/chosen": -3.411632537841797,
"logits/rejected": -3.6088695526123047,
"logps/chosen": -0.6711673140525818,
"logps/rejected": -1.2047076225280762,
"loss": 0.1863,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1048843041062355,
"rewards/rejected": -0.6397637128829956,
"step": 780
},
{
"epoch": 0.4135584347598482,
"grad_norm": 7.036281988177846,
"learning_rate": 7.291501784574355e-07,
"logits/chosen": -3.5028297901153564,
"logits/rejected": -3.7100181579589844,
"logps/chosen": -0.6011011600494385,
"logps/rejected": -1.1766241788864136,
"loss": 0.3423,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.12209127098321915,
"rewards/rejected": -0.5233365893363953,
"step": 790
},
{
"epoch": 0.4187933516555425,
"grad_norm": 15.877033932538007,
"learning_rate": 7.209906667266017e-07,
"logits/chosen": -3.6198973655700684,
"logits/rejected": -3.881483793258667,
"logps/chosen": -0.7075928449630737,
"logps/rejected": -1.1975640058517456,
"loss": 0.1767,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.13878247141838074,
"rewards/rejected": -0.5438691973686218,
"step": 800
},
{
"epoch": 0.42402826855123676,
"grad_norm": 8.32857274649763,
"learning_rate": 7.12757345908258e-07,
"logits/chosen": -3.478787660598755,
"logits/rejected": -3.5588173866271973,
"logps/chosen": -0.5804450511932373,
"logps/rejected": -1.1082279682159424,
"loss": 0.1955,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.10243809223175049,
"rewards/rejected": -0.5699917674064636,
"step": 810
},
{
"epoch": 0.42926318544693104,
"grad_norm": 1.971725698476674,
"learning_rate": 7.044529658646761e-07,
"logits/chosen": -3.325711488723755,
"logits/rejected": -3.489297866821289,
"logps/chosen": -0.6689733266830444,
"logps/rejected": -1.2249457836151123,
"loss": 0.1751,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.15426844358444214,
"rewards/rejected": -0.5962169170379639,
"step": 820
},
{
"epoch": 0.4344981023426253,
"grad_norm": 6.500882333020027,
"learning_rate": 6.960803001913314e-07,
"logits/chosen": -3.3526389598846436,
"logits/rejected": -3.616393566131592,
"logps/chosen": -0.6438730955123901,
"logps/rejected": -1.241818904876709,
"loss": 0.241,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1811794489622116,
"rewards/rejected": -0.5559738874435425,
"step": 830
},
{
"epoch": 0.4397330192383196,
"grad_norm": 1.5485575267591558,
"learning_rate": 6.876421452905448e-07,
"logits/chosen": -3.6445419788360596,
"logits/rejected": -3.844398021697998,
"logps/chosen": -0.6539190411567688,
"logps/rejected": -1.1970140933990479,
"loss": 0.1112,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.12994791567325592,
"rewards/rejected": -0.5939737558364868,
"step": 840
},
{
"epoch": 0.44496793613401386,
"grad_norm": 8.846891414177467,
"learning_rate": 6.791413194375076e-07,
"logits/chosen": -3.665837049484253,
"logits/rejected": -4.038450717926025,
"logps/chosen": -0.6240882873535156,
"logps/rejected": -1.2376606464385986,
"loss": 0.1749,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.13635075092315674,
"rewards/rejected": -0.5054816007614136,
"step": 850
},
{
"epoch": 0.45020285302970814,
"grad_norm": 50.539531832216845,
"learning_rate": 6.705806618389997e-07,
"logits/chosen": -3.5491700172424316,
"logits/rejected": -3.891484498977661,
"logps/chosen": -0.6607200503349304,
"logps/rejected": -1.2415850162506104,
"loss": 0.2544,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.11792077124118805,
"rewards/rejected": -0.7187885642051697,
"step": 860
},
{
"epoch": 0.4554377699254024,
"grad_norm": 2.281104923342322,
"learning_rate": 6.619630316851182e-07,
"logits/chosen": -3.623032331466675,
"logits/rejected": -3.9570438861846924,
"logps/chosen": -0.5454970598220825,
"logps/rejected": -1.008527398109436,
"loss": 0.4412,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.11349859088659286,
"rewards/rejected": -0.49833065271377563,
"step": 870
},
{
"epoch": 0.4606726868210967,
"grad_norm": 21.825427920475462,
"learning_rate": 6.532913071943307e-07,
"logits/chosen": -3.726950168609619,
"logits/rejected": -3.893709182739258,
"logps/chosen": -0.7641295194625854,
"logps/rejected": -1.277066946029663,
"loss": 0.0858,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.11578428745269775,
"rewards/rejected": -0.31977829337120056,
"step": 880
},
{
"epoch": 0.465907603716791,
"grad_norm": 10.863674979284927,
"learning_rate": 6.445683846521738e-07,
"logits/chosen": -3.648641586303711,
"logits/rejected": -3.897275447845459,
"logps/chosen": -0.7207273244857788,
"logps/rejected": -1.2213026285171509,
"loss": 0.1486,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.11959713697433472,
"rewards/rejected": -0.580722987651825,
"step": 890
},
{
"epoch": 0.4711425206124853,
"grad_norm": 11.024990455629961,
"learning_rate": 6.357971774439177e-07,
"logits/chosen": -3.67216157913208,
"logits/rejected": -3.8727009296417236,
"logps/chosen": -0.5953903198242188,
"logps/rejected": -1.187514066696167,
"loss": 0.3925,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.162687286734581,
"rewards/rejected": -0.5742394328117371,
"step": 900
},
{
"epoch": 0.47637743750817957,
"grad_norm": 9.020917089934892,
"learning_rate": 6.269806150815187e-07,
"logits/chosen": -3.646181583404541,
"logits/rejected": -3.939856767654419,
"logps/chosen": -0.6697233319282532,
"logps/rejected": -1.2958990335464478,
"loss": 0.14,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.1477135419845581,
"rewards/rejected": -0.5787355899810791,
"step": 910
},
{
"epoch": 0.48161235440387384,
"grad_norm": 17.07185019161522,
"learning_rate": 6.181216422251862e-07,
"logits/chosen": -3.652355909347534,
"logits/rejected": -3.9332756996154785,
"logps/chosen": -0.6171292066574097,
"logps/rejected": -1.1538760662078857,
"loss": 0.1442,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.1485549807548523,
"rewards/rejected": -0.5386639833450317,
"step": 920
},
{
"epoch": 0.4868472712995681,
"grad_norm": 11.35798051607949,
"learning_rate": 6.092232176998897e-07,
"logits/chosen": -3.3274683952331543,
"logits/rejected": -3.683140993118286,
"logps/chosen": -0.6949746012687683,
"logps/rejected": -1.2979357242584229,
"loss": 0.2627,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.10100005567073822,
"rewards/rejected": -0.40933284163475037,
"step": 930
},
{
"epoch": 0.4920821881952624,
"grad_norm": 7.48204999503568,
"learning_rate": 6.002883135071362e-07,
"logits/chosen": -3.6361114978790283,
"logits/rejected": -3.845881700515747,
"logps/chosen": -0.6067990064620972,
"logps/rejected": -1.1521469354629517,
"loss": 0.2342,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.08356883376836777,
"rewards/rejected": -0.3899988830089569,
"step": 940
},
{
"epoch": 0.49731710509095667,
"grad_norm": 9.65932504460624,
"learning_rate": 5.913199138323448e-07,
"logits/chosen": -3.4517874717712402,
"logits/rejected": -3.6863322257995605,
"logps/chosen": -0.6873298287391663,
"logps/rejected": -1.2405993938446045,
"loss": 0.0732,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.13713274896144867,
"rewards/rejected": -0.5883907079696655,
"step": 950
},
{
"epoch": 0.5025520219866509,
"grad_norm": 6.954926765138611,
"learning_rate": 5.82321014048154e-07,
"logits/chosen": -3.531513214111328,
"logits/rejected": -3.8438732624053955,
"logps/chosen": -0.6925119161605835,
"logps/rejected": -1.3587197065353394,
"loss": 0.2015,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.12155506759881973,
"rewards/rejected": -0.5510644912719727,
"step": 960
},
{
"epoch": 0.5077869388823453,
"grad_norm": 29.092706700807746,
"learning_rate": 5.732946197139906e-07,
"logits/chosen": -3.543038845062256,
"logits/rejected": -3.7100837230682373,
"logps/chosen": -0.6176570057868958,
"logps/rejected": -1.1735661029815674,
"loss": 0.1762,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.11741694062948227,
"rewards/rejected": -0.5648257732391357,
"step": 970
},
{
"epoch": 0.5130218557780395,
"grad_norm": 15.364134485434684,
"learning_rate": 5.642437455722381e-07,
"logits/chosen": -3.527390718460083,
"logits/rejected": -3.6467444896698,
"logps/chosen": -0.5736885070800781,
"logps/rejected": -1.1105291843414307,
"loss": 0.1348,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.1023680791258812,
"rewards/rejected": -0.5240501761436462,
"step": 980
},
{
"epoch": 0.5182567726737338,
"grad_norm": 10.7420392782178,
"learning_rate": 5.551714145413368e-07,
"logits/chosen": -3.633018970489502,
"logits/rejected": -3.8861050605773926,
"logps/chosen": -0.6110260486602783,
"logps/rejected": -1.197975516319275,
"loss": 0.1833,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.2393743246793747,
"rewards/rejected": -0.8618858456611633,
"step": 990
},
{
"epoch": 0.523491689569428,
"grad_norm": 29.652979248675152,
"learning_rate": 5.460806567061533e-07,
"logits/chosen": -3.5295844078063965,
"logits/rejected": -3.756152629852295,
"logps/chosen": -0.6682409048080444,
"logps/rejected": -1.2239763736724854,
"loss": 0.1562,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.061064548790454865,
"rewards/rejected": -0.29901862144470215,
"step": 1000
},
{
"epoch": 0.5287266064651224,
"grad_norm": 4.935111757809973,
"learning_rate": 5.369745083059577e-07,
"logits/chosen": -3.706066608428955,
"logits/rejected": -3.871903657913208,
"logps/chosen": -0.8551700711250305,
"logps/rejected": -1.495203971862793,
"loss": 0.09,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.08738512545824051,
"rewards/rejected": -0.4607165455818176,
"step": 1010
},
{
"epoch": 0.5339615233608166,
"grad_norm": 14.950139526881932,
"learning_rate": 5.278560107203437e-07,
"logits/chosen": -3.6445841789245605,
"logits/rejected": -4.002659320831299,
"logps/chosen": -0.678175151348114,
"logps/rejected": -1.2137267589569092,
"loss": 0.164,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.16553157567977905,
"rewards/rejected": -0.7228254079818726,
"step": 1020
},
{
"epoch": 0.5391964402565109,
"grad_norm": 10.883180795778303,
"learning_rate": 5.18728209453432e-07,
"logits/chosen": -3.707653760910034,
"logits/rejected": -3.845078229904175,
"logps/chosen": -0.6454753875732422,
"logps/rejected": -1.1513105630874634,
"loss": 0.1559,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.1302897185087204,
"rewards/rejected": -0.5494991540908813,
"step": 1030
},
{
"epoch": 0.5444313571522053,
"grad_norm": 33.05212121743528,
"learning_rate": 5.095941531166982e-07,
"logits/chosen": -3.78800630569458,
"logits/rejected": -4.111274242401123,
"logps/chosen": -0.650190532207489,
"logps/rejected": -1.2724605798721313,
"loss": 0.3122,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.16642943024635315,
"rewards/rejected": -0.7723706960678101,
"step": 1040
},
{
"epoch": 0.5496662740478995,
"grad_norm": 25.0104518604515,
"learning_rate": 5.004568924107598e-07,
"logits/chosen": -3.5907185077667236,
"logits/rejected": -3.9407639503479004,
"logps/chosen": -0.6616253852844238,
"logps/rejected": -1.3152577877044678,
"loss": 0.3186,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.16016852855682373,
"rewards/rejected": -0.4862311780452728,
"step": 1050
},
{
"epoch": 0.5549011909435938,
"grad_norm": 31.981933797923496,
"learning_rate": 4.913194791064675e-07,
"logits/chosen": -3.687349796295166,
"logits/rejected": -3.9528133869171143,
"logps/chosen": -0.8306125402450562,
"logps/rejected": -1.4087917804718018,
"loss": 0.362,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.1095658391714096,
"rewards/rejected": -0.4266139566898346,
"step": 1060
},
{
"epoch": 0.560136107839288,
"grad_norm": 43.09016084350836,
"learning_rate": 4.82184965025639e-07,
"logits/chosen": -3.7304611206054688,
"logits/rejected": -3.975867748260498,
"logps/chosen": -0.6367403268814087,
"logps/rejected": -1.249473214149475,
"loss": 0.2017,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.23390457034111023,
"rewards/rejected": -0.6657984256744385,
"step": 1070
},
{
"epoch": 0.5653710247349824,
"grad_norm": 14.680677683118907,
"learning_rate": 4.73056401021775e-07,
"logits/chosen": -3.6824183464050293,
"logits/rejected": -3.9073410034179688,
"logps/chosen": -0.7108127474784851,
"logps/rejected": -1.4023791551589966,
"loss": 0.479,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.14911451935768127,
"rewards/rejected": -0.5604814291000366,
"step": 1080
},
{
"epoch": 0.5706059416306766,
"grad_norm": 15.630100484670548,
"learning_rate": 4.639368359610982e-07,
"logits/chosen": -3.694349765777588,
"logits/rejected": -4.01293420791626,
"logps/chosen": -0.7198182344436646,
"logps/rejected": -1.2933294773101807,
"loss": 0.3266,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.21065545082092285,
"rewards/rejected": -0.7207514047622681,
"step": 1090
},
{
"epoch": 0.5758408585263709,
"grad_norm": 13.347415996971426,
"learning_rate": 4.5482931570425803e-07,
"logits/chosen": -3.7276809215545654,
"logits/rejected": -4.011933326721191,
"logps/chosen": -0.6607510447502136,
"logps/rejected": -1.281141757965088,
"loss": 0.1704,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.17264564335346222,
"rewards/rejected": -0.6156941652297974,
"step": 1100
},
{
"epoch": 0.5810757754220651,
"grad_norm": 12.382613041230709,
"learning_rate": 4.4573688208903686e-07,
"logits/chosen": -3.6291985511779785,
"logits/rejected": -3.9804370403289795,
"logps/chosen": -0.6637237071990967,
"logps/rejected": -1.391247034072876,
"loss": 0.2221,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.15261736512184143,
"rewards/rejected": -0.8548731803894043,
"step": 1110
},
{
"epoch": 0.5863106923177595,
"grad_norm": 7.303791497022784,
"learning_rate": 4.366625719144016e-07,
"logits/chosen": -3.506834030151367,
"logits/rejected": -3.755375623703003,
"logps/chosen": -0.7788494825363159,
"logps/rejected": -1.385221004486084,
"loss": 0.2242,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.10770156234502792,
"rewards/rejected": -0.43017715215682983,
"step": 1120
},
{
"epoch": 0.5915456092134538,
"grad_norm": 14.381226143020704,
"learning_rate": 4.276094159262368e-07,
"logits/chosen": -3.543980360031128,
"logits/rejected": -3.700058698654175,
"logps/chosen": -0.7008415460586548,
"logps/rejected": -1.2009141445159912,
"loss": 0.1348,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.08333039283752441,
"rewards/rejected": -0.4378291666507721,
"step": 1130
},
{
"epoch": 0.596780526109148,
"grad_norm": 8.760901934055333,
"learning_rate": 4.1858043780510135e-07,
"logits/chosen": -3.62018084526062,
"logits/rejected": -3.911879062652588,
"logps/chosen": -0.5855687260627747,
"logps/rejected": -1.1562426090240479,
"loss": 0.205,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.10116372257471085,
"rewards/rejected": -0.6775528192520142,
"step": 1140
},
{
"epoch": 0.6020154430048423,
"grad_norm": 64.65530781329107,
"learning_rate": 4.0957865315634204e-07,
"logits/chosen": -3.5645194053649902,
"logits/rejected": -3.8217787742614746,
"logps/chosen": -0.641860842704773,
"logps/rejected": -1.2221088409423828,
"loss": 0.2331,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.08705325424671173,
"rewards/rejected": -0.46784210205078125,
"step": 1150
},
{
"epoch": 0.6072503599005366,
"grad_norm": 3.8658549598616143,
"learning_rate": 4.006070685029075e-07,
"logits/chosen": -3.679039716720581,
"logits/rejected": -3.946254253387451,
"logps/chosen": -0.6737911105155945,
"logps/rejected": -1.2060964107513428,
"loss": 0.259,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.1347390115261078,
"rewards/rejected": -0.5623631477355957,
"step": 1160
},
{
"epoch": 0.6124852767962309,
"grad_norm": 0.4610785350786279,
"learning_rate": 3.916686802811927e-07,
"logits/chosen": -3.583909511566162,
"logits/rejected": -3.8461241722106934,
"logps/chosen": -0.6507914662361145,
"logps/rejected": -1.2234233617782593,
"loss": 0.1494,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.09081225097179413,
"rewards/rejected": -0.700011670589447,
"step": 1170
},
{
"epoch": 0.6177201936919251,
"grad_norm": 1.6721740201035047,
"learning_rate": 3.8276647384025467e-07,
"logits/chosen": -3.608611583709717,
"logits/rejected": -3.8778247833251953,
"logps/chosen": -0.6140819191932678,
"logps/rejected": -1.1463892459869385,
"loss": 0.417,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.09192916750907898,
"rewards/rejected": -0.569677472114563,
"step": 1180
},
{
"epoch": 0.6229551105876194,
"grad_norm": 34.92800950922805,
"learning_rate": 3.7390342244472883e-07,
"logits/chosen": -3.686276912689209,
"logits/rejected": -3.9486172199249268,
"logps/chosen": -0.6567327976226807,
"logps/rejected": -1.2959892749786377,
"loss": 0.1887,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.13273802399635315,
"rewards/rejected": -0.6054214239120483,
"step": 1190
},
{
"epoch": 0.6281900274833137,
"grad_norm": 28.837778937392045,
"learning_rate": 3.6508248628178446e-07,
"logits/chosen": -3.635249376296997,
"logits/rejected": -3.995410203933716,
"logps/chosen": -0.6689791679382324,
"logps/rejected": -1.2368990182876587,
"loss": 0.1684,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.11262966692447662,
"rewards/rejected": -0.5506707429885864,
"step": 1200
},
{
"epoch": 0.633424944379008,
"grad_norm": 6.4862027458335465,
"learning_rate": 3.563066114724441e-07,
"logits/chosen": -3.7043285369873047,
"logits/rejected": -3.9376754760742188,
"logps/chosen": -0.6666765213012695,
"logps/rejected": -1.1883481740951538,
"loss": 0.1755,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.24648375809192657,
"rewards/rejected": -0.5887495279312134,
"step": 1210
},
{
"epoch": 0.6386598612747023,
"grad_norm": 29.832662347559868,
"learning_rate": 3.475787290876055e-07,
"logits/chosen": -3.6531460285186768,
"logits/rejected": -3.931300640106201,
"logps/chosen": -0.7262079119682312,
"logps/rejected": -1.4471489191055298,
"loss": 0.4461,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.13697785139083862,
"rewards/rejected": -0.5294802188873291,
"step": 1220
},
{
"epoch": 0.6438947781703965,
"grad_norm": 8.313230076052804,
"learning_rate": 3.389017541690854e-07,
"logits/chosen": -3.6925830841064453,
"logits/rejected": -3.8890395164489746,
"logps/chosen": -0.6669970154762268,
"logps/rejected": -1.1209336519241333,
"loss": 0.1875,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.11731680482625961,
"rewards/rejected": -0.4473400115966797,
"step": 1230
},
{
"epoch": 0.6491296950660909,
"grad_norm": 6.599402039349072,
"learning_rate": 3.30278584756021e-07,
"logits/chosen": -3.7490150928497314,
"logits/rejected": -3.9958243370056152,
"logps/chosen": -0.6234461069107056,
"logps/rejected": -1.2914457321166992,
"loss": 0.2222,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.19926027953624725,
"rewards/rejected": -0.7430733442306519,
"step": 1240
},
{
"epoch": 0.6543646119617851,
"grad_norm": 26.42550655780738,
"learning_rate": 3.2171210091694735e-07,
"logits/chosen": -3.5512046813964844,
"logits/rejected": -3.8761677742004395,
"logps/chosen": -0.6285715699195862,
"logps/rejected": -1.1303393840789795,
"loss": 0.4599,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.10718154907226562,
"rewards/rejected": -0.5068109035491943,
"step": 1250
},
{
"epoch": 0.6595995288574794,
"grad_norm": 43.52112297389642,
"learning_rate": 3.132051637878789e-07,
"logits/chosen": -3.754105806350708,
"logits/rejected": -3.9892616271972656,
"logps/chosen": -0.604164183139801,
"logps/rejected": -1.2354532480239868,
"loss": 0.3654,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.1477893888950348,
"rewards/rejected": -0.6003723740577698,
"step": 1260
},
{
"epoch": 0.6648344457531736,
"grad_norm": 2.249013697408821,
"learning_rate": 3.0476061461671155e-07,
"logits/chosen": -3.6410465240478516,
"logits/rejected": -3.9180960655212402,
"logps/chosen": -0.6587497591972351,
"logps/rejected": -1.34711754322052,
"loss": 0.258,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.13485677540302277,
"rewards/rejected": -0.5674414038658142,
"step": 1270
},
{
"epoch": 0.670069362648868,
"grad_norm": 4.4535505774168636,
"learning_rate": 2.9638127381427127e-07,
"logits/chosen": -3.6331870555877686,
"logits/rejected": -3.924232006072998,
"logps/chosen": -0.6530889272689819,
"logps/rejected": -1.2471270561218262,
"loss": 0.1464,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.13776201009750366,
"rewards/rejected": -0.5627979040145874,
"step": 1280
},
{
"epoch": 0.6753042795445622,
"grad_norm": 13.03175390211842,
"learning_rate": 2.8806994001231766e-07,
"logits/chosen": -3.5974411964416504,
"logits/rejected": -3.770061492919922,
"logps/chosen": -0.57194584608078,
"logps/rejected": -1.1356195211410522,
"loss": 0.2263,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.21213237941265106,
"rewards/rejected": -0.47332197427749634,
"step": 1290
},
{
"epoch": 0.6805391964402565,
"grad_norm": 9.542565711403059,
"learning_rate": 2.7982938912882544e-07,
"logits/chosen": -3.5941874980926514,
"logits/rejected": -3.9326794147491455,
"logps/chosen": -0.6405996084213257,
"logps/rejected": -1.4193586111068726,
"loss": 0.1943,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1434379667043686,
"rewards/rejected": -0.8874284625053406,
"step": 1300
},
{
"epoch": 0.6857741133359508,
"grad_norm": 20.899845143040352,
"learning_rate": 2.716623734408488e-07,
"logits/chosen": -3.7071640491485596,
"logits/rejected": -3.954035997390747,
"logps/chosen": -0.7084048390388489,
"logps/rejected": -1.408484697341919,
"loss": 0.16,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.11447383463382721,
"rewards/rejected": -0.5277458429336548,
"step": 1310
},
{
"epoch": 0.6910090302316451,
"grad_norm": 26.585542302796576,
"learning_rate": 2.635716206652843e-07,
"logits/chosen": -3.568807601928711,
"logits/rejected": -3.851666212081909,
"logps/chosen": -0.6219191551208496,
"logps/rejected": -1.185127854347229,
"loss": 0.2097,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.16521432995796204,
"rewards/rejected": -0.8127703666687012,
"step": 1320
},
{
"epoch": 0.6962439471273394,
"grad_norm": 0.9910292983788035,
"learning_rate": 2.5555983304783515e-07,
"logits/chosen": -3.706960678100586,
"logits/rejected": -3.9914677143096924,
"logps/chosen": -0.6214891076087952,
"logps/rejected": -1.2848238945007324,
"loss": 0.2059,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.09211207926273346,
"rewards/rejected": -0.45670217275619507,
"step": 1330
},
{
"epoch": 0.7014788640230336,
"grad_norm": 19.940007489482195,
"learning_rate": 2.4762968646048356e-07,
"logits/chosen": -3.6894028186798096,
"logits/rejected": -4.012866497039795,
"logps/chosen": -0.6447241902351379,
"logps/rejected": -1.2848459482192993,
"loss": 0.3272,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.10874289274215698,
"rewards/rejected": -0.6314564943313599,
"step": 1340
},
{
"epoch": 0.7067137809187279,
"grad_norm": 1.6579625246627596,
"learning_rate": 2.397838295077703e-07,
"logits/chosen": -3.513641357421875,
"logits/rejected": -3.8426365852355957,
"logps/chosen": -0.6752243041992188,
"logps/rejected": -1.2607519626617432,
"loss": 0.1058,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.1565995216369629,
"rewards/rejected": -0.6347146034240723,
"step": 1350
},
{
"epoch": 0.7119486978144222,
"grad_norm": 5.659662645549925,
"learning_rate": 2.3202488264218357e-07,
"logits/chosen": -3.4555447101593018,
"logits/rejected": -3.8530869483947754,
"logps/chosen": -0.697325587272644,
"logps/rejected": -1.2927258014678955,
"loss": 0.0785,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.08387357741594315,
"rewards/rejected": -0.39989030361175537,
"step": 1360
},
{
"epoch": 0.7171836147101165,
"grad_norm": 4.657928400009495,
"learning_rate": 2.243554372889479e-07,
"logits/chosen": -3.5664660930633545,
"logits/rejected": -3.876011371612549,
"logps/chosen": -0.705878734588623,
"logps/rejected": -1.364867925643921,
"loss": 0.1093,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.12067972123622894,
"rewards/rejected": -0.6570446491241455,
"step": 1370
},
{
"epoch": 0.7224185316058107,
"grad_norm": 1.3444622420831092,
"learning_rate": 2.1677805498050998e-07,
"logits/chosen": -3.3227603435516357,
"logits/rejected": -3.743194580078125,
"logps/chosen": -0.6672931909561157,
"logps/rejected": -1.1365772485733032,
"loss": 0.1243,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.09715026617050171,
"rewards/rejected": -0.43300461769104004,
"step": 1380
},
{
"epoch": 0.727653448501505,
"grad_norm": 8.614820635695539,
"learning_rate": 2.0929526650100716e-07,
"logits/chosen": -3.5229296684265137,
"logits/rejected": -3.9142937660217285,
"logps/chosen": -0.6047448515892029,
"logps/rejected": -1.2415525913238525,
"loss": 0.2105,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.23621472716331482,
"rewards/rejected": -0.8043211698532104,
"step": 1390
},
{
"epoch": 0.7328883653971994,
"grad_norm": 5.179700896632157,
"learning_rate": 2.0190957104100692e-07,
"logits/chosen": -3.4930293560028076,
"logits/rejected": -3.749809741973877,
"logps/chosen": -0.6624347567558289,
"logps/rejected": -1.1844425201416016,
"loss": 0.1966,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.1073043942451477,
"rewards/rejected": -0.4727447032928467,
"step": 1400
},
{
"epoch": 0.7381232822928936,
"grad_norm": 21.802519076308094,
"learning_rate": 1.9462343536279612e-07,
"logits/chosen": -3.6438193321228027,
"logits/rejected": -4.09710168838501,
"logps/chosen": -0.6798110604286194,
"logps/rejected": -1.2490794658660889,
"loss": 0.18,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.11491537094116211,
"rewards/rejected": -0.5919861197471619,
"step": 1410
},
{
"epoch": 0.7433581991885879,
"grad_norm": 8.921153921786187,
"learning_rate": 1.874392929765044e-07,
"logits/chosen": -3.7223763465881348,
"logits/rejected": -4.110812187194824,
"logps/chosen": -0.6291832327842712,
"logps/rejected": -1.2136573791503906,
"loss": 0.1329,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.10531127452850342,
"rewards/rejected": -0.5478615164756775,
"step": 1420
},
{
"epoch": 0.7485931160842821,
"grad_norm": 69.71187759138483,
"learning_rate": 1.8035954332732889e-07,
"logits/chosen": -3.6256279945373535,
"logits/rejected": -3.9363632202148438,
"logps/chosen": -0.6302305459976196,
"logps/rejected": -1.1628683805465698,
"loss": 0.1542,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.1714845895767212,
"rewards/rejected": -0.623928964138031,
"step": 1430
},
{
"epoch": 0.7538280329799765,
"grad_norm": 20.431726665972217,
"learning_rate": 1.733865509941419e-07,
"logits/chosen": -3.574036121368408,
"logits/rejected": -3.994558811187744,
"logps/chosen": -0.6578459143638611,
"logps/rejected": -1.2709509134292603,
"loss": 0.1268,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.12474487721920013,
"rewards/rejected": -0.477055162191391,
"step": 1440
},
{
"epoch": 0.7590629498756707,
"grad_norm": 9.763384229327023,
"learning_rate": 1.6652264489973861e-07,
"logits/chosen": -3.586714506149292,
"logits/rejected": -3.934051990509033,
"logps/chosen": -0.672654390335083,
"logps/rejected": -1.4327255487442017,
"loss": 0.1755,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.12246622890233994,
"rewards/rejected": -0.6903547644615173,
"step": 1450
},
{
"epoch": 0.764297866771365,
"grad_norm": 5.3595001978781704,
"learning_rate": 1.5977011753299724e-07,
"logits/chosen": -3.651690721511841,
"logits/rejected": -3.986185073852539,
"logps/chosen": -0.6952771544456482,
"logps/rejected": -1.2006165981292725,
"loss": 0.2141,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.23796899616718292,
"rewards/rejected": -0.6073740124702454,
"step": 1460
},
{
"epoch": 0.7695327836670592,
"grad_norm": 1.6432168817434278,
"learning_rate": 1.5313122418320496e-07,
"logits/chosen": -3.5539729595184326,
"logits/rejected": -3.897873640060425,
"logps/chosen": -0.632127583026886,
"logps/rejected": -1.2387049198150635,
"loss": 0.1406,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.10146383196115494,
"rewards/rejected": -0.5321124196052551,
"step": 1470
},
{
"epoch": 0.7747677005627536,
"grad_norm": 1.3794858766661577,
"learning_rate": 1.4660818218681125e-07,
"logits/chosen": -3.5363082885742188,
"logits/rejected": -3.8142707347869873,
"logps/chosen": -0.7643290758132935,
"logps/rejected": -1.4336775541305542,
"loss": 0.1708,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.08295364677906036,
"rewards/rejected": -0.3842242360115051,
"step": 1480
},
{
"epoch": 0.7800026174584479,
"grad_norm": 14.698867702350263,
"learning_rate": 1.4020317018685362e-07,
"logits/chosen": -3.398146152496338,
"logits/rejected": -3.777660369873047,
"logps/chosen": -0.8031834363937378,
"logps/rejected": -1.415450096130371,
"loss": 0.1705,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.08375723659992218,
"rewards/rejected": -0.3890025019645691,
"step": 1490
},
{
"epoch": 0.7852375343541421,
"grad_norm": 12.210364521217109,
"learning_rate": 1.3391832740531055e-07,
"logits/chosen": -3.4719395637512207,
"logits/rejected": -3.8840765953063965,
"logps/chosen": -0.67162024974823,
"logps/rejected": -1.2897964715957642,
"loss": 0.0969,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.15899525582790375,
"rewards/rejected": -0.5975922346115112,
"step": 1500
},
{
"epoch": 0.7904724512498364,
"grad_norm": 23.473420770496503,
"learning_rate": 1.2775575292861707e-07,
"logits/chosen": -3.528533458709717,
"logits/rejected": -3.907036304473877,
"logps/chosen": -0.5467859506607056,
"logps/rejected": -1.1106278896331787,
"loss": 0.1681,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.11632678657770157,
"rewards/rejected": -0.560685396194458,
"step": 1510
},
{
"epoch": 0.7957073681455307,
"grad_norm": 4.542700336855168,
"learning_rate": 1.21717505006588e-07,
"logits/chosen": -3.7147388458251953,
"logits/rejected": -3.999300003051758,
"logps/chosen": -0.7240277528762817,
"logps/rejected": -1.3829585313796997,
"loss": 0.1492,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.16688640415668488,
"rewards/rejected": -0.7482727766036987,
"step": 1520
},
{
"epoch": 0.800942285041225,
"grad_norm": 2.86910989170756,
"learning_rate": 1.1580560036497877e-07,
"logits/chosen": -3.5072569847106934,
"logits/rejected": -3.800105571746826,
"logps/chosen": -0.6719281077384949,
"logps/rejected": -1.196406602859497,
"loss": 0.2545,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.10692320019006729,
"rewards/rejected": -0.5672041177749634,
"step": 1530
},
{
"epoch": 0.8061772019369192,
"grad_norm": 0.7212916751703865,
"learning_rate": 1.1002201353191521e-07,
"logits/chosen": -3.5515499114990234,
"logits/rejected": -3.918053150177002,
"logps/chosen": -0.5516917109489441,
"logps/rejected": -1.1447056531906128,
"loss": 0.2219,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.17747844755649567,
"rewards/rejected": -0.8485193252563477,
"step": 1540
},
{
"epoch": 0.8114121188326135,
"grad_norm": 3.9789628772583874,
"learning_rate": 1.0436867617841766e-07,
"logits/chosen": -3.523468017578125,
"logits/rejected": -4.002907752990723,
"logps/chosen": -0.6106997728347778,
"logps/rejected": -1.2265563011169434,
"loss": 0.1674,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.1898810714483261,
"rewards/rejected": -0.8234789967536926,
"step": 1550
},
{
"epoch": 0.8166470357283078,
"grad_norm": 23.019959616553912,
"learning_rate": 9.884747647323854e-08,
"logits/chosen": -3.5710349082946777,
"logits/rejected": -3.8469862937927246,
"logps/chosen": -0.6847441792488098,
"logps/rejected": -1.2800266742706299,
"loss": 0.1525,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.07254868000745773,
"rewards/rejected": -0.3605559766292572,
"step": 1560
},
{
"epoch": 0.8218819526240021,
"grad_norm": 1.9159672672104486,
"learning_rate": 9.346025845222871e-08,
"logits/chosen": -3.5710854530334473,
"logits/rejected": -3.9252638816833496,
"logps/chosen": -0.5996052026748657,
"logps/rejected": -1.185727834701538,
"loss": 0.0693,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.09702242910861969,
"rewards/rejected": -0.43671149015426636,
"step": 1570
},
{
"epoch": 0.8271168695196964,
"grad_norm": 6.0432637545944985,
"learning_rate": 8.82088214024454e-08,
"logits/chosen": -3.4795615673065186,
"logits/rejected": -3.900925397872925,
"logps/chosen": -0.6539788842201233,
"logps/rejected": -1.3062019348144531,
"loss": 0.0777,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.1087212786078453,
"rewards/rejected": -0.6267635226249695,
"step": 1580
},
{
"epoch": 0.8323517864153906,
"grad_norm": 0.8065986090798118,
"learning_rate": 8.309491926120393e-08,
"logits/chosen": -3.445683002471924,
"logits/rejected": -3.8484814167022705,
"logps/chosen": -0.6346088647842407,
"logps/rejected": -1.2594006061553955,
"loss": 0.1251,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.11368497461080551,
"rewards/rejected": -0.4606091380119324,
"step": 1590
},
{
"epoch": 0.837586703311085,
"grad_norm": 1.631423358922648,
"learning_rate": 7.812026003027771e-08,
"logits/chosen": -3.4795145988464355,
"logits/rejected": -3.843942165374756,
"logps/chosen": -0.6449909210205078,
"logps/rejected": -1.2673732042312622,
"loss": 0.1368,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.11242115497589111,
"rewards/rejected": -0.6334723234176636,
"step": 1600
},
{
"epoch": 0.8428216202067792,
"grad_norm": 28.643433953668108,
"learning_rate": 7.328650520543906e-08,
"logits/chosen": -3.5898594856262207,
"logits/rejected": -3.8868191242218018,
"logps/chosen": -0.6432119011878967,
"logps/rejected": -1.2170101404190063,
"loss": 0.24,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.14053374528884888,
"rewards/rejected": -0.6827019453048706,
"step": 1610
},
{
"epoch": 0.8480565371024735,
"grad_norm": 6.576620973204435,
"learning_rate": 6.859526922153352e-08,
"logits/chosen": -3.576659679412842,
"logits/rejected": -3.9092178344726562,
"logps/chosen": -0.5998526811599731,
"logps/rejected": -1.1230926513671875,
"loss": 0.1143,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.0742657333612442,
"rewards/rejected": -0.3632586598396301,
"step": 1620
},
{
"epoch": 0.8532914539981677,
"grad_norm": 19.103957999815783,
"learning_rate": 6.40481189132711e-08,
"logits/chosen": -3.498664379119873,
"logits/rejected": -3.8451638221740723,
"logps/chosen": -0.6496328711509705,
"logps/rejected": -1.1756832599639893,
"loss": 0.148,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.12481925636529922,
"rewards/rejected": -0.3642726540565491,
"step": 1630
},
{
"epoch": 0.8585263708938621,
"grad_norm": 2.479610731568999,
"learning_rate": 5.964657299191711e-08,
"logits/chosen": -3.6090331077575684,
"logits/rejected": -3.8709425926208496,
"logps/chosen": -0.7074568867683411,
"logps/rejected": -1.3075045347213745,
"loss": 0.1935,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.0973527580499649,
"rewards/rejected": -0.4661819338798523,
"step": 1640
},
{
"epoch": 0.8637612877895563,
"grad_norm": 24.31365663092268,
"learning_rate": 5.53921015380539e-08,
"logits/chosen": -3.368114948272705,
"logits/rejected": -3.887836456298828,
"logps/chosen": -0.6916152238845825,
"logps/rejected": -1.3344032764434814,
"loss": 0.2021,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.13118405640125275,
"rewards/rejected": -0.5621457695960999,
"step": 1650
},
{
"epoch": 0.8689962046852506,
"grad_norm": 18.53418978223593,
"learning_rate": 5.1286125510586805e-08,
"logits/chosen": -3.587425708770752,
"logits/rejected": -3.909331798553467,
"logps/chosen": -0.6051632165908813,
"logps/rejected": -1.1239159107208252,
"loss": 0.2629,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.08930396288633347,
"rewards/rejected": -0.4709382951259613,
"step": 1660
},
{
"epoch": 0.8742311215809448,
"grad_norm": 4.50359805992076,
"learning_rate": 4.733001627215466e-08,
"logits/chosen": -3.5434436798095703,
"logits/rejected": -3.836855411529541,
"logps/chosen": -0.6643694043159485,
"logps/rejected": -1.2192775011062622,
"loss": 0.3901,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.17054535448551178,
"rewards/rejected": -0.6519421339035034,
"step": 1670
},
{
"epoch": 0.8794660384766392,
"grad_norm": 34.178405196863906,
"learning_rate": 4.352509513110658e-08,
"logits/chosen": -3.487307071685791,
"logits/rejected": -3.8664581775665283,
"logps/chosen": -0.6555562615394592,
"logps/rejected": -1.2320266962051392,
"loss": 0.2195,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.17077895998954773,
"rewards/rejected": -0.64589923620224,
"step": 1680
},
{
"epoch": 0.8847009553723335,
"grad_norm": 2.86389693559937,
"learning_rate": 3.9872632900194936e-08,
"logits/chosen": -3.507810592651367,
"logits/rejected": -3.888345241546631,
"logps/chosen": -0.7281653881072998,
"logps/rejected": -1.3483150005340576,
"loss": 0.1462,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.13594810664653778,
"rewards/rejected": -0.5338603258132935,
"step": 1690
},
{
"epoch": 0.8899358722680277,
"grad_norm": 10.406439092961616,
"learning_rate": 3.6373849472134954e-08,
"logits/chosen": -3.631108045578003,
"logits/rejected": -3.8256747722625732,
"logps/chosen": -0.6505134701728821,
"logps/rejected": -1.196921706199646,
"loss": 0.0768,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.17471420764923096,
"rewards/rejected": -0.6126881241798401,
"step": 1700
},
{
"epoch": 0.8951707891637221,
"grad_norm": 15.532461573765454,
"learning_rate": 3.302991341216976e-08,
"logits/chosen": -3.6967296600341797,
"logits/rejected": -4.029541015625,
"logps/chosen": -0.6618956327438354,
"logps/rejected": -1.3089344501495361,
"loss": 0.1687,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.14995837211608887,
"rewards/rejected": -0.5820103883743286,
"step": 1710
},
{
"epoch": 0.9004057060594163,
"grad_norm": 5.162585825584226,
"learning_rate": 2.9841941567779474e-08,
"logits/chosen": -3.644044876098633,
"logits/rejected": -3.9487557411193848,
"logps/chosen": -0.7292143106460571,
"logps/rejected": -1.3466829061508179,
"loss": 0.1542,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.11079509556293488,
"rewards/rejected": -0.4718669056892395,
"step": 1720
},
{
"epoch": 0.9056406229551106,
"grad_norm": 6.250911119837395,
"learning_rate": 2.681099869566328e-08,
"logits/chosen": -3.541680097579956,
"logits/rejected": -3.892336368560791,
"logps/chosen": -0.6768237948417664,
"logps/rejected": -1.1760004758834839,
"loss": 0.3289,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.10029733180999756,
"rewards/rejected": -0.7108488082885742,
"step": 1730
},
{
"epoch": 0.9108755398508048,
"grad_norm": 12.552976379248005,
"learning_rate": 2.3938097106119216e-08,
"logits/chosen": -3.7278189659118652,
"logits/rejected": -3.972843647003174,
"logps/chosen": -0.6094867587089539,
"logps/rejected": -1.1807337999343872,
"loss": 0.1072,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.15854167938232422,
"rewards/rejected": -0.5542012453079224,
"step": 1740
},
{
"epoch": 0.9161104567464992,
"grad_norm": 2.1007604039088514,
"learning_rate": 2.12241963249406e-08,
"logits/chosen": -3.621340274810791,
"logits/rejected": -4.070878505706787,
"logps/chosen": -0.6373471617698669,
"logps/rejected": -1.2419856786727905,
"loss": 0.1475,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.14094902575016022,
"rewards/rejected": -0.5264729857444763,
"step": 1750
},
{
"epoch": 0.9213453736421934,
"grad_norm": 12.311524530866398,
"learning_rate": 1.8670202772942568e-08,
"logits/chosen": -3.631922483444214,
"logits/rejected": -3.9891021251678467,
"logps/chosen": -0.6251589059829712,
"logps/rejected": -1.3054759502410889,
"loss": 0.2536,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.19476152956485748,
"rewards/rejected": -0.747911810874939,
"step": 1760
},
{
"epoch": 0.9265802905378877,
"grad_norm": 6.827588288274221,
"learning_rate": 1.6276969463224545e-08,
"logits/chosen": -3.5727906227111816,
"logits/rejected": -3.9334945678710938,
"logps/chosen": -0.7015948295593262,
"logps/rejected": -1.320188045501709,
"loss": 0.1134,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.09665700793266296,
"rewards/rejected": -0.5031177401542664,
"step": 1770
},
{
"epoch": 0.931815207433582,
"grad_norm": 2.07334457771776,
"learning_rate": 1.4045295716271e-08,
"logits/chosen": -3.584230899810791,
"logits/rejected": -4.013778209686279,
"logps/chosen": -0.6425756216049194,
"logps/rejected": -1.4021472930908203,
"loss": 0.1508,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.12355978786945343,
"rewards/rejected": -0.4425305426120758,
"step": 1780
},
{
"epoch": 0.9370501243292763,
"grad_norm": 6.207844735521484,
"learning_rate": 1.1975926892984766e-08,
"logits/chosen": -3.419482707977295,
"logits/rejected": -3.659214496612549,
"logps/chosen": -0.5653207898139954,
"logps/rejected": -1.1019750833511353,
"loss": 0.1604,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1579764187335968,
"rewards/rejected": -0.5906132459640503,
"step": 1790
},
{
"epoch": 0.9422850412249706,
"grad_norm": 9.527007176662295,
"learning_rate": 1.0069554145742787e-08,
"logits/chosen": -3.579073667526245,
"logits/rejected": -3.9208247661590576,
"logps/chosen": -0.6493052840232849,
"logps/rejected": -1.1913435459136963,
"loss": 0.2047,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.10687317699193954,
"rewards/rejected": -0.4368254542350769,
"step": 1800
},
{
"epoch": 0.9475199581206648,
"grad_norm": 31.675698349395965,
"learning_rate": 8.326814187556485e-09,
"logits/chosen": -3.670579433441162,
"logits/rejected": -4.052734375,
"logps/chosen": -0.6743156313896179,
"logps/rejected": -1.2803795337677002,
"loss": 0.2567,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.12054960429668427,
"rewards/rejected": -0.5073380470275879,
"step": 1810
},
{
"epoch": 0.9527548750163591,
"grad_norm": 42.855469273806456,
"learning_rate": 6.7482890794151594e-09,
"logits/chosen": -3.602036237716675,
"logits/rejected": -3.961235761642456,
"logps/chosen": -0.6719382405281067,
"logps/rejected": -1.306571364402771,
"loss": 0.1819,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.09778688848018646,
"rewards/rejected": -0.545978844165802,
"step": 1820
},
{
"epoch": 0.9579897919120534,
"grad_norm": 14.021917338266821,
"learning_rate": 5.334506035882036e-09,
"logits/chosen": -3.5885958671569824,
"logits/rejected": -3.972801685333252,
"logps/chosen": -0.6078780293464661,
"logps/rejected": -1.2133899927139282,
"loss": 0.2865,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.18109995126724243,
"rewards/rejected": -0.8068861961364746,
"step": 1830
},
{
"epoch": 0.9632247088077477,
"grad_norm": 6.0298508108639925,
"learning_rate": 4.0859372490090194e-09,
"logits/chosen": -3.607355833053589,
"logits/rejected": -3.94720196723938,
"logps/chosen": -0.5455012917518616,
"logps/rejected": -1.1150436401367188,
"loss": 0.3449,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.1432439386844635,
"rewards/rejected": -0.6401658058166504,
"step": 1840
},
{
"epoch": 0.9684596257034419,
"grad_norm": 4.5937644631506105,
"learning_rate": 3.0029997306283416e-09,
"logits/chosen": -3.550992250442505,
"logits/rejected": -3.908057451248169,
"logps/chosen": -0.5658137798309326,
"logps/rejected": -1.1313598155975342,
"loss": 0.1792,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.10125686228275299,
"rewards/rejected": -0.37562742829322815,
"step": 1850
},
{
"epoch": 0.9736945425991362,
"grad_norm": 0.9472001507121848,
"learning_rate": 2.0860551730742526e-09,
"logits/chosen": -3.4225573539733887,
"logits/rejected": -3.754948377609253,
"logps/chosen": -0.6793702244758606,
"logps/rejected": -1.271066427230835,
"loss": 0.075,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.12655304372310638,
"rewards/rejected": -0.5372025966644287,
"step": 1860
},
{
"epoch": 0.9789294594948306,
"grad_norm": 28.725611670468133,
"learning_rate": 1.3354098283802628e-09,
"logits/chosen": -3.5682125091552734,
"logits/rejected": -3.978659152984619,
"logps/chosen": -0.6888954043388367,
"logps/rejected": -1.3216421604156494,
"loss": 0.2941,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.17312012612819672,
"rewards/rejected": -0.5689764022827148,
"step": 1870
},
{
"epoch": 0.9841643763905248,
"grad_norm": 6.5557009909155335,
"learning_rate": 7.513144059937415e-10,
"logits/chosen": -3.502890110015869,
"logits/rejected": -3.8684380054473877,
"logps/chosen": -0.6373413801193237,
"logps/rejected": -1.253266453742981,
"loss": 0.1433,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.14787249267101288,
"rewards/rejected": -0.5477157831192017,
"step": 1880
},
{
"epoch": 0.9893992932862191,
"grad_norm": 7.693007289411357,
"learning_rate": 3.3396398904106393e-10,
"logits/chosen": -3.721400737762451,
"logits/rejected": -4.00911808013916,
"logps/chosen": -0.5624986886978149,
"logps/rejected": -1.1344749927520752,
"loss": 0.3553,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.2794850170612335,
"rewards/rejected": -0.8449773788452148,
"step": 1890
},
{
"epoch": 0.9946342101819133,
"grad_norm": 10.230098634327602,
"learning_rate": 8.349796917112018e-11,
"logits/chosen": -3.5617072582244873,
"logits/rejected": -3.961862087249756,
"logps/chosen": -0.6174992322921753,
"logps/rejected": -1.282220482826233,
"loss": 0.2374,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.14698410034179688,
"rewards/rejected": -0.745864987373352,
"step": 1900
},
{
"epoch": 0.9998691270776077,
"grad_norm": 19.58846312067376,
"learning_rate": 0.0,
"logits/chosen": -3.5836310386657715,
"logits/rejected": -3.961308717727661,
"logps/chosen": -0.7217192649841309,
"logps/rejected": -1.424443006515503,
"loss": 0.3428,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.1093846932053566,
"rewards/rejected": -0.5250765681266785,
"step": 1910
},
{
"epoch": 0.9998691270776077,
"step": 1910,
"total_flos": 167763918716928.0,
"train_loss": 0.3737170026252407,
"train_runtime": 21184.4179,
"train_samples_per_second": 2.886,
"train_steps_per_second": 0.09
}
],
"logging_steps": 10,
"max_steps": 1910,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 800,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 167763918716928.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}