Files
tinyllama-1.1b-dpo-pku-safe…/trainer_state.json
ModelHub XC edd7af9b86 初始化项目,由ModelHub XC社区提供模型
Model: AIPlans/tinyllama-1.1b-dpo-pku-saferlhf_2
Source: Original Platform
2026-06-11 03:16:17 +08:00

3323 lines
112 KiB
JSON

{
"best_metric": 0.800000011920929,
"best_model_checkpoint": "./outputs/tinyllama-1.1b-dpo-pku-saferlhf/checkpoint-1400",
"epoch": 0.9997600191984641,
"eval_steps": 200,
"global_step": 2083,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004799616030717543,
"grad_norm": 56.5,
"learning_rate": 2.3923444976076555e-07,
"logits/chosen": -2.689218282699585,
"logits/rejected": -2.554370880126953,
"logps/chosen": -212.5878143310547,
"logps/rejected": -186.63473510742188,
"loss": 0.693,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": 4.648463800549507e-05,
"rewards/margins": 0.0005794697208330035,
"rewards/rejected": -0.0005329854902811348,
"step": 10
},
{
"epoch": 0.009599232061435085,
"grad_norm": 59.75,
"learning_rate": 4.784688995215311e-07,
"logits/chosen": -2.7294280529022217,
"logits/rejected": -2.6172096729278564,
"logps/chosen": -223.5776824951172,
"logps/rejected": -203.3684539794922,
"loss": 0.6931,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.0015927983913570642,
"rewards/margins": 0.0005491179181262851,
"rewards/rejected": 0.0010436807060614228,
"step": 20
},
{
"epoch": 0.014398848092152628,
"grad_norm": 50.25,
"learning_rate": 7.177033492822967e-07,
"logits/chosen": -2.7166686058044434,
"logits/rejected": -2.6399471759796143,
"logps/chosen": -238.0247344970703,
"logps/rejected": -219.1692352294922,
"loss": 0.6949,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.003216738346964121,
"rewards/margins": -0.0030289709102362394,
"rewards/rejected": -0.0001877670583780855,
"step": 30
},
{
"epoch": 0.01919846412287017,
"grad_norm": 57.0,
"learning_rate": 9.569377990430622e-07,
"logits/chosen": -2.7442615032196045,
"logits/rejected": -2.5918688774108887,
"logps/chosen": -251.1165771484375,
"logps/rejected": -196.37649536132812,
"loss": 0.6883,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.00424658227711916,
"rewards/margins": 0.010354852303862572,
"rewards/rejected": -0.006108270026743412,
"step": 40
},
{
"epoch": 0.023998080153587713,
"grad_norm": 47.25,
"learning_rate": 1.196172248803828e-06,
"logits/chosen": -2.6663458347320557,
"logits/rejected": -2.604515552520752,
"logps/chosen": -234.8127899169922,
"logps/rejected": -199.76278686523438,
"loss": 0.6904,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.0006293629994615912,
"rewards/margins": 0.006024296395480633,
"rewards/rejected": -0.00539493327960372,
"step": 50
},
{
"epoch": 0.028797696184305256,
"grad_norm": 65.0,
"learning_rate": 1.4354066985645934e-06,
"logits/chosen": -2.7063660621643066,
"logits/rejected": -2.6000542640686035,
"logps/chosen": -223.295166015625,
"logps/rejected": -215.13656616210938,
"loss": 0.6887,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.0032431543804705143,
"rewards/margins": 0.009618126787245274,
"rewards/rejected": -0.012861279770731926,
"step": 60
},
{
"epoch": 0.033597312215022795,
"grad_norm": 50.0,
"learning_rate": 1.6746411483253591e-06,
"logits/chosen": -2.727038860321045,
"logits/rejected": -2.585761070251465,
"logps/chosen": -245.62393188476562,
"logps/rejected": -205.93148803710938,
"loss": 0.6877,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.005830951035022736,
"rewards/margins": 0.011942476034164429,
"rewards/rejected": -0.006111525930464268,
"step": 70
},
{
"epoch": 0.03839692824574034,
"grad_norm": 50.25,
"learning_rate": 1.9138755980861244e-06,
"logits/chosen": -2.7241501808166504,
"logits/rejected": -2.6147332191467285,
"logps/chosen": -235.19338989257812,
"logps/rejected": -201.25424194335938,
"loss": 0.6807,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.01283216755837202,
"rewards/margins": 0.026413241401314735,
"rewards/rejected": -0.01358107291162014,
"step": 80
},
{
"epoch": 0.04319654427645788,
"grad_norm": 49.5,
"learning_rate": 2.15311004784689e-06,
"logits/chosen": -2.7387120723724365,
"logits/rejected": -2.5572338104248047,
"logps/chosen": -250.86380004882812,
"logps/rejected": -189.31814575195312,
"loss": 0.6712,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.03785385563969612,
"rewards/margins": 0.04662873595952988,
"rewards/rejected": -0.00877488125115633,
"step": 90
},
{
"epoch": 0.04799616030717543,
"grad_norm": 49.0,
"learning_rate": 2.392344497607656e-06,
"logits/chosen": -2.7061264514923096,
"logits/rejected": -2.5762696266174316,
"logps/chosen": -227.9839630126953,
"logps/rejected": -206.23196411132812,
"loss": 0.6702,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.06363014876842499,
"rewards/margins": 0.05047706514596939,
"rewards/rejected": 0.013153081759810448,
"step": 100
},
{
"epoch": 0.052795776337892966,
"grad_norm": 49.5,
"learning_rate": 2.631578947368421e-06,
"logits/chosen": -2.7290475368499756,
"logits/rejected": -2.548861026763916,
"logps/chosen": -236.59375,
"logps/rejected": -178.90164184570312,
"loss": 0.6619,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.06783770024776459,
"rewards/margins": 0.070295050740242,
"rewards/rejected": -0.0024573481641709805,
"step": 110
},
{
"epoch": 0.05759539236861051,
"grad_norm": 52.0,
"learning_rate": 2.870813397129187e-06,
"logits/chosen": -2.7224419116973877,
"logits/rejected": -2.5972402095794678,
"logps/chosen": -233.290283203125,
"logps/rejected": -204.4892578125,
"loss": 0.6691,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.08400879800319672,
"rewards/margins": 0.05939972400665283,
"rewards/rejected": 0.024609070271253586,
"step": 120
},
{
"epoch": 0.06239500839932805,
"grad_norm": 50.25,
"learning_rate": 3.1100478468899525e-06,
"logits/chosen": -2.7070722579956055,
"logits/rejected": -2.5772013664245605,
"logps/chosen": -250.78231811523438,
"logps/rejected": -209.5062713623047,
"loss": 0.6496,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.165785014629364,
"rewards/margins": 0.10047496855258942,
"rewards/rejected": 0.06531006097793579,
"step": 130
},
{
"epoch": 0.06719462443004559,
"grad_norm": 49.5,
"learning_rate": 3.3492822966507182e-06,
"logits/chosen": -2.6727123260498047,
"logits/rejected": -2.6064090728759766,
"logps/chosen": -233.5751495361328,
"logps/rejected": -217.31576538085938,
"loss": 0.6574,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.20782318711280823,
"rewards/margins": 0.0912146344780922,
"rewards/rejected": 0.11660852283239365,
"step": 140
},
{
"epoch": 0.07199424046076314,
"grad_norm": 45.75,
"learning_rate": 3.5885167464114835e-06,
"logits/chosen": -2.6911652088165283,
"logits/rejected": -2.6086039543151855,
"logps/chosen": -230.4497528076172,
"logps/rejected": -244.7624053955078,
"loss": 0.6583,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.19385293126106262,
"rewards/margins": 0.0921817347407341,
"rewards/rejected": 0.10167120397090912,
"step": 150
},
{
"epoch": 0.07679385649148068,
"grad_norm": 51.5,
"learning_rate": 3.827751196172249e-06,
"logits/chosen": -2.65492844581604,
"logits/rejected": -2.5312721729278564,
"logps/chosen": -241.20199584960938,
"logps/rejected": -200.1884765625,
"loss": 0.6299,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.1793375313282013,
"rewards/margins": 0.1596115678548813,
"rewards/rejected": 0.019725963473320007,
"step": 160
},
{
"epoch": 0.08159347252219823,
"grad_norm": 46.25,
"learning_rate": 4.066985645933015e-06,
"logits/chosen": -2.732844114303589,
"logits/rejected": -2.5666396617889404,
"logps/chosen": -255.0216827392578,
"logps/rejected": -198.9121856689453,
"loss": 0.6366,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.15441159904003143,
"rewards/margins": 0.144814133644104,
"rewards/rejected": 0.009597455151379108,
"step": 170
},
{
"epoch": 0.08639308855291576,
"grad_norm": 43.25,
"learning_rate": 4.30622009569378e-06,
"logits/chosen": -2.7506613731384277,
"logits/rejected": -2.6481566429138184,
"logps/chosen": -240.16055297851562,
"logps/rejected": -200.94822692871094,
"loss": 0.6103,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.23929600417613983,
"rewards/margins": 0.20658496022224426,
"rewards/rejected": 0.032711055129766464,
"step": 180
},
{
"epoch": 0.09119270458363331,
"grad_norm": 50.75,
"learning_rate": 4.5454545454545455e-06,
"logits/chosen": -2.675985097885132,
"logits/rejected": -2.5708577632904053,
"logps/chosen": -238.582763671875,
"logps/rejected": -198.5998992919922,
"loss": 0.6178,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.23324120044708252,
"rewards/margins": 0.195271834731102,
"rewards/rejected": 0.03796938806772232,
"step": 190
},
{
"epoch": 0.09599232061435085,
"grad_norm": 37.25,
"learning_rate": 4.784688995215312e-06,
"logits/chosen": -2.7069146633148193,
"logits/rejected": -2.5851969718933105,
"logps/chosen": -225.49917602539062,
"logps/rejected": -189.277587890625,
"loss": 0.6075,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.3116636574268341,
"rewards/margins": 0.22872300446033478,
"rewards/rejected": 0.08294066041707993,
"step": 200
},
{
"epoch": 0.09599232061435085,
"eval_logits/chosen": -2.6887047290802,
"eval_logits/rejected": -2.5653347969055176,
"eval_logps/chosen": -228.92459106445312,
"eval_logps/rejected": -203.6246337890625,
"eval_loss": 0.6051958799362183,
"eval_rewards/accuracies": 0.7170000076293945,
"eval_rewards/chosen": 0.4208393394947052,
"eval_rewards/margins": 0.23022359609603882,
"eval_rewards/rejected": 0.1906157284975052,
"eval_runtime": 26.0524,
"eval_samples_per_second": 38.384,
"eval_steps_per_second": 9.596,
"step": 200
},
{
"epoch": 0.1007919366450684,
"grad_norm": 45.75,
"learning_rate": 4.999996487062011e-06,
"logits/chosen": -2.645963668823242,
"logits/rejected": -2.5576295852661133,
"logps/chosen": -237.7938995361328,
"logps/rejected": -211.9795684814453,
"loss": 0.6059,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.4848661422729492,
"rewards/margins": 0.2471170723438263,
"rewards/rejected": 0.2377490997314453,
"step": 210
},
{
"epoch": 0.10559155267578593,
"grad_norm": 41.5,
"learning_rate": 4.999574946449064e-06,
"logits/chosen": -2.6820342540740967,
"logits/rejected": -2.550971269607544,
"logps/chosen": -222.2080841064453,
"logps/rejected": -184.314208984375,
"loss": 0.6094,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.38701874017715454,
"rewards/margins": 0.22917309403419495,
"rewards/rejected": 0.1578456610441208,
"step": 220
},
{
"epoch": 0.11039116870650348,
"grad_norm": 41.0,
"learning_rate": 4.9984509539801644e-06,
"logits/chosen": -2.6449620723724365,
"logits/rejected": -2.532531976699829,
"logps/chosen": -227.45291137695312,
"logps/rejected": -221.8866729736328,
"loss": 0.6227,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.3856458067893982,
"rewards/margins": 0.22975265979766846,
"rewards/rejected": 0.15589316189289093,
"step": 230
},
{
"epoch": 0.11519078473722102,
"grad_norm": 48.0,
"learning_rate": 4.996624825529257e-06,
"logits/chosen": -2.7190098762512207,
"logits/rejected": -2.6080780029296875,
"logps/chosen": -212.9981231689453,
"logps/rejected": -191.4314727783203,
"loss": 0.5988,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.36744850873947144,
"rewards/margins": 0.2741442620754242,
"rewards/rejected": 0.09330429136753082,
"step": 240
},
{
"epoch": 0.11999040076793857,
"grad_norm": 47.5,
"learning_rate": 4.994097074290524e-06,
"logits/chosen": -2.6760241985321045,
"logits/rejected": -2.555823564529419,
"logps/chosen": -225.98098754882812,
"logps/rejected": -199.85360717773438,
"loss": 0.6114,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.2855023443698883,
"rewards/margins": 0.23584535717964172,
"rewards/rejected": 0.04965699464082718,
"step": 250
},
{
"epoch": 0.1247900167986561,
"grad_norm": 45.0,
"learning_rate": 4.990868410634163e-06,
"logits/chosen": -2.683492660522461,
"logits/rejected": -2.59763240814209,
"logps/chosen": -222.82955932617188,
"logps/rejected": -192.34671020507812,
"loss": 0.592,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.2924219071865082,
"rewards/margins": 0.2935822010040283,
"rewards/rejected": -0.0011602870654314756,
"step": 260
},
{
"epoch": 0.12958963282937366,
"grad_norm": 42.0,
"learning_rate": 4.9869397419067535e-06,
"logits/chosen": -2.6904869079589844,
"logits/rejected": -2.58687162399292,
"logps/chosen": -210.76589965820312,
"logps/rejected": -191.6869354248047,
"loss": 0.5514,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.34391769766807556,
"rewards/margins": 0.35880047082901,
"rewards/rejected": -0.014882763847708702,
"step": 270
},
{
"epoch": 0.13438924886009118,
"grad_norm": 46.75,
"learning_rate": 4.982312172176264e-06,
"logits/chosen": -2.7495617866516113,
"logits/rejected": -2.549598217010498,
"logps/chosen": -269.2088928222656,
"logps/rejected": -204.53211975097656,
"loss": 0.5697,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.4326564371585846,
"rewards/margins": 0.35469716787338257,
"rewards/rejected": 0.07795925438404083,
"step": 280
},
{
"epoch": 0.13918886489080873,
"grad_norm": 57.0,
"learning_rate": 4.976987001921787e-06,
"logits/chosen": -2.669323444366455,
"logits/rejected": -2.547853708267212,
"logps/chosen": -232.5638885498047,
"logps/rejected": -205.17001342773438,
"loss": 0.5782,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.3145369589328766,
"rewards/margins": 0.34478241205215454,
"rewards/rejected": -0.030245428904891014,
"step": 290
},
{
"epoch": 0.14398848092152627,
"grad_norm": 37.75,
"learning_rate": 4.97096572766805e-06,
"logits/chosen": -2.6843204498291016,
"logits/rejected": -2.5420849323272705,
"logps/chosen": -238.2855987548828,
"logps/rejected": -192.5761260986328,
"loss": 0.548,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.2724495232105255,
"rewards/margins": 0.4326988756656647,
"rewards/rejected": -0.16024938225746155,
"step": 300
},
{
"epoch": 0.14878809695224382,
"grad_norm": 43.75,
"learning_rate": 4.964250041564868e-06,
"logits/chosen": -2.664036989212036,
"logits/rejected": -2.533687114715576,
"logps/chosen": -230.8778076171875,
"logps/rejected": -200.39389038085938,
"loss": 0.5758,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.21766161918640137,
"rewards/margins": 0.3927158713340759,
"rewards/rejected": -0.17505425214767456,
"step": 310
},
{
"epoch": 0.15358771298296137,
"grad_norm": 38.25,
"learning_rate": 4.956841830911588e-06,
"logits/chosen": -2.6427550315856934,
"logits/rejected": -2.5176196098327637,
"logps/chosen": -241.25448608398438,
"logps/rejected": -200.30337524414062,
"loss": 0.5428,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.3211400508880615,
"rewards/margins": 0.465925931930542,
"rewards/rejected": -0.14478585124015808,
"step": 320
},
{
"epoch": 0.1583873290136789,
"grad_norm": 53.5,
"learning_rate": 4.9487431776267095e-06,
"logits/chosen": -2.6581196784973145,
"logits/rejected": -2.5447604656219482,
"logps/chosen": -214.78573608398438,
"logps/rejected": -197.250732421875,
"loss": 0.5574,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.35328906774520874,
"rewards/margins": 0.42138591408729553,
"rewards/rejected": -0.0680968165397644,
"step": 330
},
{
"epoch": 0.16318694504439646,
"grad_norm": 42.0,
"learning_rate": 4.939956357662806e-06,
"logits/chosen": -2.609428882598877,
"logits/rejected": -2.4489550590515137,
"logps/chosen": -230.47982788085938,
"logps/rejected": -180.9293975830078,
"loss": 0.5271,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.2847859263420105,
"rewards/margins": 0.49130839109420776,
"rewards/rejected": -0.20652246475219727,
"step": 340
},
{
"epoch": 0.16798656107511398,
"grad_norm": 42.75,
"learning_rate": 4.9304838403669155e-06,
"logits/chosen": -2.586233139038086,
"logits/rejected": -2.4430768489837646,
"logps/chosen": -250.468017578125,
"logps/rejected": -196.97486877441406,
"loss": 0.535,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.3728100061416626,
"rewards/margins": 0.4706306457519531,
"rewards/rejected": -0.0978206992149353,
"step": 350
},
{
"epoch": 0.17278617710583152,
"grad_norm": 46.5,
"learning_rate": 4.920328287786587e-06,
"logits/chosen": -2.603339910507202,
"logits/rejected": -2.4968552589416504,
"logps/chosen": -227.32632446289062,
"logps/rejected": -196.89515686035156,
"loss": 0.5242,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.26675719022750854,
"rewards/margins": 0.5004665851593018,
"rewards/rejected": -0.23370936512947083,
"step": 360
},
{
"epoch": 0.17758579313654907,
"grad_norm": 44.5,
"learning_rate": 4.909492553921761e-06,
"logits/chosen": -2.6493752002716064,
"logits/rejected": -2.486344337463379,
"logps/chosen": -241.6817626953125,
"logps/rejected": -204.93899536132812,
"loss": 0.5103,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.33704155683517456,
"rewards/margins": 0.5699586868286133,
"rewards/rejected": -0.23291714489459991,
"step": 370
},
{
"epoch": 0.18238540916726662,
"grad_norm": 39.75,
"learning_rate": 4.897979683922728e-06,
"logits/chosen": -2.670883893966675,
"logits/rejected": -2.564422130584717,
"logps/chosen": -215.97842407226562,
"logps/rejected": -182.91036987304688,
"loss": 0.5319,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.2965424954891205,
"rewards/margins": 0.5261348485946655,
"rewards/rejected": -0.22959236800670624,
"step": 380
},
{
"epoch": 0.18718502519798416,
"grad_norm": 38.25,
"learning_rate": 4.885792913234339e-06,
"logits/chosen": -2.603323221206665,
"logits/rejected": -2.5463268756866455,
"logps/chosen": -219.7723388671875,
"logps/rejected": -210.0283203125,
"loss": 0.5334,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.3986433148384094,
"rewards/margins": 0.5653668642044067,
"rewards/rejected": -0.16672348976135254,
"step": 390
},
{
"epoch": 0.1919846412287017,
"grad_norm": 41.0,
"learning_rate": 4.872935666686767e-06,
"logits/chosen": -2.6278882026672363,
"logits/rejected": -2.5184621810913086,
"logps/chosen": -229.7825164794922,
"logps/rejected": -213.7990264892578,
"loss": 0.5198,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.38563206791877747,
"rewards/margins": 0.5534602999687195,
"rewards/rejected": -0.16782823204994202,
"step": 400
},
{
"epoch": 0.1919846412287017,
"eval_logits/chosen": -2.6478958129882812,
"eval_logits/rejected": -2.5310354232788086,
"eval_logps/chosen": -230.05001831054688,
"eval_logps/rejected": -208.3632354736328,
"eval_loss": 0.5145431160926819,
"eval_rewards/accuracies": 0.7850000262260437,
"eval_rewards/chosen": 0.3082956075668335,
"eval_rewards/margins": 0.591540515422821,
"eval_rewards/rejected": -0.28324493765830994,
"eval_runtime": 26.9361,
"eval_samples_per_second": 37.125,
"eval_steps_per_second": 9.281,
"step": 400
},
{
"epoch": 0.19678425725941925,
"grad_norm": 49.25,
"learning_rate": 4.859411557533019e-06,
"logits/chosen": -2.640899181365967,
"logits/rejected": -2.532985210418701,
"logps/chosen": -227.2956085205078,
"logps/rejected": -199.454345703125,
"loss": 0.5372,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.2837000787258148,
"rewards/margins": 0.5689016580581665,
"rewards/rejected": -0.2852015495300293,
"step": 410
},
{
"epoch": 0.2015838732901368,
"grad_norm": 36.25,
"learning_rate": 4.8452243864335216e-06,
"logits/chosen": -2.6203932762145996,
"logits/rejected": -2.546025037765503,
"logps/chosen": -206.45437622070312,
"logps/rejected": -212.8076171875,
"loss": 0.5688,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.15797384083271027,
"rewards/margins": 0.42366623878479004,
"rewards/rejected": -0.2656923830509186,
"step": 420
},
{
"epoch": 0.20638348932085432,
"grad_norm": 34.75,
"learning_rate": 4.830378140388016e-06,
"logits/chosen": -2.726120710372925,
"logits/rejected": -2.5788116455078125,
"logps/chosen": -236.8133544921875,
"logps/rejected": -197.07266235351562,
"loss": 0.5284,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.21513256430625916,
"rewards/margins": 0.62417072057724,
"rewards/rejected": -0.40903815627098083,
"step": 430
},
{
"epoch": 0.21118310535157186,
"grad_norm": 37.5,
"learning_rate": 4.814876991615104e-06,
"logits/chosen": -2.612753391265869,
"logits/rejected": -2.5211071968078613,
"logps/chosen": -228.63931274414062,
"logps/rejected": -203.56838989257812,
"loss": 0.5862,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.15136560797691345,
"rewards/margins": 0.43973368406295776,
"rewards/rejected": -0.5910992622375488,
"step": 440
},
{
"epoch": 0.2159827213822894,
"grad_norm": 53.5,
"learning_rate": 4.798725296379736e-06,
"logits/chosen": -2.6407535076141357,
"logits/rejected": -2.5609383583068848,
"logps/chosen": -220.7452392578125,
"logps/rejected": -198.64051818847656,
"loss": 0.5094,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.08102991431951523,
"rewards/margins": 0.5998750329017639,
"rewards/rejected": -0.5188450813293457,
"step": 450
},
{
"epoch": 0.22078233741300696,
"grad_norm": 44.25,
"learning_rate": 4.781927593768969e-06,
"logits/chosen": -2.6856577396392822,
"logits/rejected": -2.5593128204345703,
"logps/chosen": -232.1822509765625,
"logps/rejected": -205.1845703125,
"loss": 0.5215,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.05589114502072334,
"rewards/margins": 0.6284939050674438,
"rewards/rejected": -0.5726026892662048,
"step": 460
},
{
"epoch": 0.2255819534437245,
"grad_norm": 46.75,
"learning_rate": 4.764488604416365e-06,
"logits/chosen": -2.6717689037323,
"logits/rejected": -2.506671667098999,
"logps/chosen": -255.5167236328125,
"logps/rejected": -221.23788452148438,
"loss": 0.4516,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.2418864220380783,
"rewards/margins": 0.806577205657959,
"rewards/rejected": -0.5646907687187195,
"step": 470
},
{
"epoch": 0.23038156947444205,
"grad_norm": 40.25,
"learning_rate": 4.7464132291753464e-06,
"logits/chosen": -2.613154888153076,
"logits/rejected": -2.5245513916015625,
"logps/chosen": -212.85330200195312,
"logps/rejected": -193.86257934570312,
"loss": 0.5092,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.12177534401416779,
"rewards/margins": 0.6296752691268921,
"rewards/rejected": -0.5078999996185303,
"step": 480
},
{
"epoch": 0.2351811855051596,
"grad_norm": 42.0,
"learning_rate": 4.727706547741924e-06,
"logits/chosen": -2.6002917289733887,
"logits/rejected": -2.469181537628174,
"logps/chosen": -228.36746215820312,
"logps/rejected": -184.94297790527344,
"loss": 0.5446,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.18128438293933868,
"rewards/margins": 0.4943477213382721,
"rewards/rejected": -0.31306329369544983,
"step": 490
},
{
"epoch": 0.23998080153587714,
"grad_norm": 41.75,
"learning_rate": 4.708373817227158e-06,
"logits/chosen": -2.5972156524658203,
"logits/rejected": -2.468087673187256,
"logps/chosen": -240.36544799804688,
"logps/rejected": -206.00051879882812,
"loss": 0.5061,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.16172266006469727,
"rewards/margins": 0.651920735836029,
"rewards/rejected": -0.49019813537597656,
"step": 500
},
{
"epoch": 0.24478041756659466,
"grad_norm": 35.75,
"learning_rate": 4.688420470679754e-06,
"logits/chosen": -2.5911366939544678,
"logits/rejected": -2.4612526893615723,
"logps/chosen": -244.9521942138672,
"logps/rejected": -197.1681365966797,
"loss": 0.4955,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.15817375481128693,
"rewards/margins": 0.7028775215148926,
"rewards/rejected": -0.5447037816047668,
"step": 510
},
{
"epoch": 0.2495800335973122,
"grad_norm": 37.75,
"learning_rate": 4.667852115559227e-06,
"logits/chosen": -2.6258175373077393,
"logits/rejected": -2.492220401763916,
"logps/chosen": -254.71044921875,
"logps/rejected": -222.6087188720703,
"loss": 0.4794,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.30519407987594604,
"rewards/margins": 0.7252100706100464,
"rewards/rejected": -0.4200161099433899,
"step": 520
},
{
"epoch": 0.2543796496280298,
"grad_norm": 38.5,
"learning_rate": 4.646674532160041e-06,
"logits/chosen": -2.657778739929199,
"logits/rejected": -2.5602223873138428,
"logps/chosen": -233.77200317382812,
"logps/rejected": -212.9705810546875,
"loss": 0.5216,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.1333063244819641,
"rewards/margins": 0.635170042514801,
"rewards/rejected": -0.5018636584281921,
"step": 530
},
{
"epoch": 0.2591792656587473,
"grad_norm": 48.75,
"learning_rate": 4.6248936719871855e-06,
"logits/chosen": -2.6264724731445312,
"logits/rejected": -2.5388011932373047,
"logps/chosen": -220.25064086914062,
"logps/rejected": -194.9637451171875,
"loss": 0.4623,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.02713530696928501,
"rewards/margins": 0.8179550170898438,
"rewards/rejected": -0.7908197045326233,
"step": 540
},
{
"epoch": 0.2639788816894648,
"grad_norm": 43.5,
"learning_rate": 4.60251565608363e-06,
"logits/chosen": -2.687124729156494,
"logits/rejected": -2.6088709831237793,
"logps/chosen": -234.5717010498047,
"logps/rejected": -225.5623779296875,
"loss": 0.4995,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.011210719123482704,
"rewards/margins": 0.6683915257453918,
"rewards/rejected": -0.6796022653579712,
"step": 550
},
{
"epoch": 0.26877849772018236,
"grad_norm": 39.75,
"learning_rate": 4.579546773310136e-06,
"logits/chosen": -2.619655132293701,
"logits/rejected": -2.446974754333496,
"logps/chosen": -238.61904907226562,
"logps/rejected": -216.04629516601562,
"loss": 0.4966,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.06748590618371964,
"rewards/margins": 0.7394402027130127,
"rewards/rejected": -0.6719542741775513,
"step": 560
},
{
"epoch": 0.2735781137508999,
"grad_norm": 44.0,
"learning_rate": 4.5559934785779115e-06,
"logits/chosen": -2.667036771774292,
"logits/rejected": -2.469698190689087,
"logps/chosen": -243.6424560546875,
"logps/rejected": -192.75326538085938,
"loss": 0.4458,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.23885676264762878,
"rewards/margins": 0.8359629511833191,
"rewards/rejected": -0.5971060991287231,
"step": 570
},
{
"epoch": 0.27837772978161746,
"grad_norm": 39.75,
"learning_rate": 4.531862391034591e-06,
"logits/chosen": -2.58920955657959,
"logits/rejected": -2.4982354640960693,
"logps/chosen": -232.32406616210938,
"logps/rejected": -205.0082244873047,
"loss": 0.4441,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.24206867814064026,
"rewards/margins": 0.9316803216934204,
"rewards/rejected": -0.689611554145813,
"step": 580
},
{
"epoch": 0.283177345812335,
"grad_norm": 52.25,
"learning_rate": 4.507160292204074e-06,
"logits/chosen": -2.675287961959839,
"logits/rejected": -2.5451769828796387,
"logps/chosen": -237.7034149169922,
"logps/rejected": -209.1581573486328,
"loss": 0.4854,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.012547047808766365,
"rewards/margins": 0.7794401049613953,
"rewards/rejected": -0.7919871807098389,
"step": 590
},
{
"epoch": 0.28797696184305255,
"grad_norm": 45.25,
"learning_rate": 4.481894124080714e-06,
"logits/chosen": -2.6691808700561523,
"logits/rejected": -2.557055950164795,
"logps/chosen": -236.5981903076172,
"logps/rejected": -220.3197021484375,
"loss": 0.4703,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.08622226119041443,
"rewards/margins": 0.7651070952415466,
"rewards/rejected": -0.8513293266296387,
"step": 600
},
{
"epoch": 0.28797696184305255,
"eval_logits/chosen": -2.633901596069336,
"eval_logits/rejected": -2.5215346813201904,
"eval_logps/chosen": -233.1143798828125,
"eval_logps/rejected": -213.52012634277344,
"eval_loss": 0.4838341772556305,
"eval_rewards/accuracies": 0.7940000295639038,
"eval_rewards/chosen": 0.0018612403655424714,
"eval_rewards/margins": 0.8007965683937073,
"eval_rewards/rejected": -0.7989352941513062,
"eval_runtime": 31.8483,
"eval_samples_per_second": 31.399,
"eval_steps_per_second": 7.85,
"step": 600
},
{
"epoch": 0.2927765778737701,
"grad_norm": 37.0,
"learning_rate": 4.456070987178427e-06,
"logits/chosen": -2.636303424835205,
"logits/rejected": -2.475978136062622,
"logps/chosen": -220.087646484375,
"logps/rejected": -184.07369995117188,
"loss": 0.4705,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.108073391020298,
"rewards/margins": 0.7934460639953613,
"rewards/rejected": -0.9015194177627563,
"step": 610
},
{
"epoch": 0.29757619390448764,
"grad_norm": 38.25,
"learning_rate": 4.429698138535242e-06,
"logits/chosen": -2.6021456718444824,
"logits/rejected": -2.510624885559082,
"logps/chosen": -237.7132110595703,
"logps/rejected": -222.8521728515625,
"loss": 0.5149,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.16316601634025574,
"rewards/margins": 0.6969932317733765,
"rewards/rejected": -0.5338272452354431,
"step": 620
},
{
"epoch": 0.3023758099352052,
"grad_norm": 41.0,
"learning_rate": 4.402782989673867e-06,
"logits/chosen": -2.648524761199951,
"logits/rejected": -2.49703311920166,
"logps/chosen": -240.3001251220703,
"logps/rejected": -206.89743041992188,
"loss": 0.4332,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.16457466781139374,
"rewards/margins": 0.8773033022880554,
"rewards/rejected": -0.7127286195755005,
"step": 630
},
{
"epoch": 0.30717542596592273,
"grad_norm": 53.25,
"learning_rate": 4.375333104518842e-06,
"logits/chosen": -2.567253589630127,
"logits/rejected": -2.5297513008117676,
"logps/chosen": -224.3708038330078,
"logps/rejected": -223.1370086669922,
"loss": 0.5021,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.020184341818094254,
"rewards/margins": 0.7216410636901855,
"rewards/rejected": -0.7418254613876343,
"step": 640
},
{
"epoch": 0.3119750419966403,
"grad_norm": 52.0,
"learning_rate": 4.347356197270852e-06,
"logits/chosen": -2.629487991333008,
"logits/rejected": -2.4799911975860596,
"logps/chosen": -232.6698455810547,
"logps/rejected": -211.57406616210938,
"loss": 0.4987,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.05573948472738266,
"rewards/margins": 0.8003142476081848,
"rewards/rejected": -0.744574785232544,
"step": 650
},
{
"epoch": 0.3167746580273578,
"grad_norm": 51.25,
"learning_rate": 4.318860130238828e-06,
"logits/chosen": -2.564279794692993,
"logits/rejected": -2.4956247806549072,
"logps/chosen": -216.66201782226562,
"logps/rejected": -223.7574005126953,
"loss": 0.4958,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.15754355490207672,
"rewards/margins": 0.775588870048523,
"rewards/rejected": -0.618045449256897,
"step": 660
},
{
"epoch": 0.32157427405807537,
"grad_norm": 35.25,
"learning_rate": 4.289852911630407e-06,
"logits/chosen": -2.656132221221924,
"logits/rejected": -2.506979465484619,
"logps/chosen": -259.60101318359375,
"logps/rejected": -215.4107208251953,
"loss": 0.4511,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.25082486867904663,
"rewards/margins": 0.930018424987793,
"rewards/rejected": -0.6791934967041016,
"step": 670
},
{
"epoch": 0.3263738900887929,
"grad_norm": 65.5,
"learning_rate": 4.260342693301396e-06,
"logits/chosen": -2.630061626434326,
"logits/rejected": -2.4976465702056885,
"logps/chosen": -234.99032592773438,
"logps/rejected": -197.80252075195312,
"loss": 0.4984,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.04209694266319275,
"rewards/margins": 0.7394791841506958,
"rewards/rejected": -0.6973822116851807,
"step": 680
},
{
"epoch": 0.33117350611951046,
"grad_norm": 36.25,
"learning_rate": 4.2303377684648735e-06,
"logits/chosen": -2.6006178855895996,
"logits/rejected": -2.531825304031372,
"logps/chosen": -228.2000732421875,
"logps/rejected": -233.7799530029297,
"loss": 0.4636,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.18744561076164246,
"rewards/margins": 0.8298454284667969,
"rewards/rejected": -0.6423999071121216,
"step": 690
},
{
"epoch": 0.33597312215022795,
"grad_norm": 44.0,
"learning_rate": 4.199846569360558e-06,
"logits/chosen": -2.6074843406677246,
"logits/rejected": -2.5071351528167725,
"logps/chosen": -234.10958862304688,
"logps/rejected": -215.6813507080078,
"loss": 0.5146,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.24338316917419434,
"rewards/margins": 0.7707773447036743,
"rewards/rejected": -0.52739417552948,
"step": 700
},
{
"epoch": 0.3407727381809455,
"grad_norm": 47.0,
"learning_rate": 4.168877664885104e-06,
"logits/chosen": -2.610435962677002,
"logits/rejected": -2.462646961212158,
"logps/chosen": -227.91342163085938,
"logps/rejected": -188.88912963867188,
"loss": 0.4715,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.12846776843070984,
"rewards/margins": 0.8934639096260071,
"rewards/rejected": -0.7649961709976196,
"step": 710
},
{
"epoch": 0.34557235421166305,
"grad_norm": 33.5,
"learning_rate": 4.1374397581840035e-06,
"logits/chosen": -2.6382699012756348,
"logits/rejected": -2.5071628093719482,
"logps/chosen": -226.78402709960938,
"logps/rejected": -190.30447387695312,
"loss": 0.4784,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.12799586355686188,
"rewards/margins": 0.7761613130569458,
"rewards/rejected": -0.6481654047966003,
"step": 720
},
{
"epoch": 0.3503719702423806,
"grad_norm": 55.75,
"learning_rate": 4.105541684205752e-06,
"logits/chosen": -2.590768337249756,
"logits/rejected": -2.486171007156372,
"logps/chosen": -217.5499725341797,
"logps/rejected": -202.66116333007812,
"loss": 0.497,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.11741193383932114,
"rewards/margins": 0.8474688529968262,
"rewards/rejected": -0.7300569415092468,
"step": 730
},
{
"epoch": 0.35517158627309814,
"grad_norm": 39.75,
"learning_rate": 4.073192407218972e-06,
"logits/chosen": -2.633659839630127,
"logits/rejected": -2.501005172729492,
"logps/chosen": -239.923095703125,
"logps/rejected": -199.56039428710938,
"loss": 0.4329,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.1744803935289383,
"rewards/margins": 1.0109570026397705,
"rewards/rejected": -0.8364765048027039,
"step": 740
},
{
"epoch": 0.3599712023038157,
"grad_norm": 47.25,
"learning_rate": 4.040401018293204e-06,
"logits/chosen": -2.5651516914367676,
"logits/rejected": -2.5003182888031006,
"logps/chosen": -222.2145538330078,
"logps/rejected": -236.4008331298828,
"loss": 0.4993,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.06561844795942307,
"rewards/margins": 0.7554530501365662,
"rewards/rejected": -0.821071445941925,
"step": 750
},
{
"epoch": 0.36477081833453323,
"grad_norm": 52.5,
"learning_rate": 4.007176732744054e-06,
"logits/chosen": -2.5720465183258057,
"logits/rejected": -2.5482380390167236,
"logps/chosen": -236.49081420898438,
"logps/rejected": -238.8624725341797,
"loss": 0.4684,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.019028931856155396,
"rewards/margins": 0.9050389528274536,
"rewards/rejected": -0.9240678548812866,
"step": 760
},
{
"epoch": 0.3695704343652508,
"grad_norm": 41.5,
"learning_rate": 3.9735288875434254e-06,
"logits/chosen": -2.646091938018799,
"logits/rejected": -2.4673545360565186,
"logps/chosen": -244.1385040283203,
"logps/rejected": -195.28207397460938,
"loss": 0.4731,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.11497173458337784,
"rewards/margins": 0.8432220220565796,
"rewards/rejected": -0.9581937789916992,
"step": 770
},
{
"epoch": 0.3743700503959683,
"grad_norm": 55.75,
"learning_rate": 3.939466938695565e-06,
"logits/chosen": -2.5559213161468506,
"logits/rejected": -2.466019630432129,
"logps/chosen": -253.15365600585938,
"logps/rejected": -228.51220703125,
"loss": 0.5308,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.10333450883626938,
"rewards/margins": 0.7484658360481262,
"rewards/rejected": -0.6451312303543091,
"step": 780
},
{
"epoch": 0.37916966642668587,
"grad_norm": 36.25,
"learning_rate": 3.905000458579657e-06,
"logits/chosen": -2.570517063140869,
"logits/rejected": -2.502270460128784,
"logps/chosen": -209.8747100830078,
"logps/rejected": -230.72579956054688,
"loss": 0.5247,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.09132517874240875,
"rewards/margins": 0.6891080141067505,
"rewards/rejected": -0.5977829098701477,
"step": 790
},
{
"epoch": 0.3839692824574034,
"grad_norm": 41.25,
"learning_rate": 3.87013913325971e-06,
"logits/chosen": -2.5875115394592285,
"logits/rejected": -2.4476943016052246,
"logps/chosen": -259.3215637207031,
"logps/rejected": -214.0452423095703,
"loss": 0.5223,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.05909506604075432,
"rewards/margins": 0.7436100840568542,
"rewards/rejected": -0.6845150589942932,
"step": 800
},
{
"epoch": 0.3839692824574034,
"eval_logits/chosen": -2.6117820739746094,
"eval_logits/rejected": -2.496819257736206,
"eval_logps/chosen": -231.5717010498047,
"eval_logps/rejected": -212.9447784423828,
"eval_loss": 0.46309149265289307,
"eval_rewards/accuracies": 0.7879999876022339,
"eval_rewards/chosen": 0.15612684190273285,
"eval_rewards/margins": 0.8975253701210022,
"eval_rewards/rejected": -0.7413985729217529,
"eval_runtime": 31.9562,
"eval_samples_per_second": 31.293,
"eval_steps_per_second": 7.823,
"step": 800
},
{
"epoch": 0.38876889848812096,
"grad_norm": 41.75,
"learning_rate": 3.8348927597624965e-06,
"logits/chosen": -2.635223388671875,
"logits/rejected": -2.5298709869384766,
"logps/chosen": -231.2666473388672,
"logps/rejected": -217.6613311767578,
"loss": 0.4714,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.08412560075521469,
"rewards/margins": 0.8177760243415833,
"rewards/rejected": -0.7336505055427551,
"step": 810
},
{
"epoch": 0.3935685145188385,
"grad_norm": 40.75,
"learning_rate": 3.7992712433243117e-06,
"logits/chosen": -2.6278603076934814,
"logits/rejected": -2.470078945159912,
"logps/chosen": -235.6140899658203,
"logps/rejected": -188.5288848876953,
"loss": 0.4796,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.1967240571975708,
"rewards/margins": 0.8465791940689087,
"rewards/rejected": -1.0433032512664795,
"step": 820
},
{
"epoch": 0.39836813054955605,
"grad_norm": 48.0,
"learning_rate": 3.7632845946073136e-06,
"logits/chosen": -2.6680498123168945,
"logits/rejected": -2.513561964035034,
"logps/chosen": -250.830810546875,
"logps/rejected": -192.11917114257812,
"loss": 0.4341,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.35141512751579285,
"rewards/margins": 0.932411789894104,
"rewards/rejected": -1.2838269472122192,
"step": 830
},
{
"epoch": 0.4031677465802736,
"grad_norm": 40.0,
"learning_rate": 3.7269429268862513e-06,
"logits/chosen": -2.630174398422241,
"logits/rejected": -2.5625929832458496,
"logps/chosen": -213.17184448242188,
"logps/rejected": -208.76736450195312,
"loss": 0.4875,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.3925122618675232,
"rewards/margins": 0.8376442193984985,
"rewards/rejected": -1.230156421661377,
"step": 840
},
{
"epoch": 0.40796736261099115,
"grad_norm": 35.25,
"learning_rate": 3.690256453206334e-06,
"logits/chosen": -2.6113486289978027,
"logits/rejected": -2.562234401702881,
"logps/chosen": -217.27377319335938,
"logps/rejected": -210.09207153320312,
"loss": 0.4761,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.20575007796287537,
"rewards/margins": 0.932608425617218,
"rewards/rejected": -1.1383583545684814,
"step": 850
},
{
"epoch": 0.41276697864170864,
"grad_norm": 54.5,
"learning_rate": 3.6532354835130844e-06,
"logits/chosen": -2.6446421146392822,
"logits/rejected": -2.535243511199951,
"logps/chosen": -249.38037109375,
"logps/rejected": -225.61679077148438,
"loss": 0.4788,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.0025300472043454647,
"rewards/margins": 0.9329078793525696,
"rewards/rejected": -0.9354379773139954,
"step": 860
},
{
"epoch": 0.4175665946724262,
"grad_norm": 43.5,
"learning_rate": 3.6158904217549446e-06,
"logits/chosen": -2.640524387359619,
"logits/rejected": -2.5773513317108154,
"logps/chosen": -225.6608123779297,
"logps/rejected": -203.6904296875,
"loss": 0.5129,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.13905248045921326,
"rewards/margins": 0.7723478078842163,
"rewards/rejected": -0.911400318145752,
"step": 870
},
{
"epoch": 0.42236621070314373,
"grad_norm": 56.5,
"learning_rate": 3.5782317629594708e-06,
"logits/chosen": -2.6161234378814697,
"logits/rejected": -2.5211544036865234,
"logps/chosen": -240.87353515625,
"logps/rejected": -220.1843719482422,
"loss": 0.5035,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.07784248143434525,
"rewards/margins": 0.8290297389030457,
"rewards/rejected": -0.7511872053146362,
"step": 880
},
{
"epoch": 0.4271658267338613,
"grad_norm": 41.5,
"learning_rate": 3.5402700902839317e-06,
"logits/chosen": -2.5109052658081055,
"logits/rejected": -2.469113826751709,
"logps/chosen": -206.3893280029297,
"logps/rejected": -216.2916717529297,
"loss": 0.481,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.0023569464683532715,
"rewards/margins": 0.8143336176872253,
"rewards/rejected": -0.8166904449462891,
"step": 890
},
{
"epoch": 0.4319654427645788,
"grad_norm": 38.0,
"learning_rate": 3.5020160720411408e-06,
"logits/chosen": -2.620961904525757,
"logits/rejected": -2.4989562034606934,
"logps/chosen": -232.7259979248047,
"logps/rejected": -224.18222045898438,
"loss": 0.457,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.1927359253168106,
"rewards/margins": 0.9317037463188171,
"rewards/rejected": -0.738967776298523,
"step": 900
},
{
"epoch": 0.43676505879529637,
"grad_norm": 60.5,
"learning_rate": 3.4634804587013505e-06,
"logits/chosen": -2.5633127689361572,
"logits/rejected": -2.51041841506958,
"logps/chosen": -216.7091827392578,
"logps/rejected": -217.47030639648438,
"loss": 0.5084,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.10815312713384628,
"rewards/margins": 0.7729175090789795,
"rewards/rejected": -0.8810704946517944,
"step": 910
},
{
"epoch": 0.4415646748260139,
"grad_norm": 44.0,
"learning_rate": 3.424674079871073e-06,
"logits/chosen": -2.5757408142089844,
"logits/rejected": -2.4900078773498535,
"logps/chosen": -217.65444946289062,
"logps/rejected": -204.16091918945312,
"loss": 0.5214,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.12332279980182648,
"rewards/margins": 0.7060663104057312,
"rewards/rejected": -0.8293890953063965,
"step": 920
},
{
"epoch": 0.44636429085673146,
"grad_norm": 36.25,
"learning_rate": 3.3856078412496424e-06,
"logits/chosen": -2.6325020790100098,
"logits/rejected": -2.5013327598571777,
"logps/chosen": -240.5934295654297,
"logps/rejected": -201.51132202148438,
"loss": 0.4388,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.12551036477088928,
"rewards/margins": 0.971343994140625,
"rewards/rejected": -1.0968544483184814,
"step": 930
},
{
"epoch": 0.451163906887449,
"grad_norm": 50.5,
"learning_rate": 3.346292721564407e-06,
"logits/chosen": -2.654001474380493,
"logits/rejected": -2.5543465614318848,
"logps/chosen": -265.34912109375,
"logps/rejected": -228.43905639648438,
"loss": 0.4906,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.23702020943164825,
"rewards/margins": 0.903741717338562,
"rewards/rejected": -1.1407619714736938,
"step": 940
},
{
"epoch": 0.45596352291816655,
"grad_norm": 43.75,
"learning_rate": 3.306739769485394e-06,
"logits/chosen": -2.593740940093994,
"logits/rejected": -2.4713730812072754,
"logps/chosen": -235.24942016601562,
"logps/rejected": -202.47262573242188,
"loss": 0.446,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.12331932783126831,
"rewards/margins": 0.9522517919540405,
"rewards/rejected": -1.075571060180664,
"step": 950
},
{
"epoch": 0.4607631389488841,
"grad_norm": 49.75,
"learning_rate": 3.266960100520316e-06,
"logits/chosen": -2.6186511516571045,
"logits/rejected": -2.5293595790863037,
"logps/chosen": -207.3675537109375,
"logps/rejected": -198.95132446289062,
"loss": 0.445,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.22513869404792786,
"rewards/margins": 1.0883581638336182,
"rewards/rejected": -1.3134969472885132,
"step": 960
},
{
"epoch": 0.46556275497960165,
"grad_norm": 35.75,
"learning_rate": 3.2269648938907977e-06,
"logits/chosen": -2.5796661376953125,
"logits/rejected": -2.469088315963745,
"logps/chosen": -219.3197479248047,
"logps/rejected": -195.90982055664062,
"loss": 0.4861,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.38874492049217224,
"rewards/margins": 0.9575654864311218,
"rewards/rejected": -1.3463106155395508,
"step": 970
},
{
"epoch": 0.4703623710103192,
"grad_norm": 35.75,
"learning_rate": 3.186765389390696e-06,
"logits/chosen": -2.6649694442749023,
"logits/rejected": -2.528609037399292,
"logps/chosen": -255.8763885498047,
"logps/rejected": -208.0857391357422,
"loss": 0.4569,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.3541041314601898,
"rewards/margins": 0.9702304005622864,
"rewards/rejected": -1.3243346214294434,
"step": 980
},
{
"epoch": 0.47516198704103674,
"grad_norm": 53.0,
"learning_rate": 3.146372884227393e-06,
"logits/chosen": -2.630551815032959,
"logits/rejected": -2.5320029258728027,
"logps/chosen": -253.5113983154297,
"logps/rejected": -226.9222412109375,
"loss": 0.5222,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.3444979786872864,
"rewards/margins": 0.7771540880203247,
"rewards/rejected": -1.1216518878936768,
"step": 990
},
{
"epoch": 0.4799616030717543,
"grad_norm": 35.0,
"learning_rate": 3.1057987298469693e-06,
"logits/chosen": -2.55789852142334,
"logits/rejected": -2.4452617168426514,
"logps/chosen": -217.3821258544922,
"logps/rejected": -195.25726318359375,
"loss": 0.4335,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.20461265742778778,
"rewards/margins": 1.0646028518676758,
"rewards/rejected": -1.26921546459198,
"step": 1000
},
{
"epoch": 0.4799616030717543,
"eval_logits/chosen": -2.6097192764282227,
"eval_logits/rejected": -2.497605085372925,
"eval_logps/chosen": -234.7398681640625,
"eval_logps/rejected": -217.08157348632812,
"eval_loss": 0.45548132061958313,
"eval_rewards/accuracies": 0.7950000166893005,
"eval_rewards/chosen": -0.16069155931472778,
"eval_rewards/margins": 0.9943889379501343,
"eval_rewards/rejected": -1.1550804376602173,
"eval_runtime": 32.012,
"eval_samples_per_second": 31.238,
"eval_steps_per_second": 7.81,
"step": 1000
},
{
"epoch": 0.48476121910247183,
"grad_norm": 35.75,
"learning_rate": 3.06505432874411e-06,
"logits/chosen": -2.5709657669067383,
"logits/rejected": -2.432443141937256,
"logps/chosen": -251.8423614501953,
"logps/rejected": -219.88711547851562,
"loss": 0.469,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.1605864316225052,
"rewards/margins": 0.9071685671806335,
"rewards/rejected": -1.0677549839019775,
"step": 1010
},
{
"epoch": 0.4895608351331893,
"grad_norm": 44.0,
"learning_rate": 3.024151131257688e-06,
"logits/chosen": -2.601938009262085,
"logits/rejected": -2.4874444007873535,
"logps/chosen": -249.327880859375,
"logps/rejected": -204.016357421875,
"loss": 0.4554,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.34456175565719604,
"rewards/margins": 0.9197514653205872,
"rewards/rejected": -1.2643131017684937,
"step": 1020
},
{
"epoch": 0.49436045116390687,
"grad_norm": 52.25,
"learning_rate": 2.983100632352889e-06,
"logits/chosen": -2.67118501663208,
"logits/rejected": -2.495823860168457,
"logps/chosen": -256.9388732910156,
"logps/rejected": -210.40402221679688,
"loss": 0.4574,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.21712104976177216,
"rewards/margins": 1.0202686786651611,
"rewards/rejected": -1.2373896837234497,
"step": 1030
},
{
"epoch": 0.4991600671946244,
"grad_norm": 48.0,
"learning_rate": 2.9419143683907987e-06,
"logits/chosen": -2.5967347621917725,
"logits/rejected": -2.5237019062042236,
"logps/chosen": -223.098876953125,
"logps/rejected": -219.5789031982422,
"loss": 0.4829,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.4448261260986328,
"rewards/margins": 0.9350289106369019,
"rewards/rejected": -1.3798550367355347,
"step": 1040
},
{
"epoch": 0.503959683225342,
"grad_norm": 44.25,
"learning_rate": 2.9006039138863572e-06,
"logits/chosen": -2.5805346965789795,
"logits/rejected": -2.4660849571228027,
"logps/chosen": -247.05136108398438,
"logps/rejected": -224.231201171875,
"loss": 0.415,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.20926351845264435,
"rewards/margins": 1.0519158840179443,
"rewards/rejected": -1.2611795663833618,
"step": 1050
},
{
"epoch": 0.5087592992560596,
"grad_norm": 36.0,
"learning_rate": 2.8591808782555883e-06,
"logits/chosen": -2.58505916595459,
"logits/rejected": -2.5212624073028564,
"logps/chosen": -233.4881134033203,
"logps/rejected": -227.78945922851562,
"loss": 0.4266,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.09728523343801498,
"rewards/margins": 1.1170459985733032,
"rewards/rejected": -1.2143312692642212,
"step": 1060
},
{
"epoch": 0.5135589152867771,
"grad_norm": 52.75,
"learning_rate": 2.817656902553024e-06,
"logits/chosen": -2.6064798831939697,
"logits/rejected": -2.511019468307495,
"logps/chosen": -234.0939483642578,
"logps/rejected": -214.7052764892578,
"loss": 0.4975,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.10685852915048599,
"rewards/margins": 0.8924004435539246,
"rewards/rejected": -0.9992589950561523,
"step": 1070
},
{
"epoch": 0.5183585313174947,
"grad_norm": 52.75,
"learning_rate": 2.7760436562002354e-06,
"logits/chosen": -2.5962636470794678,
"logits/rejected": -2.428013324737549,
"logps/chosen": -267.91326904296875,
"logps/rejected": -191.6916046142578,
"loss": 0.5203,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.1610618531703949,
"rewards/margins": 0.8274253606796265,
"rewards/rejected": -0.9884872436523438,
"step": 1080
},
{
"epoch": 0.5231581473482121,
"grad_norm": 39.0,
"learning_rate": 2.7343528337063924e-06,
"logits/chosen": -2.6854658126831055,
"logits/rejected": -2.561249256134033,
"logps/chosen": -250.7506866455078,
"logps/rejected": -225.0154266357422,
"loss": 0.4171,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.04954652860760689,
"rewards/margins": 1.022822618484497,
"rewards/rejected": -1.07236909866333,
"step": 1090
},
{
"epoch": 0.5279577633789296,
"grad_norm": 28.75,
"learning_rate": 2.692596151381774e-06,
"logits/chosen": -2.5916976928710938,
"logits/rejected": -2.5371453762054443,
"logps/chosen": -202.2080078125,
"logps/rejected": -212.92587280273438,
"loss": 0.4372,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.23717375099658966,
"rewards/margins": 0.9503253698348999,
"rewards/rejected": -1.1874991655349731,
"step": 1100
},
{
"epoch": 0.5327573794096472,
"grad_norm": 43.5,
"learning_rate": 2.650785344045149e-06,
"logits/chosen": -2.6039624214172363,
"logits/rejected": -2.515532970428467,
"logps/chosen": -228.81729125976562,
"logps/rejected": -217.0961151123047,
"loss": 0.4695,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.10938185453414917,
"rewards/margins": 1.068205714225769,
"rewards/rejected": -1.177587628364563,
"step": 1110
},
{
"epoch": 0.5375569954403647,
"grad_norm": 33.75,
"learning_rate": 2.6089321617259583e-06,
"logits/chosen": -2.6028213500976562,
"logits/rejected": -2.479320764541626,
"logps/chosen": -232.82186889648438,
"logps/rejected": -216.1719207763672,
"loss": 0.4028,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.030288416892290115,
"rewards/margins": 1.2107077836990356,
"rewards/rejected": -1.2409961223602295,
"step": 1120
},
{
"epoch": 0.5423566114710823,
"grad_norm": 58.75,
"learning_rate": 2.567048366362225e-06,
"logits/chosen": -2.5917420387268066,
"logits/rejected": -2.491093873977661,
"logps/chosen": -242.26766967773438,
"logps/rejected": -212.3538055419922,
"loss": 0.4948,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.2735952138900757,
"rewards/margins": 0.8888144493103027,
"rewards/rejected": -1.1624095439910889,
"step": 1130
},
{
"epoch": 0.5471562275017998,
"grad_norm": 33.5,
"learning_rate": 2.525145728495106e-06,
"logits/chosen": -2.608853816986084,
"logits/rejected": -2.5186550617218018,
"logps/chosen": -230.33004760742188,
"logps/rejected": -204.28524780273438,
"loss": 0.4649,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.2685418426990509,
"rewards/margins": 1.0452083349227905,
"rewards/rejected": -1.313750147819519,
"step": 1140
},
{
"epoch": 0.5519558435325174,
"grad_norm": 38.0,
"learning_rate": 2.4832360239610416e-06,
"logits/chosen": -2.6019065380096436,
"logits/rejected": -2.4967987537384033,
"logps/chosen": -231.5498809814453,
"logps/rejected": -213.8773651123047,
"loss": 0.4629,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.29873836040496826,
"rewards/margins": 0.9136406183242798,
"rewards/rejected": -1.212378978729248,
"step": 1150
},
{
"epoch": 0.5567554595632349,
"grad_norm": 49.25,
"learning_rate": 2.441331030582407e-06,
"logits/chosen": -2.615382671356201,
"logits/rejected": -2.5341572761535645,
"logps/chosen": -226.35079956054688,
"logps/rejected": -216.3585662841797,
"loss": 0.4676,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.21219320595264435,
"rewards/margins": 0.8832317590713501,
"rewards/rejected": -1.0954248905181885,
"step": 1160
},
{
"epoch": 0.5615550755939525,
"grad_norm": 52.0,
"learning_rate": 2.3994425248576102e-06,
"logits/chosen": -2.6414036750793457,
"logits/rejected": -2.504991054534912,
"logps/chosen": -248.39572143554688,
"logps/rejected": -202.54635620117188,
"loss": 0.5142,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.26081639528274536,
"rewards/margins": 0.7797340154647827,
"rewards/rejected": -1.0405504703521729,
"step": 1170
},
{
"epoch": 0.56635469162467,
"grad_norm": 56.0,
"learning_rate": 2.357582278651553e-06,
"logits/chosen": -2.580275535583496,
"logits/rejected": -2.4745254516601562,
"logps/chosen": -234.48818969726562,
"logps/rejected": -222.6685028076172,
"loss": 0.4538,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.2316078245639801,
"rewards/margins": 1.032219409942627,
"rewards/rejected": -1.2638272047042847,
"step": 1180
},
{
"epoch": 0.5711543076553875,
"grad_norm": 44.5,
"learning_rate": 2.315762055887411e-06,
"logits/chosen": -2.6241419315338135,
"logits/rejected": -2.4953231811523438,
"logps/chosen": -246.7222442626953,
"logps/rejected": -206.4970703125,
"loss": 0.5141,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.31947702169418335,
"rewards/margins": 0.9516233205795288,
"rewards/rejected": -1.271100401878357,
"step": 1190
},
{
"epoch": 0.5759539236861051,
"grad_norm": 37.5,
"learning_rate": 2.273993609240629e-06,
"logits/chosen": -2.5731539726257324,
"logits/rejected": -2.483686685562134,
"logps/chosen": -229.90762329101562,
"logps/rejected": -225.7178955078125,
"loss": 0.5214,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.3317897915840149,
"rewards/margins": 0.849661648273468,
"rewards/rejected": -1.1814515590667725,
"step": 1200
},
{
"epoch": 0.5759539236861051,
"eval_logits/chosen": -2.6119532585144043,
"eval_logits/rejected": -2.5000758171081543,
"eval_logps/chosen": -234.451904296875,
"eval_logps/rejected": -217.00180053710938,
"eval_loss": 0.45114314556121826,
"eval_rewards/accuracies": 0.7990000247955322,
"eval_rewards/chosen": -0.13189174234867096,
"eval_rewards/margins": 1.0152093172073364,
"eval_rewards/rejected": -1.1471011638641357,
"eval_runtime": 31.9635,
"eval_samples_per_second": 31.286,
"eval_steps_per_second": 7.821,
"step": 1200
},
{
"epoch": 0.5807535397168226,
"grad_norm": 59.0,
"learning_rate": 2.2322886768360874e-06,
"logits/chosen": -2.513803005218506,
"logits/rejected": -2.4552152156829834,
"logps/chosen": -239.845703125,
"logps/rejected": -215.5902099609375,
"loss": 0.4561,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.0830584168434143,
"rewards/margins": 1.117357850074768,
"rewards/rejected": -1.2004162073135376,
"step": 1210
},
{
"epoch": 0.5855531557475402,
"grad_norm": 49.75,
"learning_rate": 2.190658978949352e-06,
"logits/chosen": -2.607837438583374,
"logits/rejected": -2.4749207496643066,
"logps/chosen": -220.0395050048828,
"logps/rejected": -194.2960968017578,
"loss": 0.4986,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.1924716681241989,
"rewards/margins": 0.8406950235366821,
"rewards/rejected": -1.0331666469573975,
"step": 1220
},
{
"epoch": 0.5903527717782577,
"grad_norm": 57.25,
"learning_rate": 2.149116214712943e-06,
"logits/chosen": -2.6154885292053223,
"logits/rejected": -2.509730577468872,
"logps/chosen": -234.84402465820312,
"logps/rejected": -219.07339477539062,
"loss": 0.4841,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1697658747434616,
"rewards/margins": 0.8840200304985046,
"rewards/rejected": -1.053786039352417,
"step": 1230
},
{
"epoch": 0.5951523878089753,
"grad_norm": 55.0,
"learning_rate": 2.107672058828544e-06,
"logits/chosen": -2.6242995262145996,
"logits/rejected": -2.51531720161438,
"logps/chosen": -228.4006805419922,
"logps/rejected": -202.6013641357422,
"loss": 0.474,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.13985280692577362,
"rewards/margins": 0.875139594078064,
"rewards/rejected": -1.0149924755096436,
"step": 1240
},
{
"epoch": 0.5999520038396928,
"grad_norm": 60.5,
"learning_rate": 2.066338158286083e-06,
"logits/chosen": -2.6175296306610107,
"logits/rejected": -2.549783229827881,
"logps/chosen": -227.7768096923828,
"logps/rejected": -222.3975830078125,
"loss": 0.3924,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.06711763888597488,
"rewards/margins": 1.2078065872192383,
"rewards/rejected": -1.2749242782592773,
"step": 1250
},
{
"epoch": 0.6047516198704104,
"grad_norm": 43.5,
"learning_rate": 2.025126129090588e-06,
"logits/chosen": -2.673826217651367,
"logits/rejected": -2.53361177444458,
"logps/chosen": -222.8029327392578,
"logps/rejected": -191.4785919189453,
"loss": 0.4424,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.0933801457285881,
"rewards/margins": 1.0600812435150146,
"rewards/rejected": -1.1534613370895386,
"step": 1260
},
{
"epoch": 0.6095512359011279,
"grad_norm": 49.75,
"learning_rate": 1.9840475529977655e-06,
"logits/chosen": -2.6106059551239014,
"logits/rejected": -2.5068392753601074,
"logps/chosen": -230.1819610595703,
"logps/rejected": -208.9829559326172,
"loss": 0.4371,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.1759682148694992,
"rewards/margins": 1.1224491596221924,
"rewards/rejected": -1.298417329788208,
"step": 1270
},
{
"epoch": 0.6143508519318455,
"grad_norm": 40.0,
"learning_rate": 1.9431139742591897e-06,
"logits/chosen": -2.594632625579834,
"logits/rejected": -2.489180088043213,
"logps/chosen": -209.05859375,
"logps/rejected": -198.956298828125,
"loss": 0.4101,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -0.08843652904033661,
"rewards/margins": 1.0289084911346436,
"rewards/rejected": -1.117344856262207,
"step": 1280
},
{
"epoch": 0.619150467962563,
"grad_norm": 49.0,
"learning_rate": 1.9023368963780458e-06,
"logits/chosen": -2.6134049892425537,
"logits/rejected": -2.5106282234191895,
"logps/chosen": -234.5142059326172,
"logps/rejected": -207.1331329345703,
"loss": 0.4701,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.14769946038722992,
"rewards/margins": 0.9224032163619995,
"rewards/rejected": -1.0701026916503906,
"step": 1290
},
{
"epoch": 0.6239500839932806,
"grad_norm": 42.0,
"learning_rate": 1.861727778876314e-06,
"logits/chosen": -2.5988595485687256,
"logits/rejected": -2.4945011138916016,
"logps/chosen": -209.90835571289062,
"logps/rejected": -185.820068359375,
"loss": 0.4437,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.21710054576396942,
"rewards/margins": 1.0192363262176514,
"rewards/rejected": -1.2363369464874268,
"step": 1300
},
{
"epoch": 0.6287497000239981,
"grad_norm": 37.0,
"learning_rate": 1.8212980340743152e-06,
"logits/chosen": -2.6286704540252686,
"logits/rejected": -2.5620837211608887,
"logps/chosen": -225.5823974609375,
"logps/rejected": -215.69375610351562,
"loss": 0.4882,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.22902747988700867,
"rewards/margins": 0.9663082957267761,
"rewards/rejected": -1.195335865020752,
"step": 1310
},
{
"epoch": 0.6335493160547156,
"grad_norm": 39.75,
"learning_rate": 1.7810590238835279e-06,
"logits/chosen": -2.5562384128570557,
"logits/rejected": -2.534426212310791,
"logps/chosen": -226.40902709960938,
"logps/rejected": -251.94418334960938,
"loss": 0.4495,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.19038750231266022,
"rewards/margins": 1.0307300090789795,
"rewards/rejected": -1.2211174964904785,
"step": 1320
},
{
"epoch": 0.6383489320854332,
"grad_norm": 43.0,
"learning_rate": 1.7410220566135605e-06,
"logits/chosen": -2.6242737770080566,
"logits/rejected": -2.5161290168762207,
"logps/chosen": -228.24398803710938,
"logps/rejected": -207.40316772460938,
"loss": 0.4474,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -0.06271727383136749,
"rewards/margins": 1.0257116556167603,
"rewards/rejected": -1.0884288549423218,
"step": 1330
},
{
"epoch": 0.6431485481161507,
"grad_norm": 58.5,
"learning_rate": 1.7011983837942023e-06,
"logits/chosen": -2.6005454063415527,
"logits/rejected": -2.4911046028137207,
"logps/chosen": -235.7122344970703,
"logps/rejected": -217.5489044189453,
"loss": 0.4614,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.12799616158008575,
"rewards/margins": 0.9990032315254211,
"rewards/rejected": -1.1269992589950562,
"step": 1340
},
{
"epoch": 0.6479481641468683,
"grad_norm": 45.25,
"learning_rate": 1.661599197013416e-06,
"logits/chosen": -2.633547306060791,
"logits/rejected": -2.5312139987945557,
"logps/chosen": -223.9100341796875,
"logps/rejected": -203.5366973876953,
"loss": 0.4765,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.2294052541255951,
"rewards/margins": 0.95598304271698,
"rewards/rejected": -1.185388207435608,
"step": 1350
},
{
"epoch": 0.6527477801775858,
"grad_norm": 46.75,
"learning_rate": 1.6222356247721831e-06,
"logits/chosen": -2.5966320037841797,
"logits/rejected": -2.4941086769104004,
"logps/chosen": -234.2575225830078,
"logps/rejected": -221.3917999267578,
"loss": 0.4476,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.12550556659698486,
"rewards/margins": 1.0439780950546265,
"rewards/rejected": -1.1694835424423218,
"step": 1360
},
{
"epoch": 0.6575473962083034,
"grad_norm": 46.0,
"learning_rate": 1.5831187293570826e-06,
"logits/chosen": -2.616199016571045,
"logits/rejected": -2.5082523822784424,
"logps/chosen": -275.43511962890625,
"logps/rejected": -229.6664276123047,
"loss": 0.4631,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.2143980711698532,
"rewards/margins": 1.0123686790466309,
"rewards/rejected": -1.22676682472229,
"step": 1370
},
{
"epoch": 0.6623470122390209,
"grad_norm": 48.5,
"learning_rate": 1.544259503731465e-06,
"logits/chosen": -2.612053632736206,
"logits/rejected": -2.4867258071899414,
"logps/chosen": -240.70059204101562,
"logps/rejected": -198.64918518066406,
"loss": 0.3862,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -0.0024546384811401367,
"rewards/margins": 1.1859334707260132,
"rewards/rejected": -1.1883881092071533,
"step": 1380
},
{
"epoch": 0.6671466282697385,
"grad_norm": 36.0,
"learning_rate": 1.5056688684461235e-06,
"logits/chosen": -2.6124138832092285,
"logits/rejected": -2.491760730743408,
"logps/chosen": -245.6887969970703,
"logps/rejected": -216.34268188476562,
"loss": 0.4459,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -0.3357298672199249,
"rewards/margins": 1.0037829875946045,
"rewards/rejected": -1.3395130634307861,
"step": 1390
},
{
"epoch": 0.6719462443004559,
"grad_norm": 54.0,
"learning_rate": 1.4673576685703027e-06,
"logits/chosen": -2.6035733222961426,
"logits/rejected": -2.521059513092041,
"logps/chosen": -243.61123657226562,
"logps/rejected": -220.312744140625,
"loss": 0.4784,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.27499574422836304,
"rewards/margins": 0.896142840385437,
"rewards/rejected": -1.1711386442184448,
"step": 1400
},
{
"epoch": 0.6719462443004559,
"eval_logits/chosen": -2.6097218990325928,
"eval_logits/rejected": -2.4986746311187744,
"eval_logps/chosen": -234.91114807128906,
"eval_logps/rejected": -217.71800231933594,
"eval_loss": 0.4486246407032013,
"eval_rewards/accuracies": 0.800000011920929,
"eval_rewards/chosen": -0.17781423032283783,
"eval_rewards/margins": 1.0409064292907715,
"eval_rewards/rejected": -1.218720555305481,
"eval_runtime": 31.822,
"eval_samples_per_second": 31.425,
"eval_steps_per_second": 7.856,
"step": 1400
},
{
"epoch": 0.6767458603311735,
"grad_norm": 28.0,
"learning_rate": 1.4293366706439293e-06,
"logits/chosen": -2.5871994495391846,
"logits/rejected": -2.4666335582733154,
"logps/chosen": -221.1563262939453,
"logps/rejected": -207.7967529296875,
"loss": 0.4466,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.30613771080970764,
"rewards/margins": 1.0556987524032593,
"rewards/rejected": -1.361836314201355,
"step": 1410
},
{
"epoch": 0.681545476361891,
"grad_norm": 41.25,
"learning_rate": 1.3916165596519015e-06,
"logits/chosen": -2.6185154914855957,
"logits/rejected": -2.452584981918335,
"logps/chosen": -233.75131225585938,
"logps/rejected": -198.76773071289062,
"loss": 0.4315,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.31702926754951477,
"rewards/margins": 0.9871221780776978,
"rewards/rejected": -1.3041512966156006,
"step": 1420
},
{
"epoch": 0.6863450923926085,
"grad_norm": 47.0,
"learning_rate": 1.3542079360213089e-06,
"logits/chosen": -2.6181461811065674,
"logits/rejected": -2.469588041305542,
"logps/chosen": -224.7504425048828,
"logps/rejected": -193.1361541748047,
"loss": 0.4592,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.17981493473052979,
"rewards/margins": 0.9263655543327332,
"rewards/rejected": -1.1061805486679077,
"step": 1430
},
{
"epoch": 0.6911447084233261,
"grad_norm": 52.75,
"learning_rate": 1.317121312642406e-06,
"logits/chosen": -2.6066277027130127,
"logits/rejected": -2.465261936187744,
"logps/chosen": -233.60720825195312,
"logps/rejected": -211.9008026123047,
"loss": 0.4429,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.18064916133880615,
"rewards/margins": 1.064951777458191,
"rewards/rejected": -1.245600938796997,
"step": 1440
},
{
"epoch": 0.6959443244540436,
"grad_norm": 45.0,
"learning_rate": 1.2803671119141953e-06,
"logits/chosen": -2.5287094116210938,
"logits/rejected": -2.4450387954711914,
"logps/chosen": -248.5905303955078,
"logps/rejected": -240.5746307373047,
"loss": 0.478,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.2681611478328705,
"rewards/margins": 1.0177576541900635,
"rewards/rejected": -1.285918951034546,
"step": 1450
},
{
"epoch": 0.7007439404847612,
"grad_norm": 34.5,
"learning_rate": 1.2439556628154293e-06,
"logits/chosen": -2.612083911895752,
"logits/rejected": -2.450336456298828,
"logps/chosen": -250.229248046875,
"logps/rejected": -219.356689453125,
"loss": 0.4872,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.24559959769248962,
"rewards/margins": 0.9850479960441589,
"rewards/rejected": -1.2306474447250366,
"step": 1460
},
{
"epoch": 0.7055435565154787,
"grad_norm": 41.5,
"learning_rate": 1.207897198001878e-06,
"logits/chosen": -2.644726037979126,
"logits/rejected": -2.5499677658081055,
"logps/chosen": -232.20126342773438,
"logps/rejected": -208.1292724609375,
"loss": 0.4442,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.15714068710803986,
"rewards/margins": 1.035380482673645,
"rewards/rejected": -1.192521095275879,
"step": 1470
},
{
"epoch": 0.7103431725461963,
"grad_norm": 42.5,
"learning_rate": 1.1722018509306587e-06,
"logits/chosen": -2.6035397052764893,
"logits/rejected": -2.4596409797668457,
"logps/chosen": -250.045166015625,
"logps/rejected": -202.33425903320312,
"loss": 0.4371,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.22928068041801453,
"rewards/margins": 1.1220018863677979,
"rewards/rejected": -1.3512827157974243,
"step": 1480
},
{
"epoch": 0.7151427885769138,
"grad_norm": 38.5,
"learning_rate": 1.1368796530124442e-06,
"logits/chosen": -2.5670337677001953,
"logits/rejected": -2.446946620941162,
"logps/chosen": -252.01260375976562,
"logps/rejected": -207.9329071044922,
"loss": 0.4135,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.18759331107139587,
"rewards/margins": 1.0254666805267334,
"rewards/rejected": -1.213059902191162,
"step": 1490
},
{
"epoch": 0.7199424046076314,
"grad_norm": 51.25,
"learning_rate": 1.101940530792356e-06,
"logits/chosen": -2.6158971786499023,
"logits/rejected": -2.5017120838165283,
"logps/chosen": -245.5004425048828,
"logps/rejected": -214.7168731689453,
"loss": 0.4562,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.01713145337998867,
"rewards/margins": 0.9989277124404907,
"rewards/rejected": -1.016059160232544,
"step": 1500
},
{
"epoch": 0.7247420206383489,
"grad_norm": 30.5,
"learning_rate": 1.0673943031603134e-06,
"logits/chosen": -2.605909824371338,
"logits/rejected": -2.5186052322387695,
"logps/chosen": -225.20675659179688,
"logps/rejected": -219.88949584960938,
"loss": 0.4336,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.2961525321006775,
"rewards/margins": 1.0499502420425415,
"rewards/rejected": -1.3461029529571533,
"step": 1510
},
{
"epoch": 0.7295416366690665,
"grad_norm": 42.75,
"learning_rate": 1.0332506785916524e-06,
"logits/chosen": -2.590073823928833,
"logits/rejected": -2.493535041809082,
"logps/chosen": -241.81918334960938,
"logps/rejected": -221.04598999023438,
"loss": 0.4794,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.2541709542274475,
"rewards/margins": 0.9314506649971008,
"rewards/rejected": -1.1856216192245483,
"step": 1520
},
{
"epoch": 0.734341252699784,
"grad_norm": 44.5,
"learning_rate": 9.995192524187639e-07,
"logits/chosen": -2.5296549797058105,
"logits/rejected": -2.480565071105957,
"logps/chosen": -224.52774047851562,
"logps/rejected": -220.7323760986328,
"loss": 0.506,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.37830549478530884,
"rewards/margins": 0.9571624994277954,
"rewards/rejected": -1.335468053817749,
"step": 1530
},
{
"epoch": 0.7391408687305016,
"grad_norm": 48.0,
"learning_rate": 9.662095041345318e-07,
"logits/chosen": -2.569598436355591,
"logits/rejected": -2.4485225677490234,
"logps/chosen": -247.4419708251953,
"logps/rejected": -228.1896209716797,
"loss": 0.4693,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.18139569461345673,
"rewards/margins": 1.0061061382293701,
"rewards/rejected": -1.187502145767212,
"step": 1540
},
{
"epoch": 0.7439404847612191,
"grad_norm": 46.75,
"learning_rate": 9.333307947283258e-07,
"logits/chosen": -2.628610610961914,
"logits/rejected": -2.5239391326904297,
"logps/chosen": -243.1181640625,
"logps/rejected": -224.0470733642578,
"loss": 0.448,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.21100196242332458,
"rewards/margins": 0.9517234563827515,
"rewards/rejected": -1.1627254486083984,
"step": 1550
},
{
"epoch": 0.7487401007919366,
"grad_norm": 34.75,
"learning_rate": 9.00892364055298e-07,
"logits/chosen": -2.587705135345459,
"logits/rejected": -2.5059056282043457,
"logps/chosen": -214.00631713867188,
"logps/rejected": -191.03036499023438,
"loss": 0.4244,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.27500849962234497,
"rewards/margins": 1.0308496952056885,
"rewards/rejected": -1.3058582544326782,
"step": 1560
},
{
"epoch": 0.7535397168226542,
"grad_norm": 47.5,
"learning_rate": 8.689033282397166e-07,
"logits/chosen": -2.6136879920959473,
"logits/rejected": -2.5038022994995117,
"logps/chosen": -231.7812957763672,
"logps/rejected": -214.75051879882812,
"loss": 0.4818,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.29626065492630005,
"rewards/margins": 0.9530956149101257,
"rewards/rejected": -1.2493562698364258,
"step": 1570
},
{
"epoch": 0.7583393328533717,
"grad_norm": 30.125,
"learning_rate": 8.373726771130769e-07,
"logits/chosen": -2.607466220855713,
"logits/rejected": -2.4886395931243896,
"logps/chosen": -242.94442749023438,
"logps/rejected": -211.0756072998047,
"loss": 0.4417,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.22200937569141388,
"rewards/margins": 1.0732426643371582,
"rewards/rejected": -1.2952520847320557,
"step": 1580
},
{
"epoch": 0.7631389488840893,
"grad_norm": 54.5,
"learning_rate": 8.063092716877016e-07,
"logits/chosen": -2.549015998840332,
"logits/rejected": -2.454470157623291,
"logps/chosen": -255.9425811767578,
"logps/rejected": -221.3877410888672,
"loss": 0.4876,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.1736731380224228,
"rewards/margins": 0.8451417088508606,
"rewards/rejected": -1.0188149213790894,
"step": 1590
},
{
"epoch": 0.7679385649148068,
"grad_norm": 37.75,
"learning_rate": 7.757218416665446e-07,
"logits/chosen": -2.6379103660583496,
"logits/rejected": -2.488036632537842,
"logps/chosen": -232.8388214111328,
"logps/rejected": -202.93429565429688,
"loss": 0.4223,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.2727198600769043,
"rewards/margins": 1.1647435426712036,
"rewards/rejected": -1.4374632835388184,
"step": 1600
},
{
"epoch": 0.7679385649148068,
"eval_logits/chosen": -2.609999656677246,
"eval_logits/rejected": -2.498737096786499,
"eval_logps/chosen": -234.40658569335938,
"eval_logps/rejected": -217.20668029785156,
"eval_loss": 0.44873249530792236,
"eval_rewards/accuracies": 0.7990000247955322,
"eval_rewards/chosen": -0.12736062705516815,
"eval_rewards/margins": 1.040230393409729,
"eval_rewards/rejected": -1.1675910949707031,
"eval_runtime": 31.9547,
"eval_samples_per_second": 31.294,
"eval_steps_per_second": 7.824,
"step": 1600
},
{
"epoch": 0.7727381809455244,
"grad_norm": 40.5,
"learning_rate": 7.456189829898955e-07,
"logits/chosen": -2.617159366607666,
"logits/rejected": -2.4671359062194824,
"logps/chosen": -239.3896484375,
"logps/rejected": -200.057861328125,
"loss": 0.4496,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.19172294437885284,
"rewards/margins": 1.0450642108917236,
"rewards/rejected": -1.23678719997406,
"step": 1610
},
{
"epoch": 0.7775377969762419,
"grad_norm": 37.0,
"learning_rate": 7.160091554196732e-07,
"logits/chosen": -2.6596245765686035,
"logits/rejected": -2.534268617630005,
"logps/chosen": -236.6833953857422,
"logps/rejected": -209.33248901367188,
"loss": 0.4434,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.2100035846233368,
"rewards/margins": 1.0765715837478638,
"rewards/rejected": -1.2865750789642334,
"step": 1620
},
{
"epoch": 0.7823374130069595,
"grad_norm": 48.5,
"learning_rate": 6.869006801619941e-07,
"logits/chosen": -2.57913875579834,
"logits/rejected": -2.5170235633850098,
"logps/chosen": -243.2036895751953,
"logps/rejected": -240.40792846679688,
"loss": 0.4967,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.19071198999881744,
"rewards/margins": 1.0176457166671753,
"rewards/rejected": -1.208357810974121,
"step": 1630
},
{
"epoch": 0.787137029037677,
"grad_norm": 54.5,
"learning_rate": 6.583017375286726e-07,
"logits/chosen": -2.5934314727783203,
"logits/rejected": -2.4739251136779785,
"logps/chosen": -231.6185760498047,
"logps/rejected": -208.304443359375,
"loss": 0.4574,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.2090485543012619,
"rewards/margins": 1.0793238878250122,
"rewards/rejected": -1.2883723974227905,
"step": 1640
},
{
"epoch": 0.7919366450683946,
"grad_norm": 41.75,
"learning_rate": 6.30220364638324e-07,
"logits/chosen": -2.6110548973083496,
"logits/rejected": -2.519888401031494,
"logps/chosen": -244.562255859375,
"logps/rejected": -207.3623504638672,
"loss": 0.4747,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.21555647253990173,
"rewards/margins": 0.8874263763427734,
"rewards/rejected": -1.1029828786849976,
"step": 1650
},
{
"epoch": 0.7967362610991121,
"grad_norm": 45.25,
"learning_rate": 6.02664453157703e-07,
"logits/chosen": -2.6588504314422607,
"logits/rejected": -2.5547537803649902,
"logps/chosen": -235.1988983154297,
"logps/rejected": -226.4233856201172,
"loss": 0.4643,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.2098228931427002,
"rewards/margins": 0.9475749731063843,
"rewards/rejected": -1.157397985458374,
"step": 1660
},
{
"epoch": 0.8015358771298297,
"grad_norm": 38.75,
"learning_rate": 5.756417470839195e-07,
"logits/chosen": -2.6417441368103027,
"logits/rejected": -2.549515724182129,
"logps/chosen": -229.06411743164062,
"logps/rejected": -208.6210174560547,
"loss": 0.445,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -0.18316276371479034,
"rewards/margins": 0.9890239834785461,
"rewards/rejected": -1.1721866130828857,
"step": 1670
},
{
"epoch": 0.8063354931605472,
"grad_norm": 46.0,
"learning_rate": 5.491598405681559e-07,
"logits/chosen": -2.673280954360962,
"logits/rejected": -2.498465061187744,
"logps/chosen": -247.2686767578125,
"logps/rejected": -202.4930419921875,
"loss": 0.4942,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.17769768834114075,
"rewards/margins": 0.8722941279411316,
"rewards/rejected": -1.0499918460845947,
"step": 1680
},
{
"epoch": 0.8111351091912647,
"grad_norm": 35.25,
"learning_rate": 5.232261757814924e-07,
"logits/chosen": -2.571895122528076,
"logits/rejected": -2.438910722732544,
"logps/chosen": -241.5240936279297,
"logps/rejected": -213.72073364257812,
"loss": 0.4317,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -0.1457543671131134,
"rewards/margins": 1.0501019954681396,
"rewards/rejected": -1.1958563327789307,
"step": 1690
},
{
"epoch": 0.8159347252219823,
"grad_norm": 50.0,
"learning_rate": 4.978480408234465e-07,
"logits/chosen": -2.5237784385681152,
"logits/rejected": -2.4956631660461426,
"logps/chosen": -216.0598602294922,
"logps/rejected": -214.1880340576172,
"loss": 0.5087,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.20147447288036346,
"rewards/margins": 0.8618594408035278,
"rewards/rejected": -1.0633338689804077,
"step": 1700
},
{
"epoch": 0.8207343412526998,
"grad_norm": 51.75,
"learning_rate": 4.73032567673809e-07,
"logits/chosen": -2.6058714389801025,
"logits/rejected": -2.504986047744751,
"logps/chosen": -230.72103881835938,
"logps/rejected": -206.2929229736328,
"loss": 0.4769,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.30534833669662476,
"rewards/margins": 0.8801034092903137,
"rewards/rejected": -1.1854515075683594,
"step": 1710
},
{
"epoch": 0.8255339572834173,
"grad_norm": 46.75,
"learning_rate": 4.487867301883528e-07,
"logits/chosen": -2.5329365730285645,
"logits/rejected": -2.43863844871521,
"logps/chosen": -225.4794921875,
"logps/rejected": -214.2443389892578,
"loss": 0.5013,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.2076941281557083,
"rewards/margins": 0.8862035870552063,
"rewards/rejected": -1.0938977003097534,
"step": 1720
},
{
"epoch": 0.8303335733141348,
"grad_norm": 42.25,
"learning_rate": 4.2511734213898093e-07,
"logits/chosen": -2.64858341217041,
"logits/rejected": -2.5094025135040283,
"logps/chosen": -254.30532836914062,
"logps/rejected": -218.06900024414062,
"loss": 0.5122,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.28009095788002014,
"rewards/margins": 0.8344430923461914,
"rewards/rejected": -1.1145341396331787,
"step": 1730
},
{
"epoch": 0.8351331893448524,
"grad_norm": 69.5,
"learning_rate": 4.020310552988632e-07,
"logits/chosen": -2.620943546295166,
"logits/rejected": -2.456711530685425,
"logps/chosen": -250.24374389648438,
"logps/rejected": -217.15219116210938,
"loss": 0.4241,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.21671004593372345,
"rewards/margins": 1.0417428016662598,
"rewards/rejected": -1.2584527730941772,
"step": 1740
},
{
"epoch": 0.8399328053755699,
"grad_norm": 51.5,
"learning_rate": 3.7953435757309756e-07,
"logits/chosen": -2.628960132598877,
"logits/rejected": -2.504699468612671,
"logps/chosen": -253.9635467529297,
"logps/rejected": -227.1519317626953,
"loss": 0.5159,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.17710088193416595,
"rewards/margins": 0.8642765879631042,
"rewards/rejected": -1.0413774251937866,
"step": 1750
},
{
"epoch": 0.8447324214062875,
"grad_norm": 52.25,
"learning_rate": 3.5763357117542364e-07,
"logits/chosen": -2.6271114349365234,
"logits/rejected": -2.5810532569885254,
"logps/chosen": -237.49758911132812,
"logps/rejected": -229.40185546875,
"loss": 0.4646,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2700980305671692,
"rewards/margins": 1.0334694385528564,
"rewards/rejected": -1.3035674095153809,
"step": 1760
},
{
"epoch": 0.849532037437005,
"grad_norm": 63.75,
"learning_rate": 3.363348508515016e-07,
"logits/chosen": -2.6458356380462646,
"logits/rejected": -2.519512176513672,
"logps/chosen": -238.7860565185547,
"logps/rejected": -220.2080078125,
"loss": 0.5092,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.2170904129743576,
"rewards/margins": 0.8388406038284302,
"rewards/rejected": -1.0559309720993042,
"step": 1770
},
{
"epoch": 0.8543316534677226,
"grad_norm": 45.25,
"learning_rate": 3.156441821492506e-07,
"logits/chosen": -2.602996349334717,
"logits/rejected": -2.490618944168091,
"logps/chosen": -237.382568359375,
"logps/rejected": -219.4293975830078,
"loss": 0.4473,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.2120467871427536,
"rewards/margins": 1.101704478263855,
"rewards/rejected": -1.313751220703125,
"step": 1780
},
{
"epoch": 0.8591312694984401,
"grad_norm": 38.75,
"learning_rate": 2.9556737973674117e-07,
"logits/chosen": -2.609147310256958,
"logits/rejected": -2.473086357116699,
"logps/chosen": -242.01962280273438,
"logps/rejected": -202.3125762939453,
"loss": 0.4658,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.25016945600509644,
"rewards/margins": 1.0174734592437744,
"rewards/rejected": -1.267642855644226,
"step": 1790
},
{
"epoch": 0.8639308855291576,
"grad_norm": 51.25,
"learning_rate": 2.761100857681068e-07,
"logits/chosen": -2.625156879425049,
"logits/rejected": -2.513139486312866,
"logps/chosen": -219.42385864257812,
"logps/rejected": -200.65432739257812,
"loss": 0.5114,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.38931506872177124,
"rewards/margins": 0.8760172724723816,
"rewards/rejected": -1.2653322219848633,
"step": 1800
},
{
"epoch": 0.8639308855291576,
"eval_logits/chosen": -2.6100218296051025,
"eval_logits/rejected": -2.4987967014312744,
"eval_logps/chosen": -234.25213623046875,
"eval_logps/rejected": -217.06610107421875,
"eval_loss": 0.44831663370132446,
"eval_rewards/accuracies": 0.7990000247955322,
"eval_rewards/chosen": -0.11191659420728683,
"eval_rewards/margins": 1.0416151285171509,
"eval_rewards/rejected": -1.1535316705703735,
"eval_runtime": 31.9766,
"eval_samples_per_second": 31.273,
"eval_steps_per_second": 7.818,
"step": 1800
},
{
"epoch": 0.8687305015598752,
"grad_norm": 40.75,
"learning_rate": 2.5727776829793774e-07,
"logits/chosen": -2.6402649879455566,
"logits/rejected": -2.48222017288208,
"logps/chosen": -238.41708374023438,
"logps/rejected": -186.76150512695312,
"loss": 0.4385,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.12383349239826202,
"rewards/margins": 1.0703352689743042,
"rewards/rejected": -1.1941686868667603,
"step": 1810
},
{
"epoch": 0.8735301175905927,
"grad_norm": 64.5,
"learning_rate": 2.3907571974460255e-07,
"logits/chosen": -2.614182949066162,
"logits/rejected": -2.4683122634887695,
"logps/chosen": -242.4246063232422,
"logps/rejected": -199.64266967773438,
"loss": 0.4524,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.18747808039188385,
"rewards/margins": 0.9742029905319214,
"rewards/rejected": -1.1616809368133545,
"step": 1820
},
{
"epoch": 0.8783297336213103,
"grad_norm": 52.5,
"learning_rate": 2.2150905540292589e-07,
"logits/chosen": -2.6297433376312256,
"logits/rejected": -2.4974730014801025,
"logps/chosen": -229.2626953125,
"logps/rejected": -213.91567993164062,
"loss": 0.4491,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.12955379486083984,
"rewards/margins": 0.960136890411377,
"rewards/rejected": -1.0896905660629272,
"step": 1830
},
{
"epoch": 0.8831293496520278,
"grad_norm": 39.5,
"learning_rate": 2.0458271200664626e-07,
"logits/chosen": -2.5526020526885986,
"logits/rejected": -2.517185688018799,
"logps/chosen": -214.2257843017578,
"logps/rejected": -210.78115844726562,
"loss": 0.4151,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.33652427792549133,
"rewards/margins": 1.0866222381591797,
"rewards/rejected": -1.4231464862823486,
"step": 1840
},
{
"epoch": 0.8879289656827454,
"grad_norm": 32.5,
"learning_rate": 1.8830144634105206e-07,
"logits/chosen": -2.5996253490448,
"logits/rejected": -2.453141927719116,
"logps/chosen": -247.02804565429688,
"logps/rejected": -199.14678955078125,
"loss": 0.4273,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.08400492370128632,
"rewards/margins": 1.1507220268249512,
"rewards/rejected": -1.234726905822754,
"step": 1850
},
{
"epoch": 0.8927285817134629,
"grad_norm": 42.25,
"learning_rate": 1.7266983390618997e-07,
"logits/chosen": -2.5644283294677734,
"logits/rejected": -2.453029155731201,
"logps/chosen": -228.55294799804688,
"logps/rejected": -198.43435668945312,
"loss": 0.4349,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.06119618937373161,
"rewards/margins": 1.1090104579925537,
"rewards/rejected": -1.1702066659927368,
"step": 1860
},
{
"epoch": 0.8975281977441805,
"grad_norm": 49.0,
"learning_rate": 1.5769226763101887e-07,
"logits/chosen": -2.5068180561065674,
"logits/rejected": -2.47198748588562,
"logps/chosen": -231.07266235351562,
"logps/rejected": -219.62387084960938,
"loss": 0.4895,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.2498408854007721,
"rewards/margins": 0.9979842901229858,
"rewards/rejected": -1.2478251457214355,
"step": 1870
},
{
"epoch": 0.902327813774898,
"grad_norm": 54.0,
"learning_rate": 1.4337295663887086e-07,
"logits/chosen": -2.659472942352295,
"logits/rejected": -2.517972946166992,
"logps/chosen": -239.847900390625,
"logps/rejected": -198.91372680664062,
"loss": 0.4644,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.11448581516742706,
"rewards/margins": 1.0199806690216064,
"rewards/rejected": -1.134466528892517,
"step": 1880
},
{
"epoch": 0.9071274298056156,
"grad_norm": 63.0,
"learning_rate": 1.2971592506456799e-07,
"logits/chosen": -2.5675461292266846,
"logits/rejected": -2.4976906776428223,
"logps/chosen": -206.1525421142578,
"logps/rejected": -200.3158721923828,
"loss": 0.4568,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.2682192623615265,
"rewards/margins": 1.0316725969314575,
"rewards/rejected": -1.2998919486999512,
"step": 1890
},
{
"epoch": 0.9119270458363331,
"grad_norm": 41.25,
"learning_rate": 1.1672501092352545e-07,
"logits/chosen": -2.613196611404419,
"logits/rejected": -2.4825501441955566,
"logps/chosen": -241.4010772705078,
"logps/rejected": -214.26339721679688,
"loss": 0.4681,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.11850067228078842,
"rewards/margins": 0.9811543226242065,
"rewards/rejected": -1.0996549129486084,
"step": 1900
},
{
"epoch": 0.9167266618670507,
"grad_norm": 56.0,
"learning_rate": 1.0440386503315969e-07,
"logits/chosen": -2.5355827808380127,
"logits/rejected": -2.4754626750946045,
"logps/chosen": -224.82363891601562,
"logps/rejected": -254.0577850341797,
"loss": 0.4594,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.16619113087654114,
"rewards/margins": 1.0043249130249023,
"rewards/rejected": -1.1705158948898315,
"step": 1910
},
{
"epoch": 0.9215262778977682,
"grad_norm": 47.0,
"learning_rate": 9.275594998690574e-08,
"logits/chosen": -2.5817465782165527,
"logits/rejected": -2.4193766117095947,
"logps/chosen": -251.2194061279297,
"logps/rejected": -203.05404663085938,
"loss": 0.4609,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.17528951168060303,
"rewards/margins": 0.9491412043571472,
"rewards/rejected": -1.1244306564331055,
"step": 1920
},
{
"epoch": 0.9263258939284857,
"grad_norm": 44.75,
"learning_rate": 8.178453918112783e-08,
"logits/chosen": -2.5946240425109863,
"logits/rejected": -2.4726099967956543,
"logps/chosen": -223.893798828125,
"logps/rejected": -191.31539916992188,
"loss": 0.4227,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.033659275621175766,
"rewards/margins": 1.185791015625,
"rewards/rejected": -1.2194502353668213,
"step": 1930
},
{
"epoch": 0.9311255099592033,
"grad_norm": 72.0,
"learning_rate": 7.149271589520167e-08,
"logits/chosen": -2.551396131515503,
"logits/rejected": -2.4351983070373535,
"logps/chosen": -212.9639129638672,
"logps/rejected": -204.0959930419922,
"loss": 0.4853,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.29301124811172485,
"rewards/margins": 0.9584601521492004,
"rewards/rejected": -1.2514712810516357,
"step": 1940
},
{
"epoch": 0.9359251259899208,
"grad_norm": 42.5,
"learning_rate": 6.188337242502784e-08,
"logits/chosen": -2.592571496963501,
"logits/rejected": -2.4578793048858643,
"logps/chosen": -238.7187042236328,
"logps/rejected": -204.2020263671875,
"loss": 0.4352,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.20334915816783905,
"rewards/margins": 1.0652612447738647,
"rewards/rejected": -1.2686102390289307,
"step": 1950
},
{
"epoch": 0.9407247420206384,
"grad_norm": 34.25,
"learning_rate": 5.295920927021109e-08,
"logits/chosen": -2.6256229877471924,
"logits/rejected": -2.5241026878356934,
"logps/chosen": -236.45401000976562,
"logps/rejected": -210.2203369140625,
"loss": 0.4396,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.19231542944908142,
"rewards/margins": 1.1057020425796509,
"rewards/rejected": -1.2980175018310547,
"step": 1960
},
{
"epoch": 0.9455243580513559,
"grad_norm": 44.5,
"learning_rate": 4.472273437514357e-08,
"logits/chosen": -2.644946336746216,
"logits/rejected": -2.526045083999634,
"logps/chosen": -258.8535461425781,
"logps/rejected": -222.54019165039062,
"loss": 0.4178,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -0.06698047369718552,
"rewards/margins": 1.1277925968170166,
"rewards/rejected": -1.1947730779647827,
"step": 1970
},
{
"epoch": 0.9503239740820735,
"grad_norm": 36.0,
"learning_rate": 3.717626242420252e-08,
"logits/chosen": -2.60685658454895,
"logits/rejected": -2.5159687995910645,
"logps/chosen": -223.88827514648438,
"logps/rejected": -212.7635498046875,
"loss": 0.5017,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.22438673675060272,
"rewards/margins": 0.8267514109611511,
"rewards/rejected": -1.0511382818222046,
"step": 1980
},
{
"epoch": 0.955123590112791,
"grad_norm": 43.75,
"learning_rate": 3.03219141912553e-08,
"logits/chosen": -2.586005687713623,
"logits/rejected": -2.484550714492798,
"logps/chosen": -244.2949676513672,
"logps/rejected": -225.6749725341797,
"loss": 0.4699,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.22158965468406677,
"rewards/margins": 0.8752668499946594,
"rewards/rejected": -1.0968565940856934,
"step": 1990
},
{
"epoch": 0.9599232061435086,
"grad_norm": 49.75,
"learning_rate": 2.4161615943664174e-08,
"logits/chosen": -2.6366090774536133,
"logits/rejected": -2.5460915565490723,
"logps/chosen": -219.8328094482422,
"logps/rejected": -214.83737182617188,
"loss": 0.4763,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.11922893673181534,
"rewards/margins": 0.9408013224601746,
"rewards/rejected": -1.060030221939087,
"step": 2000
},
{
"epoch": 0.9599232061435086,
"eval_logits/chosen": -2.610217571258545,
"eval_logits/rejected": -2.4989378452301025,
"eval_logps/chosen": -234.27659606933594,
"eval_logps/rejected": -217.0829620361328,
"eval_loss": 0.4485087990760803,
"eval_rewards/accuracies": 0.7950000166893005,
"eval_rewards/chosen": -0.11436203867197037,
"eval_rewards/margins": 1.0408560037612915,
"eval_rewards/rejected": -1.1552180051803589,
"eval_runtime": 21.4702,
"eval_samples_per_second": 46.576,
"eval_steps_per_second": 11.644,
"step": 2000
},
{
"epoch": 0.9647228221742261,
"grad_norm": 34.5,
"learning_rate": 1.8697098900948285e-08,
"logits/chosen": -2.5669498443603516,
"logits/rejected": -2.4752142429351807,
"logps/chosen": -227.6999053955078,
"logps/rejected": -230.0476531982422,
"loss": 0.435,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.15688031911849976,
"rewards/margins": 1.0234278440475464,
"rewards/rejected": -1.180308222770691,
"step": 2010
},
{
"epoch": 0.9695224382049437,
"grad_norm": 48.25,
"learning_rate": 1.392989874826195e-08,
"logits/chosen": -2.6464297771453857,
"logits/rejected": -2.513470411300659,
"logps/chosen": -231.412353515625,
"logps/rejected": -216.2265167236328,
"loss": 0.5319,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.2918005585670471,
"rewards/margins": 0.7504169940948486,
"rewards/rejected": -1.042217493057251,
"step": 2020
},
{
"epoch": 0.9743220542356611,
"grad_norm": 52.5,
"learning_rate": 9.861355204825173e-09,
"logits/chosen": -2.6138572692871094,
"logits/rejected": -2.4997787475585938,
"logps/chosen": -259.1074523925781,
"logps/rejected": -211.08761596679688,
"loss": 0.4733,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.19954562187194824,
"rewards/margins": 1.0070618391036987,
"rewards/rejected": -1.206607460975647,
"step": 2030
},
{
"epoch": 0.9791216702663786,
"grad_norm": 36.5,
"learning_rate": 6.492611647420932e-09,
"logits/chosen": -2.594069004058838,
"logits/rejected": -2.459033250808716,
"logps/chosen": -230.0123291015625,
"logps/rejected": -206.733642578125,
"loss": 0.4503,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.22017189860343933,
"rewards/margins": 1.0728034973144531,
"rewards/rejected": -1.2929753065109253,
"step": 2040
},
{
"epoch": 0.9839212862970962,
"grad_norm": 51.25,
"learning_rate": 3.8246147890763645e-09,
"logits/chosen": -2.643317699432373,
"logits/rejected": -2.504284143447876,
"logps/chosen": -246.441162109375,
"logps/rejected": -210.845458984375,
"loss": 0.494,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.2079886943101883,
"rewards/margins": 0.7775675058364868,
"rewards/rejected": -0.9855562448501587,
"step": 2050
},
{
"epoch": 0.9887209023278137,
"grad_norm": 38.0,
"learning_rate": 1.8581144130089269e-09,
"logits/chosen": -2.567206859588623,
"logits/rejected": -2.506730794906616,
"logps/chosen": -222.69644165039062,
"logps/rejected": -218.9879913330078,
"loss": 0.4439,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1257813274860382,
"rewards/margins": 1.1031692028045654,
"rewards/rejected": -1.2289507389068604,
"step": 2060
},
{
"epoch": 0.9935205183585313,
"grad_norm": 53.25,
"learning_rate": 5.936631619152256e-10,
"logits/chosen": -2.615421772003174,
"logits/rejected": -2.54068660736084,
"logps/chosen": -233.13818359375,
"logps/rejected": -214.350341796875,
"loss": 0.4651,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.03799530863761902,
"rewards/margins": 1.0076466798782349,
"rewards/rejected": -1.0456420183181763,
"step": 2070
},
{
"epoch": 0.9983201343892488,
"grad_norm": 55.75,
"learning_rate": 3.161638266302447e-11,
"logits/chosen": -2.651322841644287,
"logits/rejected": -2.516923666000366,
"logps/chosen": -235.73876953125,
"logps/rejected": -220.57421875,
"loss": 0.47,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.25053197145462036,
"rewards/margins": 0.9338071942329407,
"rewards/rejected": -1.184339165687561,
"step": 2080
},
{
"epoch": 0.9997600191984641,
"step": 2083,
"total_flos": 0.0,
"train_loss": 0.49830314429746014,
"train_runtime": 2918.5416,
"train_samples_per_second": 11.421,
"train_steps_per_second": 0.714
}
],
"logging_steps": 10,
"max_steps": 2083,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}