Files
tinyllama-1.1b-dpo-pku-safe…/trainer_state.json
ModelHub XC 096673a501 初始化项目,由ModelHub XC社区提供模型
Model: AIPlans/tinyllama-1.1b-dpo-pku-saferlhf
Source: Original Platform
2026-06-11 00:38:47 +08:00

3323 lines
113 KiB
JSON

{
"best_metric": 0.675000011920929,
"best_model_checkpoint": "./outputs/tinyllama-1.1b-dpo-pku-saferlhf/checkpoint-1200",
"epoch": 0.9997600191984641,
"eval_steps": 200,
"global_step": 2083,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004799616030717543,
"grad_norm": 57.0,
"learning_rate": 2.3923444976076555e-08,
"logits/chosen": -2.688718318939209,
"logits/rejected": -2.5538744926452637,
"logps/chosen": -212.6398162841797,
"logps/rejected": -186.61505126953125,
"loss": 0.6966,
"rewards/accuracies": 0.35624998807907104,
"rewards/chosen": -0.005152043420821428,
"rewards/margins": -0.006588623858988285,
"rewards/rejected": 0.0014365792740136385,
"step": 10
},
{
"epoch": 0.009599232061435085,
"grad_norm": 59.75,
"learning_rate": 4.784688995215311e-08,
"logits/chosen": -2.728940486907959,
"logits/rejected": -2.616565227508545,
"logps/chosen": -223.5636749267578,
"logps/rejected": -203.41867065429688,
"loss": 0.6899,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.002993463072925806,
"rewards/margins": 0.006973244249820709,
"rewards/rejected": -0.003979781176894903,
"step": 20
},
{
"epoch": 0.014398848092152628,
"grad_norm": 50.75,
"learning_rate": 7.177033492822967e-08,
"logits/chosen": -2.716870069503784,
"logits/rejected": -2.6400887966156006,
"logps/chosen": -237.99618530273438,
"logps/rejected": -219.1649627685547,
"loss": 0.6938,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.0003619740600697696,
"rewards/margins": -0.0006011867080815136,
"rewards/rejected": 0.00023921244428493083,
"step": 30
},
{
"epoch": 0.01919846412287017,
"grad_norm": 57.25,
"learning_rate": 9.569377990430622e-08,
"logits/chosen": -2.7444612979888916,
"logits/rejected": -2.5919036865234375,
"logps/chosen": -251.178466796875,
"logps/rejected": -196.35256958007812,
"loss": 0.6926,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0019455172587186098,
"rewards/margins": 0.001769614638760686,
"rewards/rejected": -0.0037151314318180084,
"step": 40
},
{
"epoch": 0.023998080153587713,
"grad_norm": 48.25,
"learning_rate": 1.1961722488038278e-07,
"logits/chosen": -2.6667444705963135,
"logits/rejected": -2.6048452854156494,
"logps/chosen": -234.8287353515625,
"logps/rejected": -199.68902587890625,
"loss": 0.6949,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.0009633477893657982,
"rewards/margins": -0.002945653162896633,
"rewards/rejected": 0.001982305431738496,
"step": 50
},
{
"epoch": 0.028797696184305256,
"grad_norm": 63.5,
"learning_rate": 1.4354066985645933e-07,
"logits/chosen": -2.705540180206299,
"logits/rejected": -2.599553346633911,
"logps/chosen": -223.2880859375,
"logps/rejected": -215.03759765625,
"loss": 0.6932,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.002534763887524605,
"rewards/margins": 0.00042680976912379265,
"rewards/rejected": -0.002961573889479041,
"step": 60
},
{
"epoch": 0.033597312215022795,
"grad_norm": 51.25,
"learning_rate": 1.6746411483253589e-07,
"logits/chosen": -2.726954698562622,
"logits/rejected": -2.5858073234558105,
"logps/chosen": -245.69790649414062,
"logps/rejected": -205.90469360351562,
"loss": 0.6925,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.001567687257193029,
"rewards/margins": 0.0018620832124724984,
"rewards/rejected": -0.0034297697711735964,
"step": 70
},
{
"epoch": 0.03839692824574034,
"grad_norm": 52.25,
"learning_rate": 1.9138755980861244e-07,
"logits/chosen": -2.72399640083313,
"logits/rejected": -2.6148579120635986,
"logps/chosen": -235.31991577148438,
"logps/rejected": -201.12049865722656,
"loss": 0.6933,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.0001770513626979664,
"rewards/margins": 0.0003844931779894978,
"rewards/rejected": -0.00020744054927490652,
"step": 80
},
{
"epoch": 0.04319654427645788,
"grad_norm": 53.0,
"learning_rate": 2.15311004784689e-07,
"logits/chosen": -2.7387874126434326,
"logits/rejected": -2.5575790405273438,
"logps/chosen": -251.2541961669922,
"logps/rejected": -189.22152709960938,
"loss": 0.6945,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.001186860492452979,
"rewards/margins": -0.002072554547339678,
"rewards/rejected": 0.0008856941130943596,
"step": 90
},
{
"epoch": 0.04799616030717543,
"grad_norm": 50.25,
"learning_rate": 2.3923444976076555e-07,
"logits/chosen": -2.707674503326416,
"logits/rejected": -2.5784828662872314,
"logps/chosen": -228.6029510498047,
"logps/rejected": -206.3776092529297,
"loss": 0.6919,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.0017306295922026038,
"rewards/margins": 0.0031413964461535215,
"rewards/rejected": -0.001410767319612205,
"step": 100
},
{
"epoch": 0.052795776337892966,
"grad_norm": 52.25,
"learning_rate": 2.631578947368421e-07,
"logits/chosen": -2.7312660217285156,
"logits/rejected": -2.5520169734954834,
"logps/chosen": -237.2713165283203,
"logps/rejected": -178.83563232421875,
"loss": 0.6954,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": 8.01489659352228e-05,
"rewards/margins": -0.004063433036208153,
"rewards/rejected": 0.004143581725656986,
"step": 110
},
{
"epoch": 0.05759539236861051,
"grad_norm": 54.75,
"learning_rate": 2.8708133971291866e-07,
"logits/chosen": -2.7264270782470703,
"logits/rejected": -2.602839946746826,
"logps/chosen": -234.1689910888672,
"logps/rejected": -204.75241088867188,
"loss": 0.6945,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.0038609784096479416,
"rewards/margins": -0.0021540005691349506,
"rewards/rejected": -0.0017069776076823473,
"step": 120
},
{
"epoch": 0.06239500839932805,
"grad_norm": 60.25,
"learning_rate": 3.110047846889952e-07,
"logits/chosen": -2.71527361869812,
"logits/rejected": -2.5875349044799805,
"logps/chosen": -252.4326934814453,
"logps/rejected": -210.2029266357422,
"loss": 0.6909,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.0007449972326867282,
"rewards/margins": 0.005101869348436594,
"rewards/rejected": -0.004356871824711561,
"step": 130
},
{
"epoch": 0.06719462443004559,
"grad_norm": 49.75,
"learning_rate": 3.3492822966507177e-07,
"logits/chosen": -2.6849746704101562,
"logits/rejected": -2.6195476055145264,
"logps/chosen": -235.63906860351562,
"logps/rejected": -218.46603393554688,
"loss": 0.6935,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.0014313453575596213,
"rewards/margins": -0.00015073138638399541,
"rewards/rejected": 0.0015820765402168036,
"step": 140
},
{
"epoch": 0.07199424046076314,
"grad_norm": 51.25,
"learning_rate": 3.588516746411483e-07,
"logits/chosen": -2.702357769012451,
"logits/rejected": -2.6205286979675293,
"logps/chosen": -232.359619140625,
"logps/rejected": -245.7252197265625,
"loss": 0.6947,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.002868877723813057,
"rewards/margins": -0.0025208499282598495,
"rewards/rejected": 0.005389728117734194,
"step": 150
},
{
"epoch": 0.07679385649148068,
"grad_norm": 69.0,
"learning_rate": 3.827751196172249e-07,
"logits/chosen": -2.6653263568878174,
"logits/rejected": -2.5433461666107178,
"logps/chosen": -243.0439453125,
"logps/rejected": -200.38685607910156,
"loss": 0.6959,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.004858389962464571,
"rewards/margins": -0.004746051970869303,
"rewards/rejected": -0.00011233799159526825,
"step": 160
},
{
"epoch": 0.08159347252219823,
"grad_norm": 54.0,
"learning_rate": 4.066985645933014e-07,
"logits/chosen": -2.7417194843292236,
"logits/rejected": -2.5782225131988525,
"logps/chosen": -256.54278564453125,
"logps/rejected": -199.0166473388672,
"loss": 0.6918,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.0022998370695859194,
"rewards/margins": 0.003148593008518219,
"rewards/rejected": -0.0008487561717629433,
"step": 170
},
{
"epoch": 0.08639308855291576,
"grad_norm": 53.5,
"learning_rate": 4.30622009569378e-07,
"logits/chosen": -2.7641491889953613,
"logits/rejected": -2.6625149250030518,
"logps/chosen": -242.5579071044922,
"logps/rejected": -201.333251953125,
"loss": 0.6908,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.000439296942204237,
"rewards/margins": 0.00535095389932394,
"rewards/rejected": -0.005790251307189465,
"step": 180
},
{
"epoch": 0.09119270458363331,
"grad_norm": 56.0,
"learning_rate": 4.545454545454545e-07,
"logits/chosen": -2.691822052001953,
"logits/rejected": -2.587761640548706,
"logps/chosen": -240.88916015625,
"logps/rejected": -198.99119567871094,
"loss": 0.6915,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.002601384650915861,
"rewards/margins": 0.003762087319046259,
"rewards/rejected": -0.0011607027845457196,
"step": 190
},
{
"epoch": 0.09599232061435085,
"grad_norm": 45.25,
"learning_rate": 4.784688995215311e-07,
"logits/chosen": -2.7268319129943848,
"logits/rejected": -2.6072897911071777,
"logps/chosen": -228.60128784179688,
"logps/rejected": -190.16201782226562,
"loss": 0.6899,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0014531847555190325,
"rewards/margins": 0.00695295725017786,
"rewards/rejected": -0.005499773193150759,
"step": 200
},
{
"epoch": 0.09599232061435085,
"eval_logits/chosen": -2.7156012058258057,
"eval_logits/rejected": -2.595405340194702,
"eval_logps/chosen": -233.07533264160156,
"eval_logps/rejected": -205.5474853515625,
"eval_loss": 0.6897569298744202,
"eval_rewards/accuracies": 0.5680000185966492,
"eval_rewards/chosen": 0.005765980575233698,
"eval_rewards/margins": 0.007436447311192751,
"eval_rewards/rejected": -0.0016704658046364784,
"eval_runtime": 21.4199,
"eval_samples_per_second": 46.686,
"eval_steps_per_second": 11.671,
"step": 200
},
{
"epoch": 0.1007919366450684,
"grad_norm": 50.0,
"learning_rate": 4.999996487062011e-07,
"logits/chosen": -2.6748883724212646,
"logits/rejected": -2.5881714820861816,
"logps/chosen": -242.6162872314453,
"logps/rejected": -214.3368377685547,
"loss": 0.6931,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.0026280046440660954,
"rewards/margins": 0.000607747002504766,
"rewards/rejected": 0.002020257292315364,
"step": 210
},
{
"epoch": 0.10559155267578593,
"grad_norm": 49.5,
"learning_rate": 4.999574946449064e-07,
"logits/chosen": -2.7096612453460693,
"logits/rejected": -2.5823137760162354,
"logps/chosen": -226.0032196044922,
"logps/rejected": -185.87234497070312,
"loss": 0.6908,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.007502657826989889,
"rewards/margins": 0.005469072610139847,
"rewards/rejected": 0.002033584751188755,
"step": 220
},
{
"epoch": 0.11039116870650348,
"grad_norm": 53.25,
"learning_rate": 4.998450953980164e-07,
"logits/chosen": -2.674795389175415,
"logits/rejected": -2.562544345855713,
"logps/chosen": -231.25247192382812,
"logps/rejected": -223.4167938232422,
"loss": 0.6922,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.005688765086233616,
"rewards/margins": 0.0028063193894922733,
"rewards/rejected": 0.0028824463952332735,
"step": 230
},
{
"epoch": 0.11519078473722102,
"grad_norm": 49.5,
"learning_rate": 4.996624825529257e-07,
"logits/chosen": -2.752612590789795,
"logits/rejected": -2.641317367553711,
"logps/chosen": -216.5712432861328,
"logps/rejected": -192.3323211669922,
"loss": 0.69,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.01013671699911356,
"rewards/margins": 0.006918230559676886,
"rewards/rejected": 0.003218486439436674,
"step": 240
},
{
"epoch": 0.11999040076793857,
"grad_norm": 51.5,
"learning_rate": 4.994097074290524e-07,
"logits/chosen": -2.7131876945495605,
"logits/rejected": -2.591782331466675,
"logps/chosen": -228.76925659179688,
"logps/rejected": -200.34799194335938,
"loss": 0.6902,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.00667478796094656,
"rewards/margins": 0.00645422050729394,
"rewards/rejected": 0.00022056761372368783,
"step": 250
},
{
"epoch": 0.1247900167986561,
"grad_norm": 53.0,
"learning_rate": 4.990868410634162e-07,
"logits/chosen": -2.7187414169311523,
"logits/rejected": -2.6327602863311768,
"logps/chosen": -225.66043090820312,
"logps/rejected": -192.34942626953125,
"loss": 0.6881,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.009334566071629524,
"rewards/margins": 0.010765586979687214,
"rewards/rejected": -0.0014310205588117242,
"step": 260
},
{
"epoch": 0.12958963282937366,
"grad_norm": 55.0,
"learning_rate": 4.986939741906753e-07,
"logits/chosen": -2.7310328483581543,
"logits/rejected": -2.6244540214538574,
"logps/chosen": -214.0838623046875,
"logps/rejected": -191.58615112304688,
"loss": 0.685,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.012120475992560387,
"rewards/margins": 0.016924794763326645,
"rewards/rejected": -0.004804318305104971,
"step": 270
},
{
"epoch": 0.13438924886009118,
"grad_norm": 61.75,
"learning_rate": 4.982312172176264e-07,
"logits/chosen": -2.7920923233032227,
"logits/rejected": -2.5907645225524902,
"logps/chosen": -273.3122253417969,
"logps/rejected": -205.17733764648438,
"loss": 0.6891,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.02231917902827263,
"rewards/margins": 0.008881422691047192,
"rewards/rejected": 0.013437752611935139,
"step": 280
},
{
"epoch": 0.13918886489080873,
"grad_norm": 57.25,
"learning_rate": 4.976987001921786e-07,
"logits/chosen": -2.710538625717163,
"logits/rejected": -2.5878021717071533,
"logps/chosen": -235.57437133789062,
"logps/rejected": -204.8675537109375,
"loss": 0.6868,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.013490339741110802,
"rewards/margins": 0.01348956674337387,
"rewards/rejected": 7.734633982181549e-07,
"step": 290
},
{
"epoch": 0.14398848092152627,
"grad_norm": 52.75,
"learning_rate": 4.97096572766805e-07,
"logits/chosen": -2.727212905883789,
"logits/rejected": -2.582718849182129,
"logps/chosen": -240.856201171875,
"logps/rejected": -190.96470642089844,
"loss": 0.6864,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.01539058517664671,
"rewards/margins": 0.014497148804366589,
"rewards/rejected": 0.0008934367215260863,
"step": 300
},
{
"epoch": 0.14878809695224382,
"grad_norm": 49.0,
"learning_rate": 4.964250041564868e-07,
"logits/chosen": -2.7062602043151855,
"logits/rejected": -2.5759198665618896,
"logps/chosen": -232.9503173828125,
"logps/rejected": -198.68533325195312,
"loss": 0.6863,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.010409911163151264,
"rewards/margins": 0.01460896898061037,
"rewards/rejected": -0.0041990578174591064,
"step": 310
},
{
"epoch": 0.15358771298296137,
"grad_norm": 48.0,
"learning_rate": 4.956841830911587e-07,
"logits/chosen": -2.688969850540161,
"logits/rejected": -2.5635781288146973,
"logps/chosen": -244.30337524414062,
"logps/rejected": -198.82345581054688,
"loss": 0.6871,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.01625187136232853,
"rewards/margins": 0.013046267442405224,
"rewards/rejected": 0.0032056006602942944,
"step": 320
},
{
"epoch": 0.1583873290136789,
"grad_norm": 56.25,
"learning_rate": 4.948743177626708e-07,
"logits/chosen": -2.708862543106079,
"logits/rejected": -2.5964908599853516,
"logps/chosen": -218.1621856689453,
"logps/rejected": -196.46609497070312,
"loss": 0.691,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.015645433217287064,
"rewards/margins": 0.005278537981212139,
"rewards/rejected": 0.010366896167397499,
"step": 330
},
{
"epoch": 0.16318694504439646,
"grad_norm": 53.5,
"learning_rate": 4.939956357662805e-07,
"logits/chosen": -2.664097309112549,
"logits/rejected": -2.504223108291626,
"logps/chosen": -233.2242431640625,
"logps/rejected": -178.89642333984375,
"loss": 0.6869,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.010349711403250694,
"rewards/margins": 0.013576941564679146,
"rewards/rejected": -0.0032272294629365206,
"step": 340
},
{
"epoch": 0.16798656107511398,
"grad_norm": 53.25,
"learning_rate": 4.930483840366915e-07,
"logits/chosen": -2.6505606174468994,
"logits/rejected": -2.508861541748047,
"logps/chosen": -253.9485321044922,
"logps/rejected": -195.91908264160156,
"loss": 0.6851,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.024759415537118912,
"rewards/margins": 0.017002228647470474,
"rewards/rejected": 0.0077571868896484375,
"step": 350
},
{
"epoch": 0.17278617710583152,
"grad_norm": 46.75,
"learning_rate": 4.920328287786586e-07,
"logits/chosen": -2.661841869354248,
"logits/rejected": -2.5565028190612793,
"logps/chosen": -229.78305053710938,
"logps/rejected": -194.48904418945312,
"loss": 0.6866,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.021085303276777267,
"rewards/margins": 0.014185063540935516,
"rewards/rejected": 0.006900241132825613,
"step": 360
},
{
"epoch": 0.17758579313654907,
"grad_norm": 45.5,
"learning_rate": 4.90949255392176e-07,
"logits/chosen": -2.70994234085083,
"logits/rejected": -2.547222852706909,
"logps/chosen": -244.82565307617188,
"logps/rejected": -202.6025390625,
"loss": 0.6828,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.022651832550764084,
"rewards/margins": 0.021922901272773743,
"rewards/rejected": 0.0007289334898814559,
"step": 370
},
{
"epoch": 0.18238540916726662,
"grad_norm": 48.75,
"learning_rate": 4.897979683922727e-07,
"logits/chosen": -2.733055591583252,
"logits/rejected": -2.6267523765563965,
"logps/chosen": -218.7659149169922,
"logps/rejected": -180.62173461914062,
"loss": 0.6845,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.01779359206557274,
"rewards/margins": 0.01852073147892952,
"rewards/rejected": -0.0007271372596733272,
"step": 380
},
{
"epoch": 0.18718502519798416,
"grad_norm": 48.75,
"learning_rate": 4.885792913234339e-07,
"logits/chosen": -2.6706607341766357,
"logits/rejected": -2.6117610931396484,
"logps/chosen": -223.50021362304688,
"logps/rejected": -208.219482421875,
"loss": 0.6879,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.025855297222733498,
"rewards/margins": 0.011695639230310917,
"rewards/rejected": 0.014159657061100006,
"step": 390
},
{
"epoch": 0.1919846412287017,
"grad_norm": 49.5,
"learning_rate": 4.872935666686766e-07,
"logits/chosen": -2.6978952884674072,
"logits/rejected": -2.5849671363830566,
"logps/chosen": -233.3878631591797,
"logps/rejected": -212.00723266601562,
"loss": 0.6868,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.025096680968999863,
"rewards/margins": 0.013743218965828419,
"rewards/rejected": 0.01135346107184887,
"step": 400
},
{
"epoch": 0.1919846412287017,
"eval_logits/chosen": -2.7157444953918457,
"eval_logits/rejected": -2.595482587814331,
"eval_logps/chosen": -232.87948608398438,
"eval_logps/rejected": -205.45095825195312,
"eval_loss": 0.6850579977035522,
"eval_rewards/accuracies": 0.6299999952316284,
"eval_rewards/chosen": 0.025349698960781097,
"eval_rewards/margins": 0.017367491498589516,
"eval_rewards/rejected": 0.007982207462191582,
"eval_runtime": 21.4159,
"eval_samples_per_second": 46.694,
"eval_steps_per_second": 11.674,
"step": 400
},
{
"epoch": 0.19678425725941925,
"grad_norm": 50.75,
"learning_rate": 4.859411557533018e-07,
"logits/chosen": -2.7110087871551514,
"logits/rejected": -2.599547863006592,
"logps/chosen": -229.89578247070312,
"logps/rejected": -196.5421905517578,
"loss": 0.685,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.023681003600358963,
"rewards/margins": 0.01766652800142765,
"rewards/rejected": 0.006014474667608738,
"step": 410
},
{
"epoch": 0.2015838732901368,
"grad_norm": 47.75,
"learning_rate": 4.845224386433521e-07,
"logits/chosen": -2.6937224864959717,
"logits/rejected": -2.616425037384033,
"logps/chosen": -207.7985382080078,
"logps/rejected": -210.020751953125,
"loss": 0.6884,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.02355768159031868,
"rewards/margins": 0.01056084968149662,
"rewards/rejected": 0.012996832840144634,
"step": 420
},
{
"epoch": 0.20638348932085432,
"grad_norm": 42.75,
"learning_rate": 4.830378140388015e-07,
"logits/chosen": -2.802743434906006,
"logits/rejected": -2.6532254219055176,
"logps/chosen": -238.71115112304688,
"logps/rejected": -192.9730987548828,
"loss": 0.6816,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.025352749973535538,
"rewards/margins": 0.02443523518741131,
"rewards/rejected": 0.0009175121667794883,
"step": 430
},
{
"epoch": 0.21118310535157186,
"grad_norm": 52.5,
"learning_rate": 4.814876991615104e-07,
"logits/chosen": -2.682868719100952,
"logits/rejected": -2.5881507396698,
"logps/chosen": -226.88131713867188,
"logps/rejected": -197.5751495361328,
"loss": 0.6858,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.024431098252534866,
"rewards/margins": 0.016206270083785057,
"rewards/rejected": 0.008224830962717533,
"step": 440
},
{
"epoch": 0.2159827213822894,
"grad_norm": 55.0,
"learning_rate": 4.798725296379735e-07,
"logits/chosen": -2.711108684539795,
"logits/rejected": -2.626420497894287,
"logps/chosen": -221.24533081054688,
"logps/rejected": -193.32937622070312,
"loss": 0.6844,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.031023338437080383,
"rewards/margins": 0.01875515654683113,
"rewards/rejected": 0.012268180958926678,
"step": 450
},
{
"epoch": 0.22078233741300696,
"grad_norm": 48.5,
"learning_rate": 4.781927593768969e-07,
"logits/chosen": -2.7570741176605225,
"logits/rejected": -2.6272220611572266,
"logps/chosen": -232.4735107421875,
"logps/rejected": -199.38890075683594,
"loss": 0.6839,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.026766937226057053,
"rewards/margins": 0.019803114235401154,
"rewards/rejected": 0.006963823921978474,
"step": 460
},
{
"epoch": 0.2255819534437245,
"grad_norm": 46.5,
"learning_rate": 4.764488604416364e-07,
"logits/chosen": -2.7485814094543457,
"logits/rejected": -2.579071044921875,
"logps/chosen": -257.55096435546875,
"logps/rejected": -215.48782348632812,
"loss": 0.6799,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.03846278041601181,
"rewards/margins": 0.02814416028559208,
"rewards/rejected": 0.010318620130419731,
"step": 470
},
{
"epoch": 0.23038156947444205,
"grad_norm": 49.5,
"learning_rate": 4.7464132291753457e-07,
"logits/chosen": -2.693459987640381,
"logits/rejected": -2.601459503173828,
"logps/chosen": -213.8591766357422,
"logps/rejected": -188.70779418945312,
"loss": 0.6871,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.02118738368153572,
"rewards/margins": 0.013606322929263115,
"rewards/rejected": 0.007581062614917755,
"step": 480
},
{
"epoch": 0.2351811855051596,
"grad_norm": 50.0,
"learning_rate": 4.7277065477419236e-07,
"logits/chosen": -2.6836752891540527,
"logits/rejected": -2.5498709678649902,
"logps/chosen": -229.8844451904297,
"logps/rejected": -181.79513549804688,
"loss": 0.68,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.02958494983613491,
"rewards/margins": 0.027864400297403336,
"rewards/rejected": 0.0017205558251589537,
"step": 490
},
{
"epoch": 0.23998080153587714,
"grad_norm": 50.0,
"learning_rate": 4.7083738172271575e-07,
"logits/chosen": -2.6776702404022217,
"logits/rejected": -2.5478641986846924,
"logps/chosen": -241.678466796875,
"logps/rejected": -201.00245666503906,
"loss": 0.6835,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.03041999600827694,
"rewards/margins": 0.020813625305891037,
"rewards/rejected": 0.009606371633708477,
"step": 500
},
{
"epoch": 0.24478041756659466,
"grad_norm": 47.5,
"learning_rate": 4.6884204706797537e-07,
"logits/chosen": -2.67305850982666,
"logits/rejected": -2.5395994186401367,
"logps/chosen": -246.15304565429688,
"logps/rejected": -191.6173553466797,
"loss": 0.6802,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.03808588907122612,
"rewards/margins": 0.027713218703866005,
"rewards/rejected": 0.010372666642069817,
"step": 510
},
{
"epoch": 0.2495800335973122,
"grad_norm": 56.25,
"learning_rate": 4.6678521155592266e-07,
"logits/chosen": -2.715430498123169,
"logits/rejected": -2.5766196250915527,
"logps/chosen": -257.3648681640625,
"logps/rejected": -218.2578125,
"loss": 0.6819,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.03974943235516548,
"rewards/margins": 0.0246734581887722,
"rewards/rejected": 0.01507597416639328,
"step": 520
},
{
"epoch": 0.2543796496280298,
"grad_norm": 42.75,
"learning_rate": 4.646674532160041e-07,
"logits/chosen": -2.7444615364074707,
"logits/rejected": -2.642268180847168,
"logps/chosen": -234.83529663085938,
"logps/rejected": -207.768798828125,
"loss": 0.6895,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.02697952464222908,
"rewards/margins": 0.008664881810545921,
"rewards/rejected": 0.018314644694328308,
"step": 530
},
{
"epoch": 0.2591792656587473,
"grad_norm": 52.5,
"learning_rate": 4.624893671987185e-07,
"logits/chosen": -2.710597515106201,
"logits/rejected": -2.618180513381958,
"logps/chosen": -220.21383666992188,
"logps/rejected": -187.06417846679688,
"loss": 0.6782,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.030816808342933655,
"rewards/margins": 0.031680621206760406,
"rewards/rejected": -0.0008638119325041771,
"step": 540
},
{
"epoch": 0.2639788816894648,
"grad_norm": 51.5,
"learning_rate": 4.602515656083629e-07,
"logits/chosen": -2.7750511169433594,
"logits/rejected": -2.689318895339966,
"logps/chosen": -234.1312255859375,
"logps/rejected": -218.6779327392578,
"loss": 0.682,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.032838549464941025,
"rewards/margins": 0.02399415522813797,
"rewards/rejected": 0.008844394236803055,
"step": 550
},
{
"epoch": 0.26877849772018236,
"grad_norm": 48.5,
"learning_rate": 4.5795467733101356e-07,
"logits/chosen": -2.716984510421753,
"logits/rejected": -2.536345958709717,
"logps/chosen": -238.9961395263672,
"logps/rejected": -209.2313690185547,
"loss": 0.6839,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.029773468151688576,
"rewards/margins": 0.020234117284417152,
"rewards/rejected": 0.009539352729916573,
"step": 560
},
{
"epoch": 0.2735781137508999,
"grad_norm": 45.0,
"learning_rate": 4.555993478577911e-07,
"logits/chosen": -2.7671806812286377,
"logits/rejected": -2.5658230781555176,
"logps/chosen": -245.57693481445312,
"logps/rejected": -186.72381591796875,
"loss": 0.6743,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.04541153460741043,
"rewards/margins": 0.03957425057888031,
"rewards/rejected": 0.0058372789062559605,
"step": 570
},
{
"epoch": 0.27837772978161746,
"grad_norm": 46.75,
"learning_rate": 4.531862391034591e-07,
"logits/chosen": -2.6841483116149902,
"logits/rejected": -2.5884292125701904,
"logps/chosen": -234.3633270263672,
"logps/rejected": -198.0540008544922,
"loss": 0.6779,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.03814179450273514,
"rewards/margins": 0.03233181685209274,
"rewards/rejected": 0.005809984169900417,
"step": 580
},
{
"epoch": 0.283177345812335,
"grad_norm": 48.5,
"learning_rate": 4.5071602922040734e-07,
"logits/chosen": -2.762327194213867,
"logits/rejected": -2.6249194145202637,
"logps/chosen": -237.1390380859375,
"logps/rejected": -201.12574768066406,
"loss": 0.6779,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.0438932366669178,
"rewards/margins": 0.032640643417835236,
"rewards/rejected": 0.011252591386437416,
"step": 590
},
{
"epoch": 0.28797696184305255,
"grad_norm": 51.25,
"learning_rate": 4.4818941240807133e-07,
"logits/chosen": -2.751591920852661,
"logits/rejected": -2.6312174797058105,
"logps/chosen": -235.29855346679688,
"logps/rejected": -211.7731475830078,
"loss": 0.6741,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.043742720037698746,
"rewards/margins": 0.040415119379758835,
"rewards/rejected": 0.0033275973983108997,
"step": 600
},
{
"epoch": 0.28797696184305255,
"eval_logits/chosen": -2.7159337997436523,
"eval_logits/rejected": -2.5955567359924316,
"eval_logps/chosen": -232.7647247314453,
"eval_logps/rejected": -205.43447875976562,
"eval_loss": 0.6805809140205383,
"eval_rewards/accuracies": 0.640999972820282,
"eval_rewards/chosen": 0.03682754188776016,
"eval_rewards/margins": 0.02719729021191597,
"eval_rewards/rejected": 0.009630252607166767,
"eval_runtime": 21.3954,
"eval_samples_per_second": 46.739,
"eval_steps_per_second": 11.685,
"step": 600
},
{
"epoch": 0.2927765778737701,
"grad_norm": 46.75,
"learning_rate": 4.456070987178426e-07,
"logits/chosen": -2.7190473079681396,
"logits/rejected": -2.549043893814087,
"logps/chosen": -218.71493530273438,
"logps/rejected": -175.1071014404297,
"loss": 0.6771,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.029196638613939285,
"rewards/margins": 0.034055858850479126,
"rewards/rejected": -0.004859219305217266,
"step": 610
},
{
"epoch": 0.29757619390448764,
"grad_norm": 43.5,
"learning_rate": 4.429698138535241e-07,
"logits/chosen": -2.689408779144287,
"logits/rejected": -2.5913164615631104,
"logps/chosen": -238.9366912841797,
"logps/rejected": -217.3396453857422,
"loss": 0.6825,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.04081985726952553,
"rewards/margins": 0.02339627407491207,
"rewards/rejected": 0.01742357760667801,
"step": 620
},
{
"epoch": 0.3023758099352052,
"grad_norm": 45.25,
"learning_rate": 4.402782989673867e-07,
"logits/chosen": -2.7332329750061035,
"logits/rejected": -2.5742244720458984,
"logps/chosen": -241.5056610107422,
"logps/rejected": -199.6383514404297,
"loss": 0.6788,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.044023603200912476,
"rewards/margins": 0.03084336593747139,
"rewards/rejected": 0.013180236332118511,
"step": 630
},
{
"epoch": 0.30717542596592273,
"grad_norm": 48.5,
"learning_rate": 4.3753331045188415e-07,
"logits/chosen": -2.651803970336914,
"logits/rejected": -2.60718035697937,
"logps/chosen": -223.8908233642578,
"logps/rejected": -215.56411743164062,
"loss": 0.6881,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.027814963832497597,
"rewards/margins": 0.012350986711680889,
"rewards/rejected": 0.015463980846107006,
"step": 640
},
{
"epoch": 0.3119750419966403,
"grad_norm": 47.75,
"learning_rate": 4.3473561972708517e-07,
"logits/chosen": -2.7187139987945557,
"logits/rejected": -2.5601890087127686,
"logps/chosen": -232.9247283935547,
"logps/rejected": -204.0401611328125,
"loss": 0.6835,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.030253728851675987,
"rewards/margins": 0.021437767893075943,
"rewards/rejected": 0.008815961889922619,
"step": 650
},
{
"epoch": 0.3167746580273578,
"grad_norm": 45.75,
"learning_rate": 4.3188601302388276e-07,
"logits/chosen": -2.6520533561706543,
"logits/rejected": -2.577056407928467,
"logps/chosen": -217.80337524414062,
"logps/rejected": -217.4649658203125,
"loss": 0.6781,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.043406371027231216,
"rewards/margins": 0.03220932558178902,
"rewards/rejected": 0.011197047308087349,
"step": 660
},
{
"epoch": 0.32157427405807537,
"grad_norm": 45.0,
"learning_rate": 4.289852911630406e-07,
"logits/chosen": -2.746192455291748,
"logits/rejected": -2.5919899940490723,
"logps/chosen": -261.55328369140625,
"logps/rejected": -208.4563446044922,
"loss": 0.6746,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.05559698864817619,
"rewards/margins": 0.039354514330625534,
"rewards/rejected": 0.016242478042840958,
"step": 670
},
{
"epoch": 0.3263738900887929,
"grad_norm": 56.5,
"learning_rate": 4.2603426933013955e-07,
"logits/chosen": -2.7136101722717285,
"logits/rejected": -2.5737733840942383,
"logps/chosen": -235.10751342773438,
"logps/rejected": -190.7623291015625,
"loss": 0.6826,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.03037920594215393,
"rewards/margins": 0.02374129556119442,
"rewards/rejected": 0.0066379099152982235,
"step": 680
},
{
"epoch": 0.33117350611951046,
"grad_norm": 49.75,
"learning_rate": 4.2303377684648734e-07,
"logits/chosen": -2.693387746810913,
"logits/rejected": -2.6150875091552734,
"logps/chosen": -229.60073852539062,
"logps/rejected": -227.20849609375,
"loss": 0.6781,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.047377780079841614,
"rewards/margins": 0.03262994438409805,
"rewards/rejected": 0.014747830107808113,
"step": 690
},
{
"epoch": 0.33597312215022795,
"grad_norm": 58.25,
"learning_rate": 4.199846569360557e-07,
"logits/chosen": -2.7111198902130127,
"logits/rejected": -2.6025779247283936,
"logps/chosen": -236.203857421875,
"logps/rejected": -210.2013702392578,
"loss": 0.6878,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.03395792096853256,
"rewards/margins": 0.013354765251278877,
"rewards/rejected": 0.020603153854608536,
"step": 700
},
{
"epoch": 0.3407727381809455,
"grad_norm": 51.0,
"learning_rate": 4.1688776648851034e-07,
"logits/chosen": -2.7076306343078613,
"logits/rejected": -2.5523858070373535,
"logps/chosen": -228.8372039794922,
"logps/rejected": -181.23193359375,
"loss": 0.6765,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.03608817234635353,
"rewards/margins": 0.03536154329776764,
"rewards/rejected": 0.0007266284665092826,
"step": 710
},
{
"epoch": 0.34557235421166305,
"grad_norm": 49.25,
"learning_rate": 4.1374397581840034e-07,
"logits/chosen": -2.7360334396362305,
"logits/rejected": -2.5981593132019043,
"logps/chosen": -227.6597137451172,
"logps/rejected": -183.78594970703125,
"loss": 0.676,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.040428828448057175,
"rewards/margins": 0.03674127534031868,
"rewards/rejected": 0.0036875568330287933,
"step": 720
},
{
"epoch": 0.3503719702423806,
"grad_norm": 53.5,
"learning_rate": 4.105541684205751e-07,
"logits/chosen": -2.6906344890594482,
"logits/rejected": -2.5774295330047607,
"logps/chosen": -218.3501434326172,
"logps/rejected": -195.3230743408203,
"loss": 0.6776,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.037394892424345016,
"rewards/margins": 0.03364209085702896,
"rewards/rejected": 0.0037527973763644695,
"step": 730
},
{
"epoch": 0.35517158627309814,
"grad_norm": 49.25,
"learning_rate": 4.073192407218971e-07,
"logits/chosen": -2.7351787090301514,
"logits/rejected": -2.5966103076934814,
"logps/chosen": -241.0888214111328,
"logps/rejected": -191.10189819335938,
"loss": 0.6704,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.0579073540866375,
"rewards/margins": 0.04853241890668869,
"rewards/rejected": 0.009374936111271381,
"step": 740
},
{
"epoch": 0.3599712023038157,
"grad_norm": 50.5,
"learning_rate": 4.040401018293204e-07,
"logits/chosen": -2.664130687713623,
"logits/rejected": -2.594024181365967,
"logps/chosen": -221.1615753173828,
"logps/rejected": -228.0982666015625,
"loss": 0.6793,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.0396769680082798,
"rewards/margins": 0.03049297071993351,
"rewards/rejected": 0.009183998219668865,
"step": 750
},
{
"epoch": 0.36477081833453323,
"grad_norm": 52.25,
"learning_rate": 4.0071767327440536e-07,
"logits/chosen": -2.6687798500061035,
"logits/rejected": -2.6370534896850586,
"logps/chosen": -235.759521484375,
"logps/rejected": -229.422119140625,
"loss": 0.6776,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.05410134792327881,
"rewards/margins": 0.034135472029447556,
"rewards/rejected": 0.019965868443250656,
"step": 760
},
{
"epoch": 0.3695704343652508,
"grad_norm": 47.25,
"learning_rate": 3.9735288875434254e-07,
"logits/chosen": -2.741582155227661,
"logits/rejected": -2.554959774017334,
"logps/chosen": -242.64480590820312,
"logps/rejected": -185.625244140625,
"loss": 0.6808,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.034398965537548065,
"rewards/margins": 0.02690746821463108,
"rewards/rejected": 0.007491500116884708,
"step": 770
},
{
"epoch": 0.3743700503959683,
"grad_norm": 55.0,
"learning_rate": 3.939466938695565e-07,
"logits/chosen": -2.660132884979248,
"logits/rejected": -2.5671591758728027,
"logps/chosen": -253.7039337158203,
"logps/rejected": -221.8881072998047,
"loss": 0.6792,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.04830535501241684,
"rewards/margins": 0.031025772914290428,
"rewards/rejected": 0.01727958396077156,
"step": 780
},
{
"epoch": 0.37916966642668587,
"grad_norm": 46.25,
"learning_rate": 3.905000458579657e-07,
"logits/chosen": -2.672783851623535,
"logits/rejected": -2.598494291305542,
"logps/chosen": -210.40975952148438,
"logps/rejected": -224.6177978515625,
"loss": 0.6821,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.03782086446881294,
"rewards/margins": 0.02480388432741165,
"rewards/rejected": 0.01301698386669159,
"step": 790
},
{
"epoch": 0.3839692824574034,
"grad_norm": 50.75,
"learning_rate": 3.870139133259709e-07,
"logits/chosen": -2.6891722679138184,
"logits/rejected": -2.5445141792297363,
"logps/chosen": -259.342529296875,
"logps/rejected": -206.988525390625,
"loss": 0.6767,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.05699441581964493,
"rewards/margins": 0.03583725541830063,
"rewards/rejected": 0.0211571604013443,
"step": 800
},
{
"epoch": 0.3839692824574034,
"eval_logits/chosen": -2.7158119678497314,
"eval_logits/rejected": -2.595388650894165,
"eval_logps/chosen": -232.6337127685547,
"eval_logps/rejected": -205.41590881347656,
"eval_loss": 0.6753210425376892,
"eval_rewards/accuracies": 0.6669999957084656,
"eval_rewards/chosen": 0.04992655664682388,
"eval_rewards/margins": 0.03843830153346062,
"eval_rewards/rejected": 0.01148825604468584,
"eval_runtime": 21.4126,
"eval_samples_per_second": 46.702,
"eval_steps_per_second": 11.675,
"step": 800
},
{
"epoch": 0.38876889848812096,
"grad_norm": 55.5,
"learning_rate": 3.8348927597624964e-07,
"logits/chosen": -2.740044116973877,
"logits/rejected": -2.6301164627075195,
"logps/chosen": -231.58377075195312,
"logps/rejected": -210.11886596679688,
"loss": 0.6786,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.052413731813430786,
"rewards/margins": 0.031818680465221405,
"rewards/rejected": 0.020595049485564232,
"step": 810
},
{
"epoch": 0.3935685145188385,
"grad_norm": 49.25,
"learning_rate": 3.7992712433243114e-07,
"logits/chosen": -2.717849016189575,
"logits/rejected": -2.5538547039031982,
"logps/chosen": -233.3022003173828,
"logps/rejected": -178.08187866210938,
"loss": 0.6776,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.034468021243810654,
"rewards/margins": 0.03307095915079117,
"rewards/rejected": 0.001397057669237256,
"step": 820
},
{
"epoch": 0.39836813054955605,
"grad_norm": 47.5,
"learning_rate": 3.7632845946073135e-07,
"logits/chosen": -2.7453646659851074,
"logits/rejected": -2.5826191902160645,
"logps/chosen": -246.8603515625,
"logps/rejected": -179.30404663085938,
"loss": 0.6707,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.045634619891643524,
"rewards/margins": 0.047946564853191376,
"rewards/rejected": -0.002311945194378495,
"step": 830
},
{
"epoch": 0.4031677465802736,
"grad_norm": 48.0,
"learning_rate": 3.7269429268862507e-07,
"logits/chosen": -2.710023880004883,
"logits/rejected": -2.6359734535217285,
"logps/chosen": -208.82150268554688,
"logps/rejected": -196.3325653076172,
"loss": 0.6799,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.04252176731824875,
"rewards/margins": 0.02919645607471466,
"rewards/rejected": 0.01332530565559864,
"step": 840
},
{
"epoch": 0.40796736261099115,
"grad_norm": 57.0,
"learning_rate": 3.6902564532063336e-07,
"logits/chosen": -2.7001442909240723,
"logits/rejected": -2.6420705318450928,
"logps/chosen": -214.80667114257812,
"logps/rejected": -198.60533142089844,
"loss": 0.6792,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.04096178710460663,
"rewards/margins": 0.03064887225627899,
"rewards/rejected": 0.010312914848327637,
"step": 850
},
{
"epoch": 0.41276697864170864,
"grad_norm": 50.0,
"learning_rate": 3.653235483513084e-07,
"logits/chosen": -2.736861228942871,
"logits/rejected": -2.618833541870117,
"logps/chosen": -248.7901153564453,
"logps/rejected": -216.1083526611328,
"loss": 0.6743,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.056495171040296555,
"rewards/margins": 0.041087765246629715,
"rewards/rejected": 0.015407413244247437,
"step": 860
},
{
"epoch": 0.4175665946724262,
"grad_norm": 48.0,
"learning_rate": 3.615890421754944e-07,
"logits/chosen": -2.724944591522217,
"logits/rejected": -2.6574723720550537,
"logps/chosen": -223.84408569335938,
"logps/rejected": -194.44735717773438,
"loss": 0.6796,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.042622704058885574,
"rewards/margins": 0.02971811592578888,
"rewards/rejected": 0.012904593721032143,
"step": 870
},
{
"epoch": 0.42236621070314373,
"grad_norm": 61.0,
"learning_rate": 3.5782317629594706e-07,
"logits/chosen": -2.706808567047119,
"logits/rejected": -2.6081411838531494,
"logps/chosen": -241.17495727539062,
"logps/rejected": -212.4481658935547,
"loss": 0.6819,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.047704242169857025,
"rewards/margins": 0.025270383805036545,
"rewards/rejected": 0.022433852776885033,
"step": 880
},
{
"epoch": 0.4271658267338613,
"grad_norm": 47.5,
"learning_rate": 3.5402700902839313e-07,
"logits/chosen": -2.6064233779907227,
"logits/rejected": -2.556283473968506,
"logps/chosen": -206.00344848632812,
"logps/rejected": -208.06936645507812,
"loss": 0.6791,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.036233410239219666,
"rewards/margins": 0.030693132430315018,
"rewards/rejected": 0.005540275014936924,
"step": 890
},
{
"epoch": 0.4319654427645788,
"grad_norm": 46.75,
"learning_rate": 3.5020160720411403e-07,
"logits/chosen": -2.722177267074585,
"logits/rejected": -2.592517137527466,
"logps/chosen": -234.11703491210938,
"logps/rejected": -216.63766479492188,
"loss": 0.6759,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.05363321304321289,
"rewards/margins": 0.03814633563160896,
"rewards/rejected": 0.015486878342926502,
"step": 900
},
{
"epoch": 0.43676505879529637,
"grad_norm": 52.75,
"learning_rate": 3.46348045870135e-07,
"logits/chosen": -2.6586787700653076,
"logits/rejected": -2.601860523223877,
"logps/chosen": -215.1962432861328,
"logps/rejected": -208.45068359375,
"loss": 0.6834,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.04313874989748001,
"rewards/margins": 0.022245222702622414,
"rewards/rejected": 0.020893529057502747,
"step": 910
},
{
"epoch": 0.4415646748260139,
"grad_norm": 45.75,
"learning_rate": 3.4246740798710725e-07,
"logits/chosen": -2.672468900680542,
"logits/rejected": -2.5783610343933105,
"logps/chosen": -216.0347442626953,
"logps/rejected": -195.738037109375,
"loss": 0.6819,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.03864717856049538,
"rewards/margins": 0.02574675716459751,
"rewards/rejected": 0.01290042232722044,
"step": 920
},
{
"epoch": 0.44636429085673146,
"grad_norm": 50.25,
"learning_rate": 3.3856078412496417e-07,
"logits/chosen": -2.729473829269409,
"logits/rejected": -2.588343858718872,
"logps/chosen": -238.8496551513672,
"logps/rejected": -190.5778045654297,
"loss": 0.6685,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.048866622149944305,
"rewards/margins": 0.05237164348363876,
"rewards/rejected": -0.003505019936710596,
"step": 930
},
{
"epoch": 0.451163906887449,
"grad_norm": 49.5,
"learning_rate": 3.3462927215644066e-07,
"logits/chosen": -2.747483968734741,
"logits/rejected": -2.640693187713623,
"logps/chosen": -262.37213134765625,
"logps/rejected": -216.82742309570312,
"loss": 0.6748,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.060678768903017044,
"rewards/margins": 0.04027427360415459,
"rewards/rejected": 0.020404506474733353,
"step": 940
},
{
"epoch": 0.45596352291816655,
"grad_norm": 42.75,
"learning_rate": 3.3067397694853937e-07,
"logits/chosen": -2.6840896606445312,
"logits/rejected": -2.554112434387207,
"logps/chosen": -233.49899291992188,
"logps/rejected": -191.6746063232422,
"loss": 0.6713,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.051722604781389236,
"rewards/margins": 0.04748953878879547,
"rewards/rejected": 0.0042330720461905,
"step": 950
},
{
"epoch": 0.4607631389488841,
"grad_norm": 46.0,
"learning_rate": 3.2669601005203155e-07,
"logits/chosen": -2.717355251312256,
"logits/rejected": -2.615908145904541,
"logps/chosen": -204.6543426513672,
"logps/rejected": -185.7799072265625,
"loss": 0.6734,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.04618459939956665,
"rewards/margins": 0.042539265006780624,
"rewards/rejected": 0.003645337652415037,
"step": 960
},
{
"epoch": 0.46556275497960165,
"grad_norm": 48.5,
"learning_rate": 3.2269648938907973e-07,
"logits/chosen": -2.6776490211486816,
"logits/rejected": -2.560394287109375,
"logps/chosen": -214.96142578125,
"logps/rejected": -182.30978393554688,
"loss": 0.6781,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.0470888614654541,
"rewards/margins": 0.033395569771528244,
"rewards/rejected": 0.013693295419216156,
"step": 970
},
{
"epoch": 0.4703623710103192,
"grad_norm": 49.0,
"learning_rate": 3.186765389390695e-07,
"logits/chosen": -2.7659125328063965,
"logits/rejected": -2.6198360919952393,
"logps/chosen": -251.896240234375,
"logps/rejected": -194.74826049804688,
"loss": 0.6773,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.043909598141908646,
"rewards/margins": 0.03449582681059837,
"rewards/rejected": 0.009413773193955421,
"step": 980
},
{
"epoch": 0.47516198704103674,
"grad_norm": 50.25,
"learning_rate": 3.146372884227393e-07,
"logits/chosen": -2.7383854389190674,
"logits/rejected": -2.633877992630005,
"logps/chosen": -249.55557250976562,
"logps/rejected": -215.5314178466797,
"loss": 0.6781,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.05108872801065445,
"rewards/margins": 0.03366169333457947,
"rewards/rejected": 0.01742703653872013,
"step": 990
},
{
"epoch": 0.4799616030717543,
"grad_norm": 45.75,
"learning_rate": 3.105798729846969e-07,
"logits/chosen": -2.6620967388153076,
"logits/rejected": -2.5416641235351562,
"logps/chosen": -214.88015747070312,
"logps/rejected": -182.47698974609375,
"loss": 0.676,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.04558812081813812,
"rewards/margins": 0.0367765799164772,
"rewards/rejected": 0.00881153903901577,
"step": 1000
},
{
"epoch": 0.4799616030717543,
"eval_logits/chosen": -2.716266393661499,
"eval_logits/rejected": -2.595984935760498,
"eval_logps/chosen": -232.63925170898438,
"eval_logps/rejected": -205.39816284179688,
"eval_loss": 0.6766188740730286,
"eval_rewards/accuracies": 0.6570000052452087,
"eval_rewards/chosen": 0.04937145859003067,
"eval_rewards/margins": 0.03610716760158539,
"eval_rewards/rejected": 0.013264299370348454,
"eval_runtime": 21.4065,
"eval_samples_per_second": 46.715,
"eval_steps_per_second": 11.679,
"step": 1000
},
{
"epoch": 0.48476121910247183,
"grad_norm": 47.0,
"learning_rate": 3.065054328744109e-07,
"logits/chosen": -2.6782500743865967,
"logits/rejected": -2.5327606201171875,
"logps/chosen": -249.7314910888672,
"logps/rejected": -209.0973663330078,
"loss": 0.6751,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.05049954727292061,
"rewards/margins": 0.03927897661924362,
"rewards/rejected": 0.011220571584999561,
"step": 1010
},
{
"epoch": 0.4895608351331893,
"grad_norm": 51.75,
"learning_rate": 3.024151131257687e-07,
"logits/chosen": -2.7015366554260254,
"logits/rejected": -2.5806756019592285,
"logps/chosen": -245.3987274169922,
"logps/rejected": -191.32785034179688,
"loss": 0.6728,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.04835500195622444,
"rewards/margins": 0.04381892830133438,
"rewards/rejected": 0.00453607365489006,
"step": 1020
},
{
"epoch": 0.49436045116390687,
"grad_norm": 50.75,
"learning_rate": 2.9831006323528886e-07,
"logits/chosen": -2.7741270065307617,
"logits/rejected": -2.5906481742858887,
"logps/chosen": -254.17239379882812,
"logps/rejected": -197.8710479736328,
"loss": 0.673,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.059526920318603516,
"rewards/margins": 0.04361771419644356,
"rewards/rejected": 0.01590920425951481,
"step": 1030
},
{
"epoch": 0.4991600671946244,
"grad_norm": 48.0,
"learning_rate": 2.941914368390798e-07,
"logits/chosen": -2.692235231399536,
"logits/rejected": -2.610217332839966,
"logps/chosen": -218.3246307373047,
"logps/rejected": -205.70993041992188,
"loss": 0.6816,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.032597918063402176,
"rewards/margins": 0.025554979220032692,
"rewards/rejected": 0.007042936980724335,
"step": 1040
},
{
"epoch": 0.503959683225342,
"grad_norm": 56.75,
"learning_rate": 2.900603913886357e-07,
"logits/chosen": -2.672635555267334,
"logits/rejected": -2.5501255989074707,
"logps/chosen": -244.4874267578125,
"logps/rejected": -211.55068969726562,
"loss": 0.6746,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.04713388532400131,
"rewards/margins": 0.04026245325803757,
"rewards/rejected": 0.006871436722576618,
"step": 1050
},
{
"epoch": 0.5087592992560596,
"grad_norm": 49.25,
"learning_rate": 2.859180878255588e-07,
"logits/chosen": -2.682440996170044,
"logits/rejected": -2.611323833465576,
"logps/chosen": -232.0714874267578,
"logps/rejected": -215.4655303955078,
"loss": 0.6815,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.04437769576907158,
"rewards/margins": 0.02631448581814766,
"rewards/rejected": 0.018063215538859367,
"step": 1060
},
{
"epoch": 0.5135589152867771,
"grad_norm": 50.25,
"learning_rate": 2.8176569025530234e-07,
"logits/chosen": -2.7059943675994873,
"logits/rejected": -2.602865219116211,
"logps/chosen": -232.48147583007812,
"logps/rejected": -204.5135040283203,
"loss": 0.6773,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.05438702180981636,
"rewards/margins": 0.034470170736312866,
"rewards/rejected": 0.019916851073503494,
"step": 1070
},
{
"epoch": 0.5183585313174947,
"grad_norm": 50.25,
"learning_rate": 2.7760436562002346e-07,
"logits/chosen": -2.6945126056671143,
"logits/rejected": -2.516050338745117,
"logps/chosen": -265.76055908203125,
"logps/rejected": -181.6714324951172,
"loss": 0.6745,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.05421316623687744,
"rewards/margins": 0.04068455100059509,
"rewards/rejected": 0.0135286133736372,
"step": 1080
},
{
"epoch": 0.5231581473482121,
"grad_norm": 53.5,
"learning_rate": 2.734352833706392e-07,
"logits/chosen": -2.7844748497009277,
"logits/rejected": -2.654388904571533,
"logps/chosen": -249.6466827392578,
"logps/rejected": -214.0853271484375,
"loss": 0.6748,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.06085364893078804,
"rewards/margins": 0.04021080583333969,
"rewards/rejected": 0.020642835646867752,
"step": 1090
},
{
"epoch": 0.5279577633789296,
"grad_norm": 47.75,
"learning_rate": 2.6925961513817733e-07,
"logits/chosen": -2.6918578147888184,
"logits/rejected": -2.627488374710083,
"logps/chosen": -199.40310668945312,
"logps/rejected": -200.98233032226562,
"loss": 0.676,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.043318361043930054,
"rewards/margins": 0.03646283596754074,
"rewards/rejected": 0.006855523679405451,
"step": 1100
},
{
"epoch": 0.5327573794096472,
"grad_norm": 50.0,
"learning_rate": 2.6507853440451484e-07,
"logits/chosen": -2.7055044174194336,
"logits/rejected": -2.61013126373291,
"logps/chosen": -227.26321411132812,
"logps/rejected": -205.27490234375,
"loss": 0.6744,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.04602568596601486,
"rewards/margins": 0.04149339720606804,
"rewards/rejected": 0.00453228922560811,
"step": 1110
},
{
"epoch": 0.5375569954403647,
"grad_norm": 42.0,
"learning_rate": 2.608932161725958e-07,
"logits/chosen": -2.7049965858459473,
"logits/rejected": -2.570584774017334,
"logps/chosen": -232.04818725585938,
"logps/rejected": -203.71127319335938,
"loss": 0.6741,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.04708060622215271,
"rewards/margins": 0.04201260581612587,
"rewards/rejected": 0.005067999474704266,
"step": 1120
},
{
"epoch": 0.5423566114710823,
"grad_norm": 56.5,
"learning_rate": 2.5670483663622247e-07,
"logits/chosen": -2.6920132637023926,
"logits/rejected": -2.583217144012451,
"logps/chosen": -239.03427124023438,
"logps/rejected": -200.62376403808594,
"loss": 0.6751,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.04974411427974701,
"rewards/margins": 0.039146848022937775,
"rewards/rejected": 0.010597268119454384,
"step": 1130
},
{
"epoch": 0.5471562275017998,
"grad_norm": 43.5,
"learning_rate": 2.5251457284951056e-07,
"logits/chosen": -2.709200859069824,
"logits/rejected": -2.6131153106689453,
"logps/chosen": -227.12826538085938,
"logps/rejected": -191.06222534179688,
"loss": 0.6735,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.051633380353450775,
"rewards/margins": 0.04308091849088669,
"rewards/rejected": 0.008552461862564087,
"step": 1140
},
{
"epoch": 0.5519558435325174,
"grad_norm": 45.5,
"learning_rate": 2.4832360239610414e-07,
"logits/chosen": -2.702821731567383,
"logits/rejected": -2.5874671936035156,
"logps/chosen": -228.0370635986328,
"logps/rejected": -201.68345642089844,
"loss": 0.6721,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.052544206380844116,
"rewards/margins": 0.04553115367889404,
"rewards/rejected": 0.007013053633272648,
"step": 1150
},
{
"epoch": 0.5567554595632349,
"grad_norm": 52.25,
"learning_rate": 2.441331030582407e-07,
"logits/chosen": -2.720001697540283,
"logits/rejected": -2.630744457244873,
"logps/chosen": -223.72116088867188,
"logps/rejected": -205.20474243164062,
"loss": 0.6789,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.05077257752418518,
"rewards/margins": 0.030812978744506836,
"rewards/rejected": 0.019959593191742897,
"step": 1160
},
{
"epoch": 0.5615550755939525,
"grad_norm": 48.5,
"learning_rate": 2.39944252485761e-07,
"logits/chosen": -2.7418465614318848,
"logits/rejected": -2.5958893299102783,
"logps/chosen": -245.22238159179688,
"logps/rejected": -192.0289764404297,
"loss": 0.6723,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.05651768296957016,
"rewards/margins": 0.04533126950263977,
"rewards/rejected": 0.011186418123543262,
"step": 1170
},
{
"epoch": 0.56635469162467,
"grad_norm": 53.25,
"learning_rate": 2.3575822786515529e-07,
"logits/chosen": -2.6802361011505127,
"logits/rejected": -2.5656845569610596,
"logps/chosen": -231.7133026123047,
"logps/rejected": -209.87765502929688,
"loss": 0.6792,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.04587788134813309,
"rewards/margins": 0.03062388300895691,
"rewards/rejected": 0.015253995545208454,
"step": 1180
},
{
"epoch": 0.5711543076553875,
"grad_norm": 54.0,
"learning_rate": 2.3157620558874106e-07,
"logits/chosen": -2.723170757293701,
"logits/rejected": -2.5895724296569824,
"logps/chosen": -242.99923706054688,
"logps/rejected": -193.60450744628906,
"loss": 0.6776,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.052822746336460114,
"rewards/margins": 0.03466617316007614,
"rewards/rejected": 0.018156569451093674,
"step": 1190
},
{
"epoch": 0.5759539236861051,
"grad_norm": 42.75,
"learning_rate": 2.2739936092406286e-07,
"logits/chosen": -2.674161434173584,
"logits/rejected": -2.576936721801758,
"logps/chosen": -226.0552215576172,
"logps/rejected": -213.71524047851562,
"loss": 0.6774,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.05344771221280098,
"rewards/margins": 0.034632958471775055,
"rewards/rejected": 0.018814753741025925,
"step": 1200
},
{
"epoch": 0.5759539236861051,
"eval_logits/chosen": -2.71634578704834,
"eval_logits/rejected": -2.595935583114624,
"eval_logps/chosen": -232.62538146972656,
"eval_logps/rejected": -205.43319702148438,
"eval_loss": 0.6742354035377502,
"eval_rewards/accuracies": 0.675000011920929,
"eval_rewards/chosen": 0.050759363919496536,
"eval_rewards/margins": 0.041003111749887466,
"eval_rewards/rejected": 0.009756244719028473,
"eval_runtime": 21.4174,
"eval_samples_per_second": 46.691,
"eval_steps_per_second": 11.673,
"step": 1200
},
{
"epoch": 0.5807535397168226,
"grad_norm": 48.25,
"learning_rate": 2.232288676836087e-07,
"logits/chosen": -2.617983341217041,
"logits/rejected": -2.5485970973968506,
"logps/chosen": -238.41134643554688,
"logps/rejected": -203.52255249023438,
"loss": 0.6683,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.06037604808807373,
"rewards/margins": 0.05402814596891403,
"rewards/rejected": 0.006347896996885538,
"step": 1210
},
{
"epoch": 0.5855531557475402,
"grad_norm": 50.25,
"learning_rate": 2.1906589789493518e-07,
"logits/chosen": -2.710653066635132,
"logits/rejected": -2.5681469440460205,
"logps/chosen": -217.65231323242188,
"logps/rejected": -183.86453247070312,
"loss": 0.6765,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.04624713212251663,
"rewards/margins": 0.036256637424230576,
"rewards/rejected": 0.009990494698286057,
"step": 1220
},
{
"epoch": 0.5903527717782577,
"grad_norm": 50.25,
"learning_rate": 2.1491162147129428e-07,
"logits/chosen": -2.71733021736145,
"logits/rejected": -2.6050782203674316,
"logps/chosen": -232.63601684570312,
"logps/rejected": -208.39132690429688,
"loss": 0.6763,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.051035962998867035,
"rewards/margins": 0.036614201962947845,
"rewards/rejected": 0.014421762898564339,
"step": 1230
},
{
"epoch": 0.5951523878089753,
"grad_norm": 48.0,
"learning_rate": 2.107672058828544e-07,
"logits/chosen": -2.722160816192627,
"logits/rejected": -2.608168363571167,
"logps/chosen": -226.43807983398438,
"logps/rejected": -192.34970092773438,
"loss": 0.6717,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.05640612170100212,
"rewards/margins": 0.0462309755384922,
"rewards/rejected": 0.010175148025155067,
"step": 1240
},
{
"epoch": 0.5999520038396928,
"grad_norm": 54.5,
"learning_rate": 2.0663381582860825e-07,
"logits/chosen": -2.7216320037841797,
"logits/rejected": -2.643075942993164,
"logps/chosen": -226.637451171875,
"logps/rejected": -209.5499725341797,
"loss": 0.6759,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.046819452196359634,
"rewards/margins": 0.03698267415165901,
"rewards/rejected": 0.00983678363263607,
"step": 1250
},
{
"epoch": 0.6047516198704104,
"grad_norm": 45.25,
"learning_rate": 2.025126129090588e-07,
"logits/chosen": -2.776801586151123,
"logits/rejected": -2.626488447189331,
"logps/chosen": -221.3492431640625,
"logps/rejected": -179.89920043945312,
"loss": 0.6709,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.051989030092954636,
"rewards/margins": 0.04751256853342056,
"rewards/rejected": 0.004476464353501797,
"step": 1260
},
{
"epoch": 0.6095512359011279,
"grad_norm": 43.5,
"learning_rate": 1.9840475529977655e-07,
"logits/chosen": -2.71726655960083,
"logits/rejected": -2.6046361923217773,
"logps/chosen": -227.9778594970703,
"logps/rejected": -195.9775390625,
"loss": 0.6739,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.044441260397434235,
"rewards/margins": 0.04231487214565277,
"rewards/rejected": 0.0021263775415718555,
"step": 1270
},
{
"epoch": 0.6143508519318455,
"grad_norm": 49.75,
"learning_rate": 1.9431139742591896e-07,
"logits/chosen": -2.7021281719207764,
"logits/rejected": -2.58604097366333,
"logps/chosen": -207.78173828125,
"logps/rejected": -187.71017456054688,
"loss": 0.6787,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.03924870118498802,
"rewards/margins": 0.031981147825717926,
"rewards/rejected": 0.0072675542905926704,
"step": 1280
},
{
"epoch": 0.619150467962563,
"grad_norm": 52.0,
"learning_rate": 1.9023368963780455e-07,
"logits/chosen": -2.721538543701172,
"logits/rejected": -2.6105265617370605,
"logps/chosen": -232.59326171875,
"logps/rejected": -196.3831024169922,
"loss": 0.6749,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.04439551383256912,
"rewards/margins": 0.03949584811925888,
"rewards/rejected": 0.004899662919342518,
"step": 1290
},
{
"epoch": 0.6239500839932806,
"grad_norm": 47.25,
"learning_rate": 1.861727778876314e-07,
"logits/chosen": -2.7027573585510254,
"logits/rejected": -2.5897445678710938,
"logps/chosen": -207.4355926513672,
"logps/rejected": -173.4372100830078,
"loss": 0.6804,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.030175339430570602,
"rewards/margins": 0.028225919231772423,
"rewards/rejected": 0.001949421362951398,
"step": 1300
},
{
"epoch": 0.6287497000239981,
"grad_norm": 46.5,
"learning_rate": 1.821298034074315e-07,
"logits/chosen": -2.7313363552093506,
"logits/rejected": -2.6595630645751953,
"logps/chosen": -222.908447265625,
"logps/rejected": -203.55274963378906,
"loss": 0.685,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.03836590051651001,
"rewards/margins": 0.019602758809924126,
"rewards/rejected": 0.018763139843940735,
"step": 1310
},
{
"epoch": 0.6335493160547156,
"grad_norm": 48.5,
"learning_rate": 1.7810590238835276e-07,
"logits/chosen": -2.6614937782287598,
"logits/rejected": -2.6302168369293213,
"logps/chosen": -224.0082244873047,
"logps/rejected": -239.5669708251953,
"loss": 0.6785,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.049693018198013306,
"rewards/margins": 0.03309093788266182,
"rewards/rejected": 0.016602078452706337,
"step": 1320
},
{
"epoch": 0.6383489320854332,
"grad_norm": 48.75,
"learning_rate": 1.7410220566135603e-07,
"logits/chosen": -2.733497142791748,
"logits/rejected": -2.613424777984619,
"logps/chosen": -227.0560302734375,
"logps/rejected": -196.37181091308594,
"loss": 0.6739,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.05607549101114273,
"rewards/margins": 0.041366271674633026,
"rewards/rejected": 0.014709214679896832,
"step": 1330
},
{
"epoch": 0.6431485481161507,
"grad_norm": 46.75,
"learning_rate": 1.7011983837942021e-07,
"logits/chosen": -2.7072107791900635,
"logits/rejected": -2.5902278423309326,
"logps/chosen": -233.81179809570312,
"logps/rejected": -206.1332244873047,
"loss": 0.6712,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.0620468370616436,
"rewards/margins": 0.04747987538576126,
"rewards/rejected": 0.014566963538527489,
"step": 1340
},
{
"epoch": 0.6479481641468683,
"grad_norm": 45.5,
"learning_rate": 1.6615991970134158e-07,
"logits/chosen": -2.741150379180908,
"logits/rejected": -2.6298651695251465,
"logps/chosen": -221.12841796875,
"logps/rejected": -191.54258728027344,
"loss": 0.6773,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.04875689372420311,
"rewards/margins": 0.03473493829369545,
"rewards/rejected": 0.014021962881088257,
"step": 1350
},
{
"epoch": 0.6527477801775858,
"grad_norm": 46.75,
"learning_rate": 1.622235624772183e-07,
"logits/chosen": -2.6976001262664795,
"logits/rejected": -2.5869300365448,
"logps/chosen": -232.46533203125,
"logps/rejected": -209.5470428466797,
"loss": 0.6755,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.05371447280049324,
"rewards/margins": 0.0387248769402504,
"rewards/rejected": 0.014989593997597694,
"step": 1360
},
{
"epoch": 0.6575473962083034,
"grad_norm": 46.5,
"learning_rate": 1.5831187293570825e-07,
"logits/chosen": -2.722553014755249,
"logits/rejected": -2.602963924407959,
"logps/chosen": -272.6893615722656,
"logps/rejected": -217.13632202148438,
"loss": 0.6784,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.06017666310071945,
"rewards/margins": 0.03393205627799034,
"rewards/rejected": 0.026244616135954857,
"step": 1370
},
{
"epoch": 0.6623470122390209,
"grad_norm": 50.25,
"learning_rate": 1.5442595037314648e-07,
"logits/chosen": -2.7165400981903076,
"logits/rejected": -2.5861897468566895,
"logps/chosen": -240.0535888671875,
"logps/rejected": -186.63296508789062,
"loss": 0.6701,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.06224694103002548,
"rewards/margins": 0.049013856798410416,
"rewards/rejected": 0.013233085162937641,
"step": 1380
},
{
"epoch": 0.6671466282697385,
"grad_norm": 48.25,
"learning_rate": 1.5056688684461232e-07,
"logits/chosen": -2.7177727222442627,
"logits/rejected": -2.5875582695007324,
"logps/chosen": -241.77590942382812,
"logps/rejected": -202.8583221435547,
"loss": 0.6715,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.05555950850248337,
"rewards/margins": 0.04663746803998947,
"rewards/rejected": 0.008922042325139046,
"step": 1390
},
{
"epoch": 0.6719462443004559,
"grad_norm": 52.0,
"learning_rate": 1.4673576685703026e-07,
"logits/chosen": -2.71079158782959,
"logits/rejected": -2.6190669536590576,
"logps/chosen": -240.28317260742188,
"logps/rejected": -208.4333953857422,
"loss": 0.6746,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.05780891329050064,
"rewards/margins": 0.04101189970970154,
"rewards/rejected": 0.016797009855508804,
"step": 1400
},
{
"epoch": 0.6719462443004559,
"eval_logits/chosen": -2.7162351608276367,
"eval_logits/rejected": -2.595787763595581,
"eval_logps/chosen": -232.603759765625,
"eval_logps/rejected": -205.423583984375,
"eval_loss": 0.6737259030342102,
"eval_rewards/accuracies": 0.6549999713897705,
"eval_rewards/chosen": 0.052920494228601456,
"eval_rewards/margins": 0.04220106825232506,
"eval_rewards/rejected": 0.010719424113631248,
"eval_runtime": 21.4584,
"eval_samples_per_second": 46.602,
"eval_steps_per_second": 11.65,
"step": 1400
},
{
"epoch": 0.6767458603311735,
"grad_norm": 45.25,
"learning_rate": 1.429336670643929e-07,
"logits/chosen": -2.6878199577331543,
"logits/rejected": -2.5576183795928955,
"logps/chosen": -217.6654052734375,
"logps/rejected": -194.15072631835938,
"loss": 0.6747,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.04295631870627403,
"rewards/margins": 0.04018958657979965,
"rewards/rejected": 0.0027667314279824495,
"step": 1410
},
{
"epoch": 0.681545476361891,
"grad_norm": 44.25,
"learning_rate": 1.3916165596519013e-07,
"logits/chosen": -2.721832036972046,
"logits/rejected": -2.5464541912078857,
"logps/chosen": -230.22433471679688,
"logps/rejected": -185.80426025390625,
"loss": 0.673,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.0356689877808094,
"rewards/margins": 0.04347275570034981,
"rewards/rejected": -0.007803765125572681,
"step": 1420
},
{
"epoch": 0.6863450923926085,
"grad_norm": 50.75,
"learning_rate": 1.354207936021309e-07,
"logits/chosen": -2.7223222255706787,
"logits/rejected": -2.565199375152588,
"logps/chosen": -222.4684295654297,
"logps/rejected": -181.96670532226562,
"loss": 0.6756,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.0483887605369091,
"rewards/margins": 0.0376238189637661,
"rewards/rejected": 0.010764943435788155,
"step": 1430
},
{
"epoch": 0.6911447084233261,
"grad_norm": 56.0,
"learning_rate": 1.317121312642406e-07,
"logits/chosen": -2.712290048599243,
"logits/rejected": -2.5553765296936035,
"logps/chosen": -231.23538208007812,
"logps/rejected": -199.30477905273438,
"loss": 0.6737,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.05653299763798714,
"rewards/margins": 0.04253407567739487,
"rewards/rejected": 0.013998927548527718,
"step": 1440
},
{
"epoch": 0.6959443244540436,
"grad_norm": 54.0,
"learning_rate": 1.280367111914195e-07,
"logits/chosen": -2.635277509689331,
"logits/rejected": -2.543097972869873,
"logps/chosen": -245.3563690185547,
"logps/rejected": -227.46142578125,
"loss": 0.6802,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.055252768099308014,
"rewards/margins": 0.029850680381059647,
"rewards/rejected": 0.02540207840502262,
"step": 1450
},
{
"epoch": 0.7007439404847612,
"grad_norm": 58.75,
"learning_rate": 1.243955662815429e-07,
"logits/chosen": -2.7177271842956543,
"logits/rejected": -2.5459141731262207,
"logps/chosen": -247.24038696289062,
"logps/rejected": -206.8587188720703,
"loss": 0.6777,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.05328672379255295,
"rewards/margins": 0.03413508087396622,
"rewards/rejected": 0.01915164105594158,
"step": 1460
},
{
"epoch": 0.7055435565154787,
"grad_norm": 52.25,
"learning_rate": 1.207897198001878e-07,
"logits/chosen": -2.747087001800537,
"logits/rejected": -2.646921396255493,
"logps/chosen": -230.12109375,
"logps/rejected": -196.12423706054688,
"loss": 0.6732,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.050879109650850296,
"rewards/margins": 0.04289903864264488,
"rewards/rejected": 0.007980065420269966,
"step": 1470
},
{
"epoch": 0.7103431725461963,
"grad_norm": 50.0,
"learning_rate": 1.1722018509306586e-07,
"logits/chosen": -2.708061456680298,
"logits/rejected": -2.556723117828369,
"logps/chosen": -247.2672119140625,
"logps/rejected": -188.83200073242188,
"loss": 0.6704,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.04851624369621277,
"rewards/margins": 0.04957341402769089,
"rewards/rejected": -0.001057169632986188,
"step": 1480
},
{
"epoch": 0.7151427885769138,
"grad_norm": 46.5,
"learning_rate": 1.1368796530124442e-07,
"logits/chosen": -2.671211004257202,
"logits/rejected": -2.543172597885132,
"logps/chosen": -249.50686645507812,
"logps/rejected": -195.772216796875,
"loss": 0.6652,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.06298129260540009,
"rewards/margins": 0.05997220426797867,
"rewards/rejected": 0.0030090927612036467,
"step": 1490
},
{
"epoch": 0.7199424046076314,
"grad_norm": 48.25,
"learning_rate": 1.1019405307923557e-07,
"logits/chosen": -2.719313144683838,
"logits/rejected": -2.598017454147339,
"logps/chosen": -244.72067260742188,
"logps/rejected": -204.39401245117188,
"loss": 0.6727,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.06084643676877022,
"rewards/margins": 0.04462161287665367,
"rewards/rejected": 0.016224823892116547,
"step": 1500
},
{
"epoch": 0.7247420206383489,
"grad_norm": 52.0,
"learning_rate": 1.0673943031603133e-07,
"logits/chosen": -2.7169058322906494,
"logits/rejected": -2.6187710762023926,
"logps/chosen": -221.7633819580078,
"logps/rejected": -206.36257934570312,
"loss": 0.6743,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.04818148910999298,
"rewards/margins": 0.04159141331911087,
"rewards/rejected": 0.006590074393898249,
"step": 1510
},
{
"epoch": 0.7295416366690665,
"grad_norm": 51.0,
"learning_rate": 1.0332506785916522e-07,
"logits/chosen": -2.690253496170044,
"logits/rejected": -2.5843067169189453,
"logps/chosen": -238.80850219726562,
"logps/rejected": -208.9747314453125,
"loss": 0.6821,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.04689895361661911,
"rewards/margins": 0.025392215698957443,
"rewards/rejected": 0.021506736055016518,
"step": 1520
},
{
"epoch": 0.734341252699784,
"grad_norm": 45.75,
"learning_rate": 9.995192524187637e-08,
"logits/chosen": -2.633424997329712,
"logits/rejected": -2.576991081237793,
"logps/chosen": -220.3133087158203,
"logps/rejected": -207.2361297607422,
"loss": 0.6806,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.04313893988728523,
"rewards/margins": 0.028983239084482193,
"rewards/rejected": 0.01415570080280304,
"step": 1530
},
{
"epoch": 0.7391408687305016,
"grad_norm": 47.25,
"learning_rate": 9.662095041345317e-08,
"logits/chosen": -2.6786739826202393,
"logits/rejected": -2.547990322113037,
"logps/chosen": -245.1093292236328,
"logps/rejected": -216.21823120117188,
"loss": 0.6736,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.05186639353632927,
"rewards/margins": 0.042230743914842606,
"rewards/rejected": 0.00963564682751894,
"step": 1540
},
{
"epoch": 0.7439404847612191,
"grad_norm": 52.25,
"learning_rate": 9.333307947283256e-08,
"logits/chosen": -2.7363951206207275,
"logits/rejected": -2.621778964996338,
"logps/chosen": -240.5101318359375,
"logps/rejected": -212.27685546875,
"loss": 0.677,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.04980180412530899,
"rewards/margins": 0.035508893430233,
"rewards/rejected": 0.014292912557721138,
"step": 1550
},
{
"epoch": 0.7487401007919366,
"grad_norm": 45.75,
"learning_rate": 9.008923640552978e-08,
"logits/chosen": -2.688732147216797,
"logits/rejected": -2.5987465381622314,
"logps/chosen": -210.93844604492188,
"logps/rejected": -178.03103637695312,
"loss": 0.6756,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.03177894279360771,
"rewards/margins": 0.03770405799150467,
"rewards/rejected": -0.005925112869590521,
"step": 1560
},
{
"epoch": 0.7535397168226542,
"grad_norm": 49.25,
"learning_rate": 8.689033282397165e-08,
"logits/chosen": -2.717036485671997,
"logits/rejected": -2.59865140914917,
"logps/chosen": -228.39285278320312,
"logps/rejected": -202.19210815429688,
"loss": 0.6765,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.04258617386221886,
"rewards/margins": 0.03610239177942276,
"rewards/rejected": 0.006483784876763821,
"step": 1570
},
{
"epoch": 0.7583393328533717,
"grad_norm": 44.25,
"learning_rate": 8.373726771130768e-08,
"logits/chosen": -2.7102208137512207,
"logits/rejected": -2.585137128829956,
"logps/chosen": -240.1363983154297,
"logps/rejected": -198.0478057861328,
"loss": 0.6694,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.058793772011995316,
"rewards/margins": 0.051263321191072464,
"rewards/rejected": 0.007530451752245426,
"step": 1580
},
{
"epoch": 0.7631389488840893,
"grad_norm": 54.5,
"learning_rate": 8.063092716877015e-08,
"logits/chosen": -2.654996871948242,
"logits/rejected": -2.5496888160705566,
"logps/chosen": -253.6233673095703,
"logps/rejected": -211.02017211914062,
"loss": 0.6746,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.05824859067797661,
"rewards/margins": 0.04030389338731766,
"rewards/rejected": 0.0179446954280138,
"step": 1590
},
{
"epoch": 0.7679385649148068,
"grad_norm": 46.75,
"learning_rate": 7.757218416665445e-08,
"logits/chosen": -2.745260715484619,
"logits/rejected": -2.5834543704986572,
"logps/chosen": -229.595703125,
"logps/rejected": -188.58816528320312,
"loss": 0.6678,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.05159440636634827,
"rewards/margins": 0.054445721209049225,
"rewards/rejected": -0.00285131623968482,
"step": 1600
},
{
"epoch": 0.7679385649148068,
"eval_logits/chosen": -2.7164273262023926,
"eval_logits/rejected": -2.5960192680358887,
"eval_logps/chosen": -232.62025451660156,
"eval_logps/rejected": -205.42721557617188,
"eval_loss": 0.6742997169494629,
"eval_rewards/accuracies": 0.6629999876022339,
"eval_rewards/chosen": 0.051273249089717865,
"eval_rewards/margins": 0.04091595113277435,
"eval_rewards/rejected": 0.010357297956943512,
"eval_runtime": 21.4386,
"eval_samples_per_second": 46.645,
"eval_steps_per_second": 11.661,
"step": 1600
},
{
"epoch": 0.7727381809455244,
"grad_norm": 53.75,
"learning_rate": 7.456189829898954e-08,
"logits/chosen": -2.722618818283081,
"logits/rejected": -2.5636143684387207,
"logps/chosen": -236.95559692382812,
"logps/rejected": -187.6465301513672,
"loss": 0.6711,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.05168156698346138,
"rewards/margins": 0.04733755439519882,
"rewards/rejected": 0.00434401398524642,
"step": 1610
},
{
"epoch": 0.7775377969762419,
"grad_norm": 47.5,
"learning_rate": 7.160091554196731e-08,
"logits/chosen": -2.7647414207458496,
"logits/rejected": -2.6293978691101074,
"logps/chosen": -234.0639190673828,
"logps/rejected": -196.3838653564453,
"loss": 0.673,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.051941949874162674,
"rewards/margins": 0.043656349182128906,
"rewards/rejected": 0.008285606279969215,
"step": 1620
},
{
"epoch": 0.7823374130069595,
"grad_norm": 46.75,
"learning_rate": 6.86900680161994e-08,
"logits/chosen": -2.6866321563720703,
"logits/rejected": -2.612730026245117,
"logps/chosen": -240.7451171875,
"logps/rejected": -228.06771850585938,
"loss": 0.6801,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.05514489486813545,
"rewards/margins": 0.029478853568434715,
"rewards/rejected": 0.025666039437055588,
"step": 1630
},
{
"epoch": 0.787137029037677,
"grad_norm": 41.75,
"learning_rate": 6.583017375286726e-08,
"logits/chosen": -2.695338726043701,
"logits/rejected": -2.5727577209472656,
"logps/chosen": -229.021728515625,
"logps/rejected": -195.28890991210938,
"loss": 0.6761,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.050636857748031616,
"rewards/margins": 0.03745696693658829,
"rewards/rejected": 0.01317988894879818,
"step": 1640
},
{
"epoch": 0.7919366450683946,
"grad_norm": 48.0,
"learning_rate": 6.302203646383239e-08,
"logits/chosen": -2.71480131149292,
"logits/rejected": -2.6168365478515625,
"logps/chosen": -241.9342041015625,
"logps/rejected": -196.26498413085938,
"loss": 0.6745,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.047247517853975296,
"rewards/margins": 0.04049244523048401,
"rewards/rejected": 0.006755062844604254,
"step": 1650
},
{
"epoch": 0.7967362610991121,
"grad_norm": 50.0,
"learning_rate": 6.02664453157703e-08,
"logits/chosen": -2.764697790145874,
"logits/rejected": -2.6510090827941895,
"logps/chosen": -232.6202850341797,
"logps/rejected": -214.68887329101562,
"loss": 0.6789,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.04804060235619545,
"rewards/margins": 0.031984902918338776,
"rewards/rejected": 0.016055695712566376,
"step": 1660
},
{
"epoch": 0.8015358771298297,
"grad_norm": 39.25,
"learning_rate": 5.756417470839195e-08,
"logits/chosen": -2.7477545738220215,
"logits/rejected": -2.6470861434936523,
"logps/chosen": -226.798828125,
"logps/rejected": -196.84617614746094,
"loss": 0.6756,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.04336618259549141,
"rewards/margins": 0.03806891292333603,
"rewards/rejected": 0.005297265015542507,
"step": 1670
},
{
"epoch": 0.8063354931605472,
"grad_norm": 46.0,
"learning_rate": 5.491598405681558e-08,
"logits/chosen": -2.7781832218170166,
"logits/rejected": -2.594255208969116,
"logps/chosen": -244.9146728515625,
"logps/rejected": -191.92153930664062,
"loss": 0.6696,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.05770384520292282,
"rewards/margins": 0.05054632946848869,
"rewards/rejected": 0.007157514337450266,
"step": 1680
},
{
"epoch": 0.8111351091912647,
"grad_norm": 46.25,
"learning_rate": 5.232261757814924e-08,
"logits/chosen": -2.676637649536133,
"logits/rejected": -2.5312628746032715,
"logps/chosen": -239.4225616455078,
"logps/rejected": -201.6903076171875,
"loss": 0.6665,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.06439922004938126,
"rewards/margins": 0.0572139136493206,
"rewards/rejected": 0.007185307331383228,
"step": 1690
},
{
"epoch": 0.8159347252219823,
"grad_norm": 48.0,
"learning_rate": 4.978480408234465e-08,
"logits/chosen": -2.6256635189056396,
"logits/rejected": -2.5944604873657227,
"logps/chosen": -213.68728637695312,
"logps/rejected": -203.4552459716797,
"loss": 0.6819,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.035785894840955734,
"rewards/margins": 0.025840366259217262,
"rewards/rejected": 0.00994553230702877,
"step": 1700
},
{
"epoch": 0.8207343412526998,
"grad_norm": 49.0,
"learning_rate": 4.730325676738089e-08,
"logits/chosen": -2.7068257331848145,
"logits/rejected": -2.5982439517974854,
"logps/chosen": -227.2494354248047,
"logps/rejected": -194.36875915527344,
"loss": 0.6773,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.041812531650066376,
"rewards/margins": 0.0348488949239254,
"rewards/rejected": 0.006963637657463551,
"step": 1710
},
{
"epoch": 0.8255339572834173,
"grad_norm": 47.0,
"learning_rate": 4.487867301883527e-08,
"logits/chosen": -2.6358511447906494,
"logits/rejected": -2.5312399864196777,
"logps/chosen": -222.84445190429688,
"logps/rejected": -203.24679565429688,
"loss": 0.6697,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.05580927059054375,
"rewards/margins": 0.04995386302471161,
"rewards/rejected": 0.005855409428477287,
"step": 1720
},
{
"epoch": 0.8303335733141348,
"grad_norm": 45.75,
"learning_rate": 4.2511734213898085e-08,
"logits/chosen": -2.7530338764190674,
"logits/rejected": -2.6062283515930176,
"logps/chosen": -251.0270233154297,
"logps/rejected": -206.8340301513672,
"loss": 0.6759,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.0477396696805954,
"rewards/margins": 0.038779519498348236,
"rewards/rejected": 0.008960146456956863,
"step": 1730
},
{
"epoch": 0.8351331893448524,
"grad_norm": 59.75,
"learning_rate": 4.020310552988632e-08,
"logits/chosen": -2.726264476776123,
"logits/rejected": -2.5524630546569824,
"logps/chosen": -247.4801788330078,
"logps/rejected": -204.47012329101562,
"loss": 0.67,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.0596492774784565,
"rewards/margins": 0.04989113658666611,
"rewards/rejected": 0.009758138097822666,
"step": 1740
},
{
"epoch": 0.8399328053755699,
"grad_norm": 48.5,
"learning_rate": 3.795343575730975e-08,
"logits/chosen": -2.7325968742370605,
"logits/rejected": -2.602531671524048,
"logps/chosen": -251.66342163085938,
"logps/rejected": -216.50717163085938,
"loss": 0.6805,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.052908021956682205,
"rewards/margins": 0.029805365949869156,
"rewards/rejected": 0.0231026578694582,
"step": 1750
},
{
"epoch": 0.8447324214062875,
"grad_norm": 50.75,
"learning_rate": 3.576335711754236e-08,
"logits/chosen": -2.7325785160064697,
"logits/rejected": -2.6831870079040527,
"logps/chosen": -234.35336303710938,
"logps/rejected": -216.2118682861328,
"loss": 0.6801,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.04432320594787598,
"rewards/margins": 0.028889168053865433,
"rewards/rejected": 0.015434036031365395,
"step": 1760
},
{
"epoch": 0.849532037437005,
"grad_norm": 52.25,
"learning_rate": 3.363348508515015e-08,
"logits/chosen": -2.7496337890625,
"logits/rejected": -2.6104989051818848,
"logps/chosen": -236.09640502929688,
"logps/rejected": -209.546875,
"loss": 0.6741,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.05187439173460007,
"rewards/margins": 0.041690729558467865,
"rewards/rejected": 0.010183664970099926,
"step": 1770
},
{
"epoch": 0.8543316534677226,
"grad_norm": 52.0,
"learning_rate": 3.156441821492506e-08,
"logits/chosen": -2.709050416946411,
"logits/rejected": -2.584873676300049,
"logps/chosen": -234.66232299804688,
"logps/rejected": -206.11624145507812,
"loss": 0.6738,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.05997660011053085,
"rewards/margins": 0.04241427406668663,
"rewards/rejected": 0.017562326043844223,
"step": 1780
},
{
"epoch": 0.8591312694984401,
"grad_norm": 48.0,
"learning_rate": 2.955673797367411e-08,
"logits/chosen": -2.7106432914733887,
"logits/rejected": -2.567945718765259,
"logps/chosen": -238.9900665283203,
"logps/rejected": -189.52700805664062,
"loss": 0.6738,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.052786171436309814,
"rewards/margins": 0.04187396913766861,
"rewards/rejected": 0.01091220136731863,
"step": 1790
},
{
"epoch": 0.8639308855291576,
"grad_norm": 49.75,
"learning_rate": 2.7611008576810674e-08,
"logits/chosen": -2.724682331085205,
"logits/rejected": -2.603818416595459,
"logps/chosen": -215.18359375,
"logps/rejected": -187.88693237304688,
"loss": 0.6833,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.03471002355217934,
"rewards/margins": 0.02330438420176506,
"rewards/rejected": 0.011405635625123978,
"step": 1800
},
{
"epoch": 0.8639308855291576,
"eval_logits/chosen": -2.716348886489868,
"eval_logits/rejected": -2.595895290374756,
"eval_logps/chosen": -232.60520935058594,
"eval_logps/rejected": -205.4226837158203,
"eval_loss": 0.673865795135498,
"eval_rewards/accuracies": 0.6620000004768372,
"eval_rewards/chosen": 0.05277761444449425,
"eval_rewards/margins": 0.041967809200286865,
"eval_rewards/rejected": 0.010809808038175106,
"eval_runtime": 21.4412,
"eval_samples_per_second": 46.639,
"eval_steps_per_second": 11.66,
"step": 1800
},
{
"epoch": 0.8687305015598752,
"grad_norm": 53.5,
"learning_rate": 2.5727776829793767e-08,
"logits/chosen": -2.740374803543091,
"logits/rejected": -2.5743534564971924,
"logps/chosen": -236.7235565185547,
"logps/rejected": -174.8079833984375,
"loss": 0.6725,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.045521851629018784,
"rewards/margins": 0.04434143006801605,
"rewards/rejected": 0.0011804220266640186,
"step": 1810
},
{
"epoch": 0.8735301175905927,
"grad_norm": 52.75,
"learning_rate": 2.390757197446025e-08,
"logits/chosen": -2.721191883087158,
"logits/rejected": -2.5652270317077637,
"logps/chosen": -240.0341339111328,
"logps/rejected": -187.92848205566406,
"loss": 0.6739,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.05156964808702469,
"rewards/margins": 0.041832335293293,
"rewards/rejected": 0.009737305343151093,
"step": 1820
},
{
"epoch": 0.8783297336213103,
"grad_norm": 50.0,
"learning_rate": 2.2150905540292585e-08,
"logits/chosen": -2.736666679382324,
"logits/rejected": -2.5938286781311035,
"logps/chosen": -227.4267578125,
"logps/rejected": -202.85935974121094,
"loss": 0.6757,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.054040707647800446,
"rewards/margins": 0.03809930384159088,
"rewards/rejected": 0.015941400080919266,
"step": 1830
},
{
"epoch": 0.8831293496520278,
"grad_norm": 46.25,
"learning_rate": 2.0458271200664624e-08,
"logits/chosen": -2.6549439430236816,
"logits/rejected": -2.612755537033081,
"logps/chosen": -210.38265991210938,
"logps/rejected": -196.44735717773438,
"loss": 0.6762,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.04778756946325302,
"rewards/margins": 0.0375523678958416,
"rewards/rejected": 0.01023520715534687,
"step": 1840
},
{
"epoch": 0.8879289656827454,
"grad_norm": 48.0,
"learning_rate": 1.8830144634105206e-08,
"logits/chosen": -2.7017998695373535,
"logits/rejected": -2.5428473949432373,
"logps/chosen": -245.57290649414062,
"logps/rejected": -186.67721557617188,
"loss": 0.6703,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.061512064188718796,
"rewards/margins": 0.04928315803408623,
"rewards/rejected": 0.012228906154632568,
"step": 1850
},
{
"epoch": 0.8927285817134629,
"grad_norm": 46.5,
"learning_rate": 1.7266983390618994e-08,
"logits/chosen": -2.6695878505706787,
"logits/rejected": -2.551301956176758,
"logps/chosen": -227.3214111328125,
"logps/rejected": -186.639404296875,
"loss": 0.6687,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.0619601309299469,
"rewards/margins": 0.05267205834388733,
"rewards/rejected": 0.009288066066801548,
"step": 1860
},
{
"epoch": 0.8975281977441805,
"grad_norm": 44.5,
"learning_rate": 1.5769226763101885e-08,
"logits/chosen": -2.60475492477417,
"logits/rejected": -2.5645339488983154,
"logps/chosen": -228.18612670898438,
"logps/rejected": -207.1147003173828,
"loss": 0.6772,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.038809359073638916,
"rewards/margins": 0.03571794182062149,
"rewards/rejected": 0.0030914172530174255,
"step": 1870
},
{
"epoch": 0.902327813774898,
"grad_norm": 47.0,
"learning_rate": 1.4337295663887084e-08,
"logits/chosen": -2.763521194458008,
"logits/rejected": -2.614365339279175,
"logps/chosen": -238.0912628173828,
"logps/rejected": -187.44529724121094,
"loss": 0.6705,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.06117742136120796,
"rewards/margins": 0.04880157858133316,
"rewards/rejected": 0.012375839985907078,
"step": 1880
},
{
"epoch": 0.9071274298056156,
"grad_norm": 52.75,
"learning_rate": 1.2971592506456796e-08,
"logits/chosen": -2.6662535667419434,
"logits/rejected": -2.585869550704956,
"logps/chosen": -203.06375122070312,
"logps/rejected": -187.27978515625,
"loss": 0.6761,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.04065801948308945,
"rewards/margins": 0.03694169595837593,
"rewards/rejected": 0.0037163265515118837,
"step": 1890
},
{
"epoch": 0.9119270458363331,
"grad_norm": 46.75,
"learning_rate": 1.1672501092352544e-08,
"logits/chosen": -2.7174623012542725,
"logits/rejected": -2.5767555236816406,
"logps/chosen": -239.62747192382812,
"logps/rejected": -203.18661499023438,
"loss": 0.6694,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.05886067822575569,
"rewards/margins": 0.05083751678466797,
"rewards/rejected": 0.008023159578442574,
"step": 1900
},
{
"epoch": 0.9167266618670507,
"grad_norm": 50.5,
"learning_rate": 1.0440386503315967e-08,
"logits/chosen": -2.638658046722412,
"logits/rejected": -2.565709114074707,
"logps/chosen": -222.6656494140625,
"logps/rejected": -242.17562866210938,
"loss": 0.6788,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.049609534442424774,
"rewards/margins": 0.0319090262055397,
"rewards/rejected": 0.01770050823688507,
"step": 1910
},
{
"epoch": 0.9215262778977682,
"grad_norm": 49.25,
"learning_rate": 9.275594998690573e-09,
"logits/chosen": -2.688535690307617,
"logits/rejected": -2.516364574432373,
"logps/chosen": -248.7972412109375,
"logps/rejected": -191.66940307617188,
"loss": 0.6685,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.0669298768043518,
"rewards/margins": 0.052894193679094315,
"rewards/rejected": 0.014035684056580067,
"step": 1920
},
{
"epoch": 0.9263258939284857,
"grad_norm": 50.75,
"learning_rate": 8.178453918112782e-09,
"logits/chosen": -2.695676326751709,
"logits/rejected": -2.564342737197876,
"logps/chosen": -223.0631561279297,
"logps/rejected": -179.12435913085938,
"loss": 0.6699,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.04940425604581833,
"rewards/margins": 0.049749527126550674,
"rewards/rejected": -0.00034527387470006943,
"step": 1930
},
{
"epoch": 0.9311255099592033,
"grad_norm": 64.5,
"learning_rate": 7.149271589520167e-09,
"logits/chosen": -2.655266523361206,
"logits/rejected": -2.529818058013916,
"logps/chosen": -209.69650268554688,
"logps/rejected": -191.54139709472656,
"loss": 0.6801,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.03373004496097565,
"rewards/margins": 0.029740754514932632,
"rewards/rejected": 0.0039892946369946,
"step": 1940
},
{
"epoch": 0.9359251259899208,
"grad_norm": 40.75,
"learning_rate": 6.188337242502784e-09,
"logits/chosen": -2.6980903148651123,
"logits/rejected": -2.5514559745788574,
"logps/chosen": -236.17300415039062,
"logps/rejected": -191.43814086914062,
"loss": 0.6731,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.051222801208496094,
"rewards/margins": 0.04344618320465088,
"rewards/rejected": 0.007776615209877491,
"step": 1950
},
{
"epoch": 0.9407247420206384,
"grad_norm": 43.5,
"learning_rate": 5.295920927021108e-09,
"logits/chosen": -2.7303788661956787,
"logits/rejected": -2.624377727508545,
"logps/chosen": -233.98922729492188,
"logps/rejected": -197.11575317382812,
"loss": 0.6741,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.05416171997785568,
"rewards/margins": 0.04171646013855934,
"rewards/rejected": 0.01244526170194149,
"step": 1960
},
{
"epoch": 0.9455243580513559,
"grad_norm": 53.0,
"learning_rate": 4.472273437514357e-09,
"logits/chosen": -2.7538888454437256,
"logits/rejected": -2.6263771057128906,
"logps/chosen": -257.5274353027344,
"logps/rejected": -210.43649291992188,
"loss": 0.6698,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.0656302273273468,
"rewards/margins": 0.050034552812576294,
"rewards/rejected": 0.015595669858157635,
"step": 1970
},
{
"epoch": 0.9503239740820735,
"grad_norm": 46.75,
"learning_rate": 3.7176262424202522e-09,
"logits/chosen": -2.710458278656006,
"logits/rejected": -2.611675262451172,
"logps/chosen": -221.28195190429688,
"logps/rejected": -202.12786865234375,
"loss": 0.6825,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.03624237701296806,
"rewards/margins": 0.023813677951693535,
"rewards/rejected": 0.012428699992597103,
"step": 1980
},
{
"epoch": 0.955123590112791,
"grad_norm": 51.25,
"learning_rate": 3.0321914191255292e-09,
"logits/chosen": -2.684296131134033,
"logits/rejected": -2.57779598236084,
"logps/chosen": -241.6301727294922,
"logps/rejected": -214.57138061523438,
"loss": 0.6789,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.044890034943819046,
"rewards/margins": 0.03138625621795654,
"rewards/rejected": 0.013503775000572205,
"step": 1990
},
{
"epoch": 0.9599232061435086,
"grad_norm": 45.25,
"learning_rate": 2.416161594366417e-09,
"logits/chosen": -2.744062900543213,
"logits/rejected": -2.644768238067627,
"logps/chosen": -218.0794677734375,
"logps/rejected": -204.07879638671875,
"loss": 0.6743,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.05610308051109314,
"rewards/margins": 0.04027719795703888,
"rewards/rejected": 0.015825878828763962,
"step": 2000
},
{
"epoch": 0.9599232061435086,
"eval_logits/chosen": -2.716303825378418,
"eval_logits/rejected": -2.5958354473114014,
"eval_logps/chosen": -232.603271484375,
"eval_logps/rejected": -205.41600036621094,
"eval_loss": 0.6740667223930359,
"eval_rewards/accuracies": 0.6449999809265137,
"eval_rewards/chosen": 0.05297102406620979,
"eval_rewards/margins": 0.04149361699819565,
"eval_rewards/rejected": 0.011477403342723846,
"eval_runtime": 21.4581,
"eval_samples_per_second": 46.602,
"eval_steps_per_second": 11.651,
"step": 2000
},
{
"epoch": 0.9647228221742261,
"grad_norm": 46.75,
"learning_rate": 1.8697098900948283e-09,
"logits/chosen": -2.670266628265381,
"logits/rejected": -2.5685760974884033,
"logps/chosen": -225.6419219970703,
"logps/rejected": -218.05624389648438,
"loss": 0.6796,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.04892081022262573,
"rewards/margins": 0.0300876684486866,
"rewards/rejected": 0.01883314736187458,
"step": 2010
},
{
"epoch": 0.9695224382049437,
"grad_norm": 46.5,
"learning_rate": 1.3929898748261948e-09,
"logits/chosen": -2.751359224319458,
"logits/rejected": -2.6107022762298584,
"logps/chosen": -228.06881713867188,
"logps/rejected": -205.7585906982422,
"loss": 0.6756,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.042551226913928986,
"rewards/margins": 0.03797770291566849,
"rewards/rejected": 0.004573523066937923,
"step": 2020
},
{
"epoch": 0.9743220542356611,
"grad_norm": 50.0,
"learning_rate": 9.861355204825172e-10,
"logits/chosen": -2.7200140953063965,
"logits/rejected": -2.597716808319092,
"logps/chosen": -256.5636291503906,
"logps/rejected": -198.85813903808594,
"loss": 0.6759,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.05483978986740112,
"rewards/margins": 0.03850018233060837,
"rewards/rejected": 0.016339603811502457,
"step": 2030
},
{
"epoch": 0.9791216702663786,
"grad_norm": 47.25,
"learning_rate": 6.492611647420932e-10,
"logits/chosen": -2.6937568187713623,
"logits/rejected": -2.550854206085205,
"logps/chosen": -227.3085479736328,
"logps/rejected": -193.7042694091797,
"loss": 0.6746,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.0502045638859272,
"rewards/margins": 0.040242839604616165,
"rewards/rejected": 0.009961729869246483,
"step": 2040
},
{
"epoch": 0.9839212862970962,
"grad_norm": 50.0,
"learning_rate": 3.8246147890763636e-10,
"logits/chosen": -2.7479116916656494,
"logits/rejected": -2.6015403270721436,
"logps/chosen": -243.7687530517578,
"logps/rejected": -200.82359313964844,
"loss": 0.6735,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.05925138667225838,
"rewards/margins": 0.042620036751031876,
"rewards/rejected": 0.016631346195936203,
"step": 2050
},
{
"epoch": 0.9887209023278137,
"grad_norm": 44.25,
"learning_rate": 1.8581144130089266e-10,
"logits/chosen": -2.674731731414795,
"logits/rejected": -2.6031594276428223,
"logps/chosen": -220.9689483642578,
"logps/rejected": -206.5128173828125,
"loss": 0.6804,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.046965621411800385,
"rewards/margins": 0.028399232774972916,
"rewards/rejected": 0.018566394224762917,
"step": 2060
},
{
"epoch": 0.9935205183585313,
"grad_norm": 45.25,
"learning_rate": 5.936631619152255e-11,
"logits/chosen": -2.718005895614624,
"logits/rejected": -2.6375200748443604,
"logps/chosen": -232.222900390625,
"logps/rejected": -203.7240753173828,
"loss": 0.6763,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.05353207513689995,
"rewards/margins": 0.03654414042830467,
"rewards/rejected": 0.016987936571240425,
"step": 2070
},
{
"epoch": 0.9983201343892488,
"grad_norm": 49.25,
"learning_rate": 3.1616382663024467e-12,
"logits/chosen": -2.7572877407073975,
"logits/rejected": -2.611027956008911,
"logps/chosen": -232.747314453125,
"logps/rejected": -208.55935668945312,
"loss": 0.6791,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.04861464723944664,
"rewards/margins": 0.03146715834736824,
"rewards/rejected": 0.0171474888920784,
"step": 2080
},
{
"epoch": 0.9997600191984641,
"step": 2083,
"total_flos": 0.0,
"train_loss": 0.6792905164692074,
"train_runtime": 2163.7863,
"train_samples_per_second": 15.405,
"train_steps_per_second": 0.963
}
],
"logging_steps": 10,
"max_steps": 2083,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}