Files
TinyLlama-1.1B-IPO-PKU-Safe…/trainer_state.json
ModelHub XC d540a44812 初始化项目,由ModelHub XC社区提供模型
Model: AIPlans/TinyLlama-1.1B-IPO-PKU-SafeRLHF
Source: Original Platform
2026-06-06 11:06:21 +08:00

3323 lines
113 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997600191984641,
"eval_steps": 200,
"global_step": 2083,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004799616030717543,
"grad_norm": 96.5,
"learning_rate": 2.3923444976076555e-07,
"logits/chosen": -2.8452980518341064,
"logits/rejected": -2.691192626953125,
"logps/chosen": -1.994722604751587,
"logps/rejected": -2.0614635944366455,
"loss": 24.9987,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": 4.477056791074574e-06,
"rewards/margins": 1.363817136734724e-05,
"rewards/rejected": -9.161123671219684e-06,
"step": 10
},
{
"epoch": 0.009599232061435085,
"grad_norm": 85.0,
"learning_rate": 4.784688995215311e-07,
"logits/chosen": -2.862319231033325,
"logits/rejected": -2.7300217151641846,
"logps/chosen": -1.9412975311279297,
"logps/rejected": -1.9797922372817993,
"loss": 25.0066,
"rewards/accuracies": 0.4375,
"rewards/chosen": -2.4267139451694675e-05,
"rewards/margins": -6.541357288369909e-05,
"rewards/rejected": 4.114643161301501e-05,
"step": 20
},
{
"epoch": 0.014398848092152628,
"grad_norm": 112.5,
"learning_rate": 7.177033492822967e-07,
"logits/chosen": -2.8420886993408203,
"logits/rejected": -2.7661550045013428,
"logps/chosen": -2.054128646850586,
"logps/rejected": -2.0443522930145264,
"loss": 24.9991,
"rewards/accuracies": 0.5625,
"rewards/chosen": 1.9722799606824992e-06,
"rewards/margins": 9.74129761743825e-06,
"rewards/rejected": -7.769021976855583e-06,
"step": 30
},
{
"epoch": 0.01919846412287017,
"grad_norm": 88.5,
"learning_rate": 9.569377990430622e-07,
"logits/chosen": -2.8617727756500244,
"logits/rejected": -2.683840036392212,
"logps/chosen": -2.0417850017547607,
"logps/rejected": -1.9743082523345947,
"loss": 25.0031,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -4.878617619397119e-05,
"rewards/margins": -3.112506965408102e-05,
"rewards/rejected": -1.7661115634837188e-05,
"step": 40
},
{
"epoch": 0.023998080153587713,
"grad_norm": 100.5,
"learning_rate": 1.196172248803828e-06,
"logits/chosen": -2.817301034927368,
"logits/rejected": -2.731520175933838,
"logps/chosen": -2.010397434234619,
"logps/rejected": -2.070805788040161,
"loss": 24.9987,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -2.268821481266059e-05,
"rewards/margins": 1.2805535334337037e-05,
"rewards/rejected": -3.549374741851352e-05,
"step": 50
},
{
"epoch": 0.028797696184305256,
"grad_norm": 105.5,
"learning_rate": 1.4354066985645934e-06,
"logits/chosen": -2.8474650382995605,
"logits/rejected": -2.7305524349212646,
"logps/chosen": -1.9656894207000732,
"logps/rejected": -1.9973506927490234,
"loss": 24.998,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -3.9483980799559504e-05,
"rewards/margins": 1.9929146219510585e-05,
"rewards/rejected": -5.94131342950277e-05,
"step": 60
},
{
"epoch": 0.033597312215022795,
"grad_norm": 148.0,
"learning_rate": 1.6746411483253591e-06,
"logits/chosen": -2.882148504257202,
"logits/rejected": -2.741283416748047,
"logps/chosen": -2.0739669799804688,
"logps/rejected": -2.1058197021484375,
"loss": 24.9851,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -7.972978346515447e-05,
"rewards/margins": 0.00014972247299738228,
"rewards/rejected": -0.00022945224191062152,
"step": 70
},
{
"epoch": 0.03839692824574034,
"grad_norm": 82.0,
"learning_rate": 1.9138755980861244e-06,
"logits/chosen": -2.8617637157440186,
"logits/rejected": -2.7453629970550537,
"logps/chosen": -2.0088212490081787,
"logps/rejected": -2.0065696239471436,
"loss": 24.9943,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.0001579125237185508,
"rewards/margins": 5.802646046504378e-05,
"rewards/rejected": -0.00021593898418359458,
"step": 80
},
{
"epoch": 0.04319654427645788,
"grad_norm": 125.5,
"learning_rate": 2.15311004784689e-06,
"logits/chosen": -2.867288112640381,
"logits/rejected": -2.6688222885131836,
"logps/chosen": -2.0158286094665527,
"logps/rejected": -1.9923591613769531,
"loss": 24.9833,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.00020580650016199797,
"rewards/margins": 0.00016745278844609857,
"rewards/rejected": -0.0003732592740561813,
"step": 90
},
{
"epoch": 0.04799616030717543,
"grad_norm": 99.0,
"learning_rate": 2.392344497607656e-06,
"logits/chosen": -2.8639748096466064,
"logits/rejected": -2.7186648845672607,
"logps/chosen": -2.0739400386810303,
"logps/rejected": -2.125810384750366,
"loss": 24.9752,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.0003014372196048498,
"rewards/margins": 0.00024825712898746133,
"rewards/rejected": -0.0005496944067999721,
"step": 100
},
{
"epoch": 0.052795776337892966,
"grad_norm": 90.0,
"learning_rate": 2.631578947368421e-06,
"logits/chosen": -2.864802837371826,
"logits/rejected": -2.6625478267669678,
"logps/chosen": -2.05049467086792,
"logps/rejected": -1.9859590530395508,
"loss": 24.9738,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0004892169963568449,
"rewards/margins": 0.00026246957713738084,
"rewards/rejected": -0.0007516865734942257,
"step": 110
},
{
"epoch": 0.05759539236861051,
"grad_norm": 109.0,
"learning_rate": 2.870813397129187e-06,
"logits/chosen": -2.8730175495147705,
"logits/rejected": -2.7375855445861816,
"logps/chosen": -1.9972280263900757,
"logps/rejected": -2.052138566970825,
"loss": 24.965,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.0006397126126103103,
"rewards/margins": 0.0003507775254547596,
"rewards/rejected": -0.0009904901962727308,
"step": 120
},
{
"epoch": 0.06239500839932805,
"grad_norm": 118.0,
"learning_rate": 3.1100478468899525e-06,
"logits/chosen": -2.836747646331787,
"logits/rejected": -2.7146553993225098,
"logps/chosen": -2.092850923538208,
"logps/rejected": -2.0353379249572754,
"loss": 24.9448,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.0006367649766616523,
"rewards/margins": 0.0005542241269722581,
"rewards/rejected": -0.0011909890454262495,
"step": 130
},
{
"epoch": 0.06719462443004559,
"grad_norm": 114.0,
"learning_rate": 3.3492822966507182e-06,
"logits/chosen": -2.792067050933838,
"logits/rejected": -2.735835075378418,
"logps/chosen": -2.027514934539795,
"logps/rejected": -2.1196627616882324,
"loss": 24.9435,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0009214409510605037,
"rewards/margins": 0.0005676588043570518,
"rewards/rejected": -0.0014890998136252165,
"step": 140
},
{
"epoch": 0.07199424046076314,
"grad_norm": 96.0,
"learning_rate": 3.5885167464114835e-06,
"logits/chosen": -2.849137306213379,
"logits/rejected": -2.7531142234802246,
"logps/chosen": -2.0570991039276123,
"logps/rejected": -2.0227091312408447,
"loss": 24.9489,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.0011024547275155783,
"rewards/margins": 0.000515233725309372,
"rewards/rejected": -0.0016176884528249502,
"step": 150
},
{
"epoch": 0.07679385649148068,
"grad_norm": 112.5,
"learning_rate": 3.827751196172249e-06,
"logits/chosen": -2.81174635887146,
"logits/rejected": -2.698882579803467,
"logps/chosen": -2.147491931915283,
"logps/rejected": -2.1290345191955566,
"loss": 24.9252,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.001297778682783246,
"rewards/margins": 0.0007537564961239696,
"rewards/rejected": -0.0020515350624918938,
"step": 160
},
{
"epoch": 0.08159347252219823,
"grad_norm": 88.0,
"learning_rate": 4.066985645933015e-06,
"logits/chosen": -2.873908281326294,
"logits/rejected": -2.6953749656677246,
"logps/chosen": -2.028491258621216,
"logps/rejected": -2.0439553260803223,
"loss": 24.9024,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.0015579147730022669,
"rewards/margins": 0.0009802932618185878,
"rewards/rejected": -0.002538207918405533,
"step": 170
},
{
"epoch": 0.08639308855291576,
"grad_norm": 82.5,
"learning_rate": 4.30622009569378e-06,
"logits/chosen": -2.8824820518493652,
"logits/rejected": -2.7608413696289062,
"logps/chosen": -1.9971075057983398,
"logps/rejected": -1.9878934621810913,
"loss": 24.9013,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.0020060662645846605,
"rewards/margins": 0.0009957450674846768,
"rewards/rejected": -0.003001810982823372,
"step": 180
},
{
"epoch": 0.09119270458363331,
"grad_norm": 129.0,
"learning_rate": 4.5454545454545455e-06,
"logits/chosen": -2.8189311027526855,
"logits/rejected": -2.7244656085968018,
"logps/chosen": -2.0750174522399902,
"logps/rejected": -2.0801753997802734,
"loss": 24.791,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.002291204873472452,
"rewards/margins": 0.002107539912685752,
"rewards/rejected": -0.00439874455332756,
"step": 190
},
{
"epoch": 0.09599232061435085,
"grad_norm": 197.0,
"learning_rate": 4.784688995215312e-06,
"logits/chosen": -2.8791394233703613,
"logits/rejected": -2.741698980331421,
"logps/chosen": -2.0747439861297607,
"logps/rejected": -2.146533966064453,
"loss": 24.8672,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.003108713310211897,
"rewards/margins": 0.0013482414651662111,
"rewards/rejected": -0.004456955008208752,
"step": 200
},
{
"epoch": 0.09599232061435085,
"eval_logits/chosen": -2.8479666709899902,
"eval_logits/rejected": -2.7184581756591797,
"eval_logps/chosen": -2.0334408283233643,
"eval_logps/rejected": -2.053351879119873,
"eval_loss": 24.818401336669922,
"eval_rewards/accuracies": 0.7068871855735779,
"eval_rewards/chosen": -0.0031438919249922037,
"eval_rewards/margins": 0.0018456254620105028,
"eval_rewards/rejected": -0.0049895173870027065,
"eval_runtime": 42.4209,
"eval_samples_per_second": 86.938,
"eval_steps_per_second": 21.735,
"step": 200
},
{
"epoch": 0.1007919366450684,
"grad_norm": 93.0,
"learning_rate": 4.999996487062011e-06,
"logits/chosen": -2.792966365814209,
"logits/rejected": -2.71134352684021,
"logps/chosen": -2.032783269882202,
"logps/rejected": -2.0676028728485107,
"loss": 24.8449,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.0031417335849255323,
"rewards/margins": 0.0015724034747108817,
"rewards/rejected": -0.0047141374088823795,
"step": 210
},
{
"epoch": 0.10559155267578593,
"grad_norm": 129.0,
"learning_rate": 4.999574946449064e-06,
"logits/chosen": -2.855587959289551,
"logits/rejected": -2.7221155166625977,
"logps/chosen": -1.9932962656021118,
"logps/rejected": -1.971994161605835,
"loss": 24.784,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.0033854867797344923,
"rewards/margins": 0.0021876911632716656,
"rewards/rejected": -0.0055731781758368015,
"step": 220
},
{
"epoch": 0.11039116870650348,
"grad_norm": 135.0,
"learning_rate": 4.9984509539801644e-06,
"logits/chosen": -2.801809310913086,
"logits/rejected": -2.6762855052948,
"logps/chosen": -2.089353561401367,
"logps/rejected": -2.1304306983947754,
"loss": 24.8228,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.004665095824748278,
"rewards/margins": 0.0018366615986451507,
"rewards/rejected": -0.0065017566084861755,
"step": 230
},
{
"epoch": 0.11519078473722102,
"grad_norm": 89.5,
"learning_rate": 4.996624825529257e-06,
"logits/chosen": -2.8682496547698975,
"logits/rejected": -2.739226818084717,
"logps/chosen": -1.9662659168243408,
"logps/rejected": -2.0159642696380615,
"loss": 24.7663,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.004926943685859442,
"rewards/margins": 0.0023865732364356518,
"rewards/rejected": -0.0073135169222950935,
"step": 240
},
{
"epoch": 0.11999040076793857,
"grad_norm": 88.0,
"learning_rate": 4.994097074290524e-06,
"logits/chosen": -2.8152401447296143,
"logits/rejected": -2.6869540214538574,
"logps/chosen": -2.022200107574463,
"logps/rejected": -2.060678005218506,
"loss": 24.7275,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.0045785121619701385,
"rewards/margins": 0.0027693600859493017,
"rewards/rejected": -0.007347872015088797,
"step": 250
},
{
"epoch": 0.1247900167986561,
"grad_norm": 97.0,
"learning_rate": 4.990868410634163e-06,
"logits/chosen": -2.8812079429626465,
"logits/rejected": -2.762089490890503,
"logps/chosen": -2.090604305267334,
"logps/rejected": -2.0254673957824707,
"loss": 24.7174,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.005911321844905615,
"rewards/margins": 0.0029030940495431423,
"rewards/rejected": -0.008814416825771332,
"step": 260
},
{
"epoch": 0.12958963282937366,
"grad_norm": 99.5,
"learning_rate": 4.9869397419067535e-06,
"logits/chosen": -2.8669378757476807,
"logits/rejected": -2.765909433364868,
"logps/chosen": -1.991996169090271,
"logps/rejected": -2.146804094314575,
"loss": 24.6835,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.005290978588163853,
"rewards/margins": 0.0032333731651306152,
"rewards/rejected": -0.008524352684617043,
"step": 270
},
{
"epoch": 0.13438924886009118,
"grad_norm": 118.0,
"learning_rate": 4.982312172176264e-06,
"logits/chosen": -2.904074192047119,
"logits/rejected": -2.668342351913452,
"logps/chosen": -2.093069314956665,
"logps/rejected": -2.1040687561035156,
"loss": 24.6154,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.005795687437057495,
"rewards/margins": 0.003928179852664471,
"rewards/rejected": -0.009723867289721966,
"step": 280
},
{
"epoch": 0.13918886489080873,
"grad_norm": 79.5,
"learning_rate": 4.976987001921787e-06,
"logits/chosen": -2.8302197456359863,
"logits/rejected": -2.695197820663452,
"logps/chosen": -2.103074550628662,
"logps/rejected": -2.1673691272735596,
"loss": 24.4476,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.00616891635581851,
"rewards/margins": 0.005710528697818518,
"rewards/rejected": -0.011879445053637028,
"step": 290
},
{
"epoch": 0.14398848092152627,
"grad_norm": 147.0,
"learning_rate": 4.97096572766805e-06,
"logits/chosen": -2.846989870071411,
"logits/rejected": -2.7103562355041504,
"logps/chosen": -2.0663247108459473,
"logps/rejected": -2.1190006732940674,
"loss": 24.4601,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.00908294040709734,
"rewards/margins": 0.0055402969010174274,
"rewards/rejected": -0.01462323684245348,
"step": 300
},
{
"epoch": 0.14878809695224382,
"grad_norm": 136.0,
"learning_rate": 4.964250041564868e-06,
"logits/chosen": -2.809588670730591,
"logits/rejected": -2.695178747177124,
"logps/chosen": -2.110123634338379,
"logps/rejected": -2.0886242389678955,
"loss": 24.526,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.008971705101430416,
"rewards/margins": 0.004880317486822605,
"rewards/rejected": -0.01385202445089817,
"step": 310
},
{
"epoch": 0.15358771298296137,
"grad_norm": 128.0,
"learning_rate": 4.956841830911588e-06,
"logits/chosen": -2.797250747680664,
"logits/rejected": -2.6561648845672607,
"logps/chosen": -2.1683425903320312,
"logps/rejected": -2.1576995849609375,
"loss": 24.743,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.01187503058463335,
"rewards/margins": 0.0029168082401156425,
"rewards/rejected": -0.014791838824748993,
"step": 320
},
{
"epoch": 0.1583873290136789,
"grad_norm": 222.0,
"learning_rate": 4.9487431776267095e-06,
"logits/chosen": -2.8276097774505615,
"logits/rejected": -2.706470489501953,
"logps/chosen": -2.066014051437378,
"logps/rejected": -2.1523008346557617,
"loss": 24.459,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.009531533345580101,
"rewards/margins": 0.005659652408212423,
"rewards/rejected": -0.015191185288131237,
"step": 330
},
{
"epoch": 0.16318694504439646,
"grad_norm": 104.5,
"learning_rate": 4.939956357662806e-06,
"logits/chosen": -2.756941080093384,
"logits/rejected": -2.6084799766540527,
"logps/chosen": -2.158834457397461,
"logps/rejected": -2.2011919021606445,
"loss": 24.4022,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.010877870954573154,
"rewards/margins": 0.006291144993156195,
"rewards/rejected": -0.01716901734471321,
"step": 340
},
{
"epoch": 0.16798656107511398,
"grad_norm": 103.5,
"learning_rate": 4.9304838403669155e-06,
"logits/chosen": -2.749115228652954,
"logits/rejected": -2.6077589988708496,
"logps/chosen": -2.142630100250244,
"logps/rejected": -2.1457226276397705,
"loss": 24.4324,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.009920192882418633,
"rewards/margins": 0.0059211114421486855,
"rewards/rejected": -0.015841305255889893,
"step": 350
},
{
"epoch": 0.17278617710583152,
"grad_norm": 156.0,
"learning_rate": 4.920328287786587e-06,
"logits/chosen": -2.774263858795166,
"logits/rejected": -2.6588687896728516,
"logps/chosen": -2.1586852073669434,
"logps/rejected": -2.1538925170898438,
"loss": 24.3966,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0111884456127882,
"rewards/margins": 0.006309033837169409,
"rewards/rejected": -0.017497479915618896,
"step": 360
},
{
"epoch": 0.17758579313654907,
"grad_norm": 97.0,
"learning_rate": 4.909492553921761e-06,
"logits/chosen": -2.796356201171875,
"logits/rejected": -2.614142894744873,
"logps/chosen": -2.1897201538085938,
"logps/rejected": -2.2553699016571045,
"loss": 24.191,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.011354168877005577,
"rewards/margins": 0.00839508417993784,
"rewards/rejected": -0.019749252125620842,
"step": 370
},
{
"epoch": 0.18238540916726662,
"grad_norm": 111.5,
"learning_rate": 4.897979683922728e-06,
"logits/chosen": -2.825216770172119,
"logits/rejected": -2.7077155113220215,
"logps/chosen": -2.0987040996551514,
"logps/rejected": -2.1244139671325684,
"loss": 24.3523,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.011827873066067696,
"rewards/margins": 0.006785357836633921,
"rewards/rejected": -0.01861323043704033,
"step": 380
},
{
"epoch": 0.18718502519798416,
"grad_norm": 120.0,
"learning_rate": 4.885792913234339e-06,
"logits/chosen": -2.7297961711883545,
"logits/rejected": -2.6895813941955566,
"logps/chosen": -2.137523651123047,
"logps/rejected": -2.2706260681152344,
"loss": 24.3565,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.012831161729991436,
"rewards/margins": 0.006704425904899836,
"rewards/rejected": -0.01953558810055256,
"step": 390
},
{
"epoch": 0.1919846412287017,
"grad_norm": 115.5,
"learning_rate": 4.872935666686767e-06,
"logits/chosen": -2.751054525375366,
"logits/rejected": -2.6253774166107178,
"logps/chosen": -2.1478214263916016,
"logps/rejected": -2.2352182865142822,
"loss": 24.1359,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.012199820950627327,
"rewards/margins": 0.00899488665163517,
"rewards/rejected": -0.0211947038769722,
"step": 400
},
{
"epoch": 0.1919846412287017,
"eval_logits/chosen": -2.780019998550415,
"eval_logits/rejected": -2.6541929244995117,
"eval_logps/chosen": -2.1337366104125977,
"eval_logps/rejected": -2.2183115482330322,
"eval_loss": 24.21178436279297,
"eval_rewards/accuracies": 0.7350867390632629,
"eval_rewards/chosen": -0.01317345630377531,
"eval_rewards/margins": 0.008312047459185123,
"eval_rewards/rejected": -0.021485503762960434,
"eval_runtime": 42.3362,
"eval_samples_per_second": 87.112,
"eval_steps_per_second": 21.778,
"step": 400
},
{
"epoch": 0.19678425725941925,
"grad_norm": 102.0,
"learning_rate": 4.859411557533019e-06,
"logits/chosen": -2.7728092670440674,
"logits/rejected": -2.6491637229919434,
"logps/chosen": -2.127131700515747,
"logps/rejected": -2.2246241569519043,
"loss": 24.1942,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.011492127552628517,
"rewards/margins": 0.008327809162437916,
"rewards/rejected": -0.01981993392109871,
"step": 410
},
{
"epoch": 0.2015838732901368,
"grad_norm": 110.5,
"learning_rate": 4.8452243864335216e-06,
"logits/chosen": -2.7723233699798584,
"logits/rejected": -2.7067365646362305,
"logps/chosen": -2.156499147415161,
"logps/rejected": -2.325840473175049,
"loss": 24.3836,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.01635473594069481,
"rewards/margins": 0.006584727205336094,
"rewards/rejected": -0.022939462214708328,
"step": 420
},
{
"epoch": 0.20638348932085432,
"grad_norm": 122.5,
"learning_rate": 4.830378140388016e-06,
"logits/chosen": -2.872277021408081,
"logits/rejected": -2.6837317943573,
"logps/chosen": -2.1920905113220215,
"logps/rejected": -2.2887978553771973,
"loss": 23.9503,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.014477565884590149,
"rewards/margins": 0.01090961042791605,
"rewards/rejected": -0.025387173518538475,
"step": 430
},
{
"epoch": 0.21118310535157186,
"grad_norm": 115.0,
"learning_rate": 4.814876991615104e-06,
"logits/chosen": -2.725365161895752,
"logits/rejected": -2.641483783721924,
"logps/chosen": -2.1563963890075684,
"logps/rejected": -2.23152756690979,
"loss": 24.2356,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.017191153019666672,
"rewards/margins": 0.008020048029720783,
"rewards/rejected": -0.02521120011806488,
"step": 440
},
{
"epoch": 0.2159827213822894,
"grad_norm": 147.0,
"learning_rate": 4.798725296379736e-06,
"logits/chosen": -2.743837356567383,
"logits/rejected": -2.661710262298584,
"logps/chosen": -2.130923271179199,
"logps/rejected": -2.252439022064209,
"loss": 23.9156,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.01469920389354229,
"rewards/margins": 0.011496955528855324,
"rewards/rejected": -0.026196161285042763,
"step": 450
},
{
"epoch": 0.22078233741300696,
"grad_norm": 101.0,
"learning_rate": 4.781927593768969e-06,
"logits/chosen": -2.785146951675415,
"logits/rejected": -2.6474814414978027,
"logps/chosen": -2.1885874271392822,
"logps/rejected": -2.2562036514282227,
"loss": 23.9065,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.017711792141199112,
"rewards/margins": 0.011485849507153034,
"rewards/rejected": -0.02919764257967472,
"step": 460
},
{
"epoch": 0.2255819534437245,
"grad_norm": 146.0,
"learning_rate": 4.764488604416365e-06,
"logits/chosen": -2.772955894470215,
"logits/rejected": -2.5793957710266113,
"logps/chosen": -2.20389986038208,
"logps/rejected": -2.3331658840179443,
"loss": 23.8238,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.018257344141602516,
"rewards/margins": 0.012381313368678093,
"rewards/rejected": -0.03063865378499031,
"step": 470
},
{
"epoch": 0.23038156947444205,
"grad_norm": 135.0,
"learning_rate": 4.7464132291753464e-06,
"logits/chosen": -2.7145836353302,
"logits/rejected": -2.617164134979248,
"logps/chosen": -2.245074510574341,
"logps/rejected": -2.3874802589416504,
"loss": 23.8524,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.021537043154239655,
"rewards/margins": 0.012409242801368237,
"rewards/rejected": -0.033946286886930466,
"step": 480
},
{
"epoch": 0.2351811855051596,
"grad_norm": 139.0,
"learning_rate": 4.727706547741924e-06,
"logits/chosen": -2.707705020904541,
"logits/rejected": -2.570323944091797,
"logps/chosen": -2.1667869091033936,
"logps/rejected": -2.322767496109009,
"loss": 23.9966,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.021533768624067307,
"rewards/margins": 0.010775558650493622,
"rewards/rejected": -0.03230933099985123,
"step": 490
},
{
"epoch": 0.23998080153587714,
"grad_norm": 112.0,
"learning_rate": 4.708373817227158e-06,
"logits/chosen": -2.6808485984802246,
"logits/rejected": -2.531834840774536,
"logps/chosen": -2.254300594329834,
"logps/rejected": -2.368455410003662,
"loss": 24.0655,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.024044666439294815,
"rewards/margins": 0.010202976875007153,
"rewards/rejected": -0.03424764424562454,
"step": 500
},
{
"epoch": 0.24478041756659466,
"grad_norm": 178.0,
"learning_rate": 4.688420470679754e-06,
"logits/chosen": -2.6859443187713623,
"logits/rejected": -2.536170482635498,
"logps/chosen": -2.3234546184539795,
"logps/rejected": -2.406687021255493,
"loss": 23.7121,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.022238891571760178,
"rewards/margins": 0.013746020384132862,
"rewards/rejected": -0.03598490357398987,
"step": 510
},
{
"epoch": 0.2495800335973122,
"grad_norm": 207.0,
"learning_rate": 4.667852115559227e-06,
"logits/chosen": -2.669847249984741,
"logits/rejected": -2.5342414379119873,
"logps/chosen": -2.2682666778564453,
"logps/rejected": -2.420802354812622,
"loss": 23.3262,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.022443583235144615,
"rewards/margins": 0.01810765266418457,
"rewards/rejected": -0.040551237761974335,
"step": 520
},
{
"epoch": 0.2543796496280298,
"grad_norm": 264.0,
"learning_rate": 4.646674532160041e-06,
"logits/chosen": -2.7094626426696777,
"logits/rejected": -2.6159610748291016,
"logps/chosen": -2.3004629611968994,
"logps/rejected": -2.3994975090026855,
"loss": 23.563,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.02754206955432892,
"rewards/margins": 0.015528721734881401,
"rewards/rejected": -0.04307079315185547,
"step": 530
},
{
"epoch": 0.2591792656587473,
"grad_norm": 160.0,
"learning_rate": 4.6248936719871855e-06,
"logits/chosen": -2.679922342300415,
"logits/rejected": -2.583193063735962,
"logps/chosen": -2.330016613006592,
"logps/rejected": -2.447922945022583,
"loss": 23.3011,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.027103563770651817,
"rewards/margins": 0.01810879446566105,
"rewards/rejected": -0.045212358236312866,
"step": 540
},
{
"epoch": 0.2639788816894648,
"grad_norm": 111.5,
"learning_rate": 4.60251565608363e-06,
"logits/chosen": -2.7204761505126953,
"logits/rejected": -2.620889663696289,
"logps/chosen": -2.2654976844787598,
"logps/rejected": -2.431476593017578,
"loss": 23.6555,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.03067711926996708,
"rewards/margins": 0.014292201027274132,
"rewards/rejected": -0.04496932029724121,
"step": 550
},
{
"epoch": 0.26877849772018236,
"grad_norm": 158.0,
"learning_rate": 4.579546773310136e-06,
"logits/chosen": -2.6384267807006836,
"logits/rejected": -2.4383621215820312,
"logps/chosen": -2.314610004425049,
"logps/rejected": -2.5072264671325684,
"loss": 23.3107,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.0350789912045002,
"rewards/margins": 0.01811934821307659,
"rewards/rejected": -0.05319833755493164,
"step": 560
},
{
"epoch": 0.2735781137508999,
"grad_norm": 198.0,
"learning_rate": 4.5559934785779115e-06,
"logits/chosen": -2.727713108062744,
"logits/rejected": -2.4828808307647705,
"logps/chosen": -2.3135414123535156,
"logps/rejected": -2.619668960571289,
"loss": 22.5403,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.03365301340818405,
"rewards/margins": 0.027801329270005226,
"rewards/rejected": -0.06145433709025383,
"step": 570
},
{
"epoch": 0.27837772978161746,
"grad_norm": 163.0,
"learning_rate": 4.531862391034591e-06,
"logits/chosen": -2.579312801361084,
"logits/rejected": -2.5060434341430664,
"logps/chosen": -2.392556667327881,
"logps/rejected": -2.646885871887207,
"loss": 22.6032,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.03701896592974663,
"rewards/margins": 0.025927256792783737,
"rewards/rejected": -0.06294622272253036,
"step": 580
},
{
"epoch": 0.283177345812335,
"grad_norm": 173.0,
"learning_rate": 4.507160292204074e-06,
"logits/chosen": -2.6914494037628174,
"logits/rejected": -2.5408122539520264,
"logps/chosen": -2.485811233520508,
"logps/rejected": -2.7463126182556152,
"loss": 23.0391,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.04941215366125107,
"rewards/margins": 0.021939506754279137,
"rewards/rejected": -0.07135166227817535,
"step": 590
},
{
"epoch": 0.28797696184305255,
"grad_norm": 202.0,
"learning_rate": 4.481894124080714e-06,
"logits/chosen": -2.6994218826293945,
"logits/rejected": -2.5610313415527344,
"logps/chosen": -2.4740612506866455,
"logps/rejected": -2.7683298587799072,
"loss": 22.6582,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.049321599304676056,
"rewards/margins": 0.025723570957779884,
"rewards/rejected": -0.07504516839981079,
"step": 600
},
{
"epoch": 0.28797696184305255,
"eval_logits/chosen": -2.6280736923217773,
"eval_logits/rejected": -2.499150037765503,
"eval_logps/chosen": -2.5252108573913574,
"eval_logps/rejected": -2.7638449668884277,
"eval_loss": 22.906818389892578,
"eval_rewards/accuracies": 0.7280368804931641,
"eval_rewards/chosen": -0.05232088640332222,
"eval_rewards/margins": 0.02371794730424881,
"eval_rewards/rejected": -0.07603883743286133,
"eval_runtime": 42.3283,
"eval_samples_per_second": 87.129,
"eval_steps_per_second": 21.782,
"step": 600
},
{
"epoch": 0.2927765778737701,
"grad_norm": 179.0,
"learning_rate": 4.456070987178427e-06,
"logits/chosen": -2.6366207599639893,
"logits/rejected": -2.4643049240112305,
"logps/chosen": -2.578244686126709,
"logps/rejected": -2.8700923919677734,
"loss": 22.3263,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.053984034806489944,
"rewards/margins": 0.0292525552213192,
"rewards/rejected": -0.08323659002780914,
"step": 610
},
{
"epoch": 0.29757619390448764,
"grad_norm": 199.0,
"learning_rate": 4.429698138535242e-06,
"logits/chosen": -2.5474119186401367,
"logits/rejected": -2.471780300140381,
"logps/chosen": -2.5730671882629395,
"logps/rejected": -2.793931484222412,
"loss": 22.9645,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.0592615120112896,
"rewards/margins": 0.023461997509002686,
"rewards/rejected": -0.08272351324558258,
"step": 620
},
{
"epoch": 0.3023758099352052,
"grad_norm": 247.0,
"learning_rate": 4.402782989673867e-06,
"logits/chosen": -2.655505657196045,
"logits/rejected": -2.4748637676239014,
"logps/chosen": -2.772947072982788,
"logps/rejected": -3.0535125732421875,
"loss": 22.0546,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.06798827648162842,
"rewards/margins": 0.032981835305690765,
"rewards/rejected": -0.10097010433673859,
"step": 630
},
{
"epoch": 0.30717542596592273,
"grad_norm": 290.0,
"learning_rate": 4.375333104518842e-06,
"logits/chosen": -2.5236053466796875,
"logits/rejected": -2.501242160797119,
"logps/chosen": -2.8443820476531982,
"logps/rejected": -3.1306471824645996,
"loss": 22.4565,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.08212462067604065,
"rewards/margins": 0.030932435765862465,
"rewards/rejected": -0.11305705457925797,
"step": 640
},
{
"epoch": 0.3119750419966403,
"grad_norm": 430.0,
"learning_rate": 4.347356197270852e-06,
"logits/chosen": -2.6051764488220215,
"logits/rejected": -2.42108416557312,
"logps/chosen": -2.9477272033691406,
"logps/rejected": -3.3664963245391846,
"loss": 21.8643,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.0971546396613121,
"rewards/margins": 0.037734415382146835,
"rewards/rejected": -0.13488906621932983,
"step": 650
},
{
"epoch": 0.3167746580273578,
"grad_norm": 320.0,
"learning_rate": 4.318860130238828e-06,
"logits/chosen": -2.4937551021575928,
"logits/rejected": -2.4179840087890625,
"logps/chosen": -3.0144529342651367,
"logps/rejected": -3.405735731124878,
"loss": 22.1218,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.10292772203683853,
"rewards/margins": 0.038406241685152054,
"rewards/rejected": -0.1413339525461197,
"step": 660
},
{
"epoch": 0.32157427405807537,
"grad_norm": 916.0,
"learning_rate": 4.289852911630407e-06,
"logits/chosen": -2.6185176372528076,
"logits/rejected": -2.442739248275757,
"logps/chosen": -3.2067477703094482,
"logps/rejected": -3.5941262245178223,
"loss": 21.47,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.11121847480535507,
"rewards/margins": 0.04536464437842369,
"rewards/rejected": -0.15658311545848846,
"step": 670
},
{
"epoch": 0.3263738900887929,
"grad_norm": 288.0,
"learning_rate": 4.260342693301396e-06,
"logits/chosen": -2.554288625717163,
"logits/rejected": -2.401498317718506,
"logps/chosen": -3.1978609561920166,
"logps/rejected": -3.532090663909912,
"loss": 21.7135,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.11809118092060089,
"rewards/margins": 0.041451532393693924,
"rewards/rejected": -0.15954270958900452,
"step": 680
},
{
"epoch": 0.33117350611951046,
"grad_norm": 322.0,
"learning_rate": 4.2303377684648735e-06,
"logits/chosen": -2.521932601928711,
"logits/rejected": -2.4651012420654297,
"logps/chosen": -3.2608189582824707,
"logps/rejected": -3.8548882007598877,
"loss": 21.1311,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.12732163071632385,
"rewards/margins": 0.04996702820062637,
"rewards/rejected": -0.17728865146636963,
"step": 690
},
{
"epoch": 0.33597312215022795,
"grad_norm": 340.0,
"learning_rate": 4.199846569360558e-06,
"logits/chosen": -2.530945062637329,
"logits/rejected": -2.415677547454834,
"logps/chosen": -3.485921859741211,
"logps/rejected": -3.974107265472412,
"loss": 21.4086,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.14324818551540375,
"rewards/margins": 0.04987693950533867,
"rewards/rejected": -0.19312509894371033,
"step": 700
},
{
"epoch": 0.3407727381809455,
"grad_norm": 392.0,
"learning_rate": 4.168877664885104e-06,
"logits/chosen": -2.5249454975128174,
"logits/rejected": -2.341789722442627,
"logps/chosen": -3.4189887046813965,
"logps/rejected": -4.103830814361572,
"loss": 19.6115,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.1408146768808365,
"rewards/margins": 0.06821783632040024,
"rewards/rejected": -0.20903250575065613,
"step": 710
},
{
"epoch": 0.34557235421166305,
"grad_norm": 366.0,
"learning_rate": 4.1374397581840035e-06,
"logits/chosen": -2.546001434326172,
"logits/rejected": -2.4036498069763184,
"logps/chosen": -3.642977237701416,
"logps/rejected": -4.318962097167969,
"loss": 20.6931,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.1615218222141266,
"rewards/margins": 0.07000206410884857,
"rewards/rejected": -0.23152390122413635,
"step": 720
},
{
"epoch": 0.3503719702423806,
"grad_norm": 358.0,
"learning_rate": 4.105541684205752e-06,
"logits/chosen": -2.486359119415283,
"logits/rejected": -2.3801631927490234,
"logps/chosen": -3.5780110359191895,
"logps/rejected": -4.285566806793213,
"loss": 19.9833,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.15976889431476593,
"rewards/margins": 0.06793634593486786,
"rewards/rejected": -0.22770527005195618,
"step": 730
},
{
"epoch": 0.35517158627309814,
"grad_norm": 612.0,
"learning_rate": 4.073192407218972e-06,
"logits/chosen": -2.5256106853485107,
"logits/rejected": -2.341278553009033,
"logps/chosen": -3.9327900409698486,
"logps/rejected": -4.685281276702881,
"loss": 19.3244,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.18909943103790283,
"rewards/margins": 0.08580980449914932,
"rewards/rejected": -0.27490919828414917,
"step": 740
},
{
"epoch": 0.3599712023038157,
"grad_norm": 426.0,
"learning_rate": 4.040401018293204e-06,
"logits/chosen": -2.4474287033081055,
"logits/rejected": -2.348024368286133,
"logps/chosen": -4.319335460662842,
"logps/rejected": -5.043323040008545,
"loss": 21.4805,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.22994713485240936,
"rewards/margins": 0.06516584753990173,
"rewards/rejected": -0.2951129376888275,
"step": 750
},
{
"epoch": 0.36477081833453323,
"grad_norm": 632.0,
"learning_rate": 4.007176732744054e-06,
"logits/chosen": -2.4296607971191406,
"logits/rejected": -2.407029390335083,
"logps/chosen": -4.300168514251709,
"logps/rejected": -5.448733329772949,
"loss": 17.9868,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.22992363572120667,
"rewards/margins": 0.1060696393251419,
"rewards/rejected": -0.335993230342865,
"step": 760
},
{
"epoch": 0.3695704343652508,
"grad_norm": 434.0,
"learning_rate": 3.9735288875434254e-06,
"logits/chosen": -2.526063919067383,
"logits/rejected": -2.2934727668762207,
"logps/chosen": -4.9603986740112305,
"logps/rejected": -5.963088035583496,
"loss": 19.4164,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.29594284296035767,
"rewards/margins": 0.10229671001434326,
"rewards/rejected": -0.3982395529747009,
"step": 770
},
{
"epoch": 0.3743700503959683,
"grad_norm": 728.0,
"learning_rate": 3.939466938695565e-06,
"logits/chosen": -2.4236252307891846,
"logits/rejected": -2.3333449363708496,
"logps/chosen": -5.106991767883301,
"logps/rejected": -5.907806873321533,
"loss": 21.2552,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.30414628982543945,
"rewards/margins": 0.0843992829322815,
"rewards/rejected": -0.38854554295539856,
"step": 780
},
{
"epoch": 0.37916966642668587,
"grad_norm": 712.0,
"learning_rate": 3.905000458579657e-06,
"logits/chosen": -2.440783977508545,
"logits/rejected": -2.383155345916748,
"logps/chosen": -4.998103618621826,
"logps/rejected": -5.937209129333496,
"loss": 21.1386,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.3009765148162842,
"rewards/margins": 0.0884396880865097,
"rewards/rejected": -0.3894162178039551,
"step": 790
},
{
"epoch": 0.3839692824574034,
"grad_norm": 552.0,
"learning_rate": 3.87013913325971e-06,
"logits/chosen": -2.4607484340667725,
"logits/rejected": -2.2976975440979004,
"logps/chosen": -5.141398906707764,
"logps/rejected": -6.184676170349121,
"loss": 21.2405,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3076976537704468,
"rewards/margins": 0.10376839339733124,
"rewards/rejected": -0.4114660322666168,
"step": 800
},
{
"epoch": 0.3839692824574034,
"eval_logits/chosen": -2.4837894439697266,
"eval_logits/rejected": -2.3571584224700928,
"eval_logps/chosen": -4.861568450927734,
"eval_logps/rejected": -5.898691177368164,
"eval_loss": 19.59800910949707,
"eval_rewards/accuracies": 0.7207158207893372,
"eval_rewards/chosen": -0.28595665097236633,
"eval_rewards/margins": 0.10356683284044266,
"eval_rewards/rejected": -0.3895234763622284,
"eval_runtime": 42.3566,
"eval_samples_per_second": 87.07,
"eval_steps_per_second": 21.768,
"step": 800
},
{
"epoch": 0.38876889848812096,
"grad_norm": 516.0,
"learning_rate": 3.8348927597624965e-06,
"logits/chosen": -2.5247642993927,
"logits/rejected": -2.4116523265838623,
"logps/chosen": -4.852524757385254,
"logps/rejected": -5.755672931671143,
"loss": 19.6512,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2827115058898926,
"rewards/margins": 0.09489643573760986,
"rewards/rejected": -0.37760791182518005,
"step": 810
},
{
"epoch": 0.3935685145188385,
"grad_norm": 840.0,
"learning_rate": 3.7992712433243117e-06,
"logits/chosen": -2.4883017539978027,
"logits/rejected": -2.2909560203552246,
"logps/chosen": -4.790406227111816,
"logps/rejected": -5.914352893829346,
"loss": 17.8575,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.268026739358902,
"rewards/margins": 0.1235589012503624,
"rewards/rejected": -0.39158564805984497,
"step": 820
},
{
"epoch": 0.39836813054955605,
"grad_norm": 504.0,
"learning_rate": 3.7632845946073136e-06,
"logits/chosen": -2.5037264823913574,
"logits/rejected": -2.3298327922821045,
"logps/chosen": -4.825437545776367,
"logps/rejected": -5.963745594024658,
"loss": 17.5615,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.28179073333740234,
"rewards/margins": 0.11633607000112534,
"rewards/rejected": -0.39812684059143066,
"step": 830
},
{
"epoch": 0.4031677465802736,
"grad_norm": 624.0,
"learning_rate": 3.7269429268862513e-06,
"logits/chosen": -2.4601664543151855,
"logits/rejected": -2.3821940422058105,
"logps/chosen": -5.032884120941162,
"logps/rejected": -6.054814338684082,
"loss": 20.4908,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.3037104308605194,
"rewards/margins": 0.10300777107477188,
"rewards/rejected": -0.40671825408935547,
"step": 840
},
{
"epoch": 0.40796736261099115,
"grad_norm": 516.0,
"learning_rate": 3.690256453206334e-06,
"logits/chosen": -2.447526216506958,
"logits/rejected": -2.4134206771850586,
"logps/chosen": -5.021899700164795,
"logps/rejected": -6.324837684631348,
"loss": 18.5459,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.3016757667064667,
"rewards/margins": 0.13094934821128845,
"rewards/rejected": -0.4326251447200775,
"step": 850
},
{
"epoch": 0.41276697864170864,
"grad_norm": 524.0,
"learning_rate": 3.6532354835130844e-06,
"logits/chosen": -2.4638381004333496,
"logits/rejected": -2.351792097091675,
"logps/chosen": -5.105628967285156,
"logps/rejected": -6.278634548187256,
"loss": 18.2749,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.2969185709953308,
"rewards/margins": 0.12300584465265274,
"rewards/rejected": -0.41992440819740295,
"step": 860
},
{
"epoch": 0.4175665946724262,
"grad_norm": 632.0,
"learning_rate": 3.6158904217549446e-06,
"logits/chosen": -2.4699833393096924,
"logits/rejected": -2.385835886001587,
"logps/chosen": -5.065582752227783,
"logps/rejected": -6.120713233947754,
"loss": 18.5943,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3055879473686218,
"rewards/margins": 0.10839849710464478,
"rewards/rejected": -0.4139864444732666,
"step": 870
},
{
"epoch": 0.42236621070314373,
"grad_norm": 596.0,
"learning_rate": 3.5782317629594708e-06,
"logits/chosen": -2.4469141960144043,
"logits/rejected": -2.3317453861236572,
"logps/chosen": -5.162604808807373,
"logps/rejected": -6.183932781219482,
"loss": 19.7477,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.318970650434494,
"rewards/margins": 0.10084603726863861,
"rewards/rejected": -0.41981667280197144,
"step": 880
},
{
"epoch": 0.4271658267338613,
"grad_norm": 532.0,
"learning_rate": 3.5402700902839317e-06,
"logits/chosen": -2.3398728370666504,
"logits/rejected": -2.290860652923584,
"logps/chosen": -4.9323530197143555,
"logps/rejected": -6.385754585266113,
"loss": 18.616,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.29864078760147095,
"rewards/margins": 0.13272006809711456,
"rewards/rejected": -0.4313608705997467,
"step": 890
},
{
"epoch": 0.4319654427645788,
"grad_norm": 406.0,
"learning_rate": 3.5020160720411408e-06,
"logits/chosen": -2.419809103012085,
"logits/rejected": -2.292680263519287,
"logps/chosen": -4.806826591491699,
"logps/rejected": -6.202929496765137,
"loss": 18.3222,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.28297653794288635,
"rewards/margins": 0.13922308385372162,
"rewards/rejected": -0.4221995770931244,
"step": 900
},
{
"epoch": 0.43676505879529637,
"grad_norm": 784.0,
"learning_rate": 3.4634804587013505e-06,
"logits/chosen": -2.4105165004730225,
"logits/rejected": -2.356565237045288,
"logps/chosen": -5.013951301574707,
"logps/rejected": -5.902289390563965,
"loss": 21.7458,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2960304319858551,
"rewards/margins": 0.08540787547826767,
"rewards/rejected": -0.38143831491470337,
"step": 910
},
{
"epoch": 0.4415646748260139,
"grad_norm": 672.0,
"learning_rate": 3.424674079871073e-06,
"logits/chosen": -2.399247407913208,
"logits/rejected": -2.2991671562194824,
"logps/chosen": -4.808962821960449,
"logps/rejected": -5.891338348388672,
"loss": 19.6052,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2799363434314728,
"rewards/margins": 0.10835248231887817,
"rewards/rejected": -0.38828879594802856,
"step": 920
},
{
"epoch": 0.44636429085673146,
"grad_norm": 452.0,
"learning_rate": 3.3856078412496424e-06,
"logits/chosen": -2.4416189193725586,
"logits/rejected": -2.2881383895874023,
"logps/chosen": -5.245587348937988,
"logps/rejected": -6.376460075378418,
"loss": 18.8935,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.3177250027656555,
"rewards/margins": 0.1189032793045044,
"rewards/rejected": -0.4366282522678375,
"step": 930
},
{
"epoch": 0.451163906887449,
"grad_norm": 884.0,
"learning_rate": 3.346292721564407e-06,
"logits/chosen": -2.475155830383301,
"logits/rejected": -2.3437106609344482,
"logps/chosen": -5.09636116027832,
"logps/rejected": -6.522047996520996,
"loss": 18.0911,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.3036734163761139,
"rewards/margins": 0.14839448034763336,
"rewards/rejected": -0.45206791162490845,
"step": 940
},
{
"epoch": 0.45596352291816655,
"grad_norm": 476.0,
"learning_rate": 3.306739769485394e-06,
"logits/chosen": -2.4079017639160156,
"logits/rejected": -2.2727808952331543,
"logps/chosen": -4.894346237182617,
"logps/rejected": -6.419544219970703,
"loss": 17.65,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.2916712164878845,
"rewards/margins": 0.14445583522319794,
"rewards/rejected": -0.43612709641456604,
"step": 950
},
{
"epoch": 0.4607631389488841,
"grad_norm": 812.0,
"learning_rate": 3.266960100520316e-06,
"logits/chosen": -2.4455816745758057,
"logits/rejected": -2.3314123153686523,
"logps/chosen": -5.410442352294922,
"logps/rejected": -6.7745513916015625,
"loss": 19.015,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.34293264150619507,
"rewards/margins": 0.13221827149391174,
"rewards/rejected": -0.4751509130001068,
"step": 960
},
{
"epoch": 0.46556275497960165,
"grad_norm": 548.0,
"learning_rate": 3.2269648938907977e-06,
"logits/chosen": -2.409432888031006,
"logits/rejected": -2.273597240447998,
"logps/chosen": -5.212181568145752,
"logps/rejected": -6.484990119934082,
"loss": 19.1993,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.32137319445610046,
"rewards/margins": 0.12608560919761658,
"rewards/rejected": -0.44745880365371704,
"step": 970
},
{
"epoch": 0.4703623710103192,
"grad_norm": 492.0,
"learning_rate": 3.186765389390696e-06,
"logits/chosen": -2.4819793701171875,
"logits/rejected": -2.3250374794006348,
"logps/chosen": -5.389103412628174,
"logps/rejected": -6.684311866760254,
"loss": 17.9804,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.33990758657455444,
"rewards/margins": 0.1264369785785675,
"rewards/rejected": -0.46634459495544434,
"step": 980
},
{
"epoch": 0.47516198704103674,
"grad_norm": 652.0,
"learning_rate": 3.146372884227393e-06,
"logits/chosen": -2.424643039703369,
"logits/rejected": -2.302924156188965,
"logps/chosen": -5.546316623687744,
"logps/rejected": -6.512415885925293,
"loss": 20.8357,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.35321250557899475,
"rewards/margins": 0.09708378463983536,
"rewards/rejected": -0.4502963125705719,
"step": 990
},
{
"epoch": 0.4799616030717543,
"grad_norm": 544.0,
"learning_rate": 3.1057987298469693e-06,
"logits/chosen": -2.3848588466644287,
"logits/rejected": -2.2567191123962402,
"logps/chosen": -5.158990859985352,
"logps/rejected": -6.674688816070557,
"loss": 16.8004,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.31221428513526917,
"rewards/margins": 0.1547573357820511,
"rewards/rejected": -0.46697157621383667,
"step": 1000
},
{
"epoch": 0.4799616030717543,
"eval_logits/chosen": -2.4227147102355957,
"eval_logits/rejected": -2.290954113006592,
"eval_logps/chosen": -5.388276100158691,
"eval_logps/rejected": -6.679952621459961,
"eval_loss": 18.88290023803711,
"eval_rewards/accuracies": 0.7299349308013916,
"eval_rewards/chosen": -0.338627427816391,
"eval_rewards/margins": 0.1290222406387329,
"eval_rewards/rejected": -0.4676496684551239,
"eval_runtime": 42.4066,
"eval_samples_per_second": 86.968,
"eval_steps_per_second": 21.742,
"step": 1000
},
{
"epoch": 0.48476121910247183,
"grad_norm": 436.0,
"learning_rate": 3.06505432874411e-06,
"logits/chosen": -2.368929147720337,
"logits/rejected": -2.199861764907837,
"logps/chosen": -5.607582092285156,
"logps/rejected": -6.669485569000244,
"loss": 19.8122,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.3550570607185364,
"rewards/margins": 0.11564314365386963,
"rewards/rejected": -0.470700204372406,
"step": 1010
},
{
"epoch": 0.4895608351331893,
"grad_norm": 628.0,
"learning_rate": 3.024151131257688e-06,
"logits/chosen": -2.4425511360168457,
"logits/rejected": -2.282534122467041,
"logps/chosen": -5.256259441375732,
"logps/rejected": -6.541900634765625,
"loss": 18.8243,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.3223492205142975,
"rewards/margins": 0.1309443712234497,
"rewards/rejected": -0.45329350233078003,
"step": 1020
},
{
"epoch": 0.49436045116390687,
"grad_norm": 664.0,
"learning_rate": 2.983100632352889e-06,
"logits/chosen": -2.475759983062744,
"logits/rejected": -2.2628698348999023,
"logps/chosen": -5.166888236999512,
"logps/rejected": -6.666528224945068,
"loss": 16.9634,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.31691330671310425,
"rewards/margins": 0.1498645842075348,
"rewards/rejected": -0.46677789092063904,
"step": 1030
},
{
"epoch": 0.4991600671946244,
"grad_norm": 732.0,
"learning_rate": 2.9419143683907987e-06,
"logits/chosen": -2.381078004837036,
"logits/rejected": -2.3059287071228027,
"logps/chosen": -5.452860355377197,
"logps/rejected": -6.59310245513916,
"loss": 20.2132,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.3491719365119934,
"rewards/margins": 0.10954836755990982,
"rewards/rejected": -0.45872029662132263,
"step": 1040
},
{
"epoch": 0.503959683225342,
"grad_norm": 604.0,
"learning_rate": 2.9006039138863572e-06,
"logits/chosen": -2.37727689743042,
"logits/rejected": -2.2305667400360107,
"logps/chosen": -5.224418640136719,
"logps/rejected": -6.83002233505249,
"loss": 17.5785,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3208765983581543,
"rewards/margins": 0.15252140164375305,
"rewards/rejected": -0.47339797019958496,
"step": 1050
},
{
"epoch": 0.5087592992560596,
"grad_norm": 1304.0,
"learning_rate": 2.8591808782555883e-06,
"logits/chosen": -2.3770766258239746,
"logits/rejected": -2.300572395324707,
"logps/chosen": -5.146916389465332,
"logps/rejected": -6.356301307678223,
"loss": 19.416,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3121251165866852,
"rewards/margins": 0.11791741847991943,
"rewards/rejected": -0.430042564868927,
"step": 1060
},
{
"epoch": 0.5135589152867771,
"grad_norm": 684.0,
"learning_rate": 2.817656902553024e-06,
"logits/chosen": -2.404541492462158,
"logits/rejected": -2.285163164138794,
"logps/chosen": -5.202984809875488,
"logps/rejected": -6.198671340942383,
"loss": 20.2766,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.3168830871582031,
"rewards/margins": 0.1051924005150795,
"rewards/rejected": -0.42207545042037964,
"step": 1070
},
{
"epoch": 0.5183585313174947,
"grad_norm": 596.0,
"learning_rate": 2.7760436562002354e-06,
"logits/chosen": -2.414473533630371,
"logits/rejected": -2.2184202671051025,
"logps/chosen": -5.178278923034668,
"logps/rejected": -6.526847839355469,
"loss": 19.3254,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.318962424993515,
"rewards/margins": 0.12891003489494324,
"rewards/rejected": -0.44787248969078064,
"step": 1080
},
{
"epoch": 0.5231581473482121,
"grad_norm": 400.0,
"learning_rate": 2.7343528337063924e-06,
"logits/chosen": -2.502185106277466,
"logits/rejected": -2.3397912979125977,
"logps/chosen": -5.1583051681518555,
"logps/rejected": -6.327475547790527,
"loss": 19.2467,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.31146809458732605,
"rewards/margins": 0.12158875167369843,
"rewards/rejected": -0.4330568313598633,
"step": 1090
},
{
"epoch": 0.5279577633789296,
"grad_norm": 492.0,
"learning_rate": 2.692596151381774e-06,
"logits/chosen": -2.3989977836608887,
"logits/rejected": -2.327848434448242,
"logps/chosen": -5.252208232879639,
"logps/rejected": -6.45583438873291,
"loss": 19.3017,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.3295753598213196,
"rewards/margins": 0.11794829368591309,
"rewards/rejected": -0.44752365350723267,
"step": 1100
},
{
"epoch": 0.5327573794096472,
"grad_norm": 748.0,
"learning_rate": 2.650785344045149e-06,
"logits/chosen": -2.3955113887786865,
"logits/rejected": -2.2868549823760986,
"logps/chosen": -5.376906394958496,
"logps/rejected": -6.9210381507873535,
"loss": 17.2919,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.33396396040916443,
"rewards/margins": 0.15392349660396576,
"rewards/rejected": -0.4878874719142914,
"step": 1110
},
{
"epoch": 0.5375569954403647,
"grad_norm": 828.0,
"learning_rate": 2.6089321617259583e-06,
"logits/chosen": -2.388028621673584,
"logits/rejected": -2.246572971343994,
"logps/chosen": -5.159210205078125,
"logps/rejected": -6.546744346618652,
"loss": 17.853,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.31711554527282715,
"rewards/margins": 0.13862815499305725,
"rewards/rejected": -0.4557436406612396,
"step": 1120
},
{
"epoch": 0.5423566114710823,
"grad_norm": 788.0,
"learning_rate": 2.567048366362225e-06,
"logits/chosen": -2.377281665802002,
"logits/rejected": -2.249464273452759,
"logps/chosen": -5.263833522796631,
"logps/rejected": -6.494312286376953,
"loss": 19.6448,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.32637596130371094,
"rewards/margins": 0.12430766969919205,
"rewards/rejected": -0.45068368315696716,
"step": 1130
},
{
"epoch": 0.5471562275017998,
"grad_norm": 580.0,
"learning_rate": 2.525145728495106e-06,
"logits/chosen": -2.3985917568206787,
"logits/rejected": -2.2843620777130127,
"logps/chosen": -5.43961763381958,
"logps/rejected": -6.620604515075684,
"loss": 19.2967,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.34411385655403137,
"rewards/margins": 0.12279383838176727,
"rewards/rejected": -0.46690768003463745,
"step": 1140
},
{
"epoch": 0.5519558435325174,
"grad_norm": 612.0,
"learning_rate": 2.4832360239610416e-06,
"logits/chosen": -2.4003376960754395,
"logits/rejected": -2.280998706817627,
"logps/chosen": -5.3138885498046875,
"logps/rejected": -6.607431888580322,
"loss": 18.5493,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3358193039894104,
"rewards/margins": 0.12418397516012192,
"rewards/rejected": -0.4600032866001129,
"step": 1150
},
{
"epoch": 0.5567554595632349,
"grad_norm": 604.0,
"learning_rate": 2.441331030582407e-06,
"logits/chosen": -2.400425672531128,
"logits/rejected": -2.304504871368408,
"logps/chosen": -5.102894306182861,
"logps/rejected": -6.318313121795654,
"loss": 17.8164,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.3085941672325134,
"rewards/margins": 0.12492404133081436,
"rewards/rejected": -0.4335181713104248,
"step": 1160
},
{
"epoch": 0.5615550755939525,
"grad_norm": 712.0,
"learning_rate": 2.3994425248576102e-06,
"logits/chosen": -2.449359178543091,
"logits/rejected": -2.2663731575012207,
"logps/chosen": -5.372702121734619,
"logps/rejected": -6.342513084411621,
"loss": 19.6098,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.33157217502593994,
"rewards/margins": 0.10498969256877899,
"rewards/rejected": -0.43656182289123535,
"step": 1170
},
{
"epoch": 0.56635469162467,
"grad_norm": 668.0,
"learning_rate": 2.357582278651553e-06,
"logits/chosen": -2.3872971534729004,
"logits/rejected": -2.2551045417785645,
"logps/chosen": -5.195388317108154,
"logps/rejected": -6.63875675201416,
"loss": 18.3228,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.317712664604187,
"rewards/margins": 0.1470889151096344,
"rewards/rejected": -0.464801549911499,
"step": 1180
},
{
"epoch": 0.5711543076553875,
"grad_norm": 540.0,
"learning_rate": 2.315762055887411e-06,
"logits/chosen": -2.4101014137268066,
"logits/rejected": -2.2738122940063477,
"logps/chosen": -5.302289009094238,
"logps/rejected": -6.561962127685547,
"loss": 18.411,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.3272460103034973,
"rewards/margins": 0.1362571120262146,
"rewards/rejected": -0.46350306272506714,
"step": 1190
},
{
"epoch": 0.5759539236861051,
"grad_norm": 716.0,
"learning_rate": 2.273993609240629e-06,
"logits/chosen": -2.3701391220092773,
"logits/rejected": -2.2508363723754883,
"logps/chosen": -5.6830153465271,
"logps/rejected": -6.565041542053223,
"loss": 20.9951,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.37105122208595276,
"rewards/margins": 0.08828712999820709,
"rewards/rejected": -0.45933833718299866,
"step": 1200
},
{
"epoch": 0.5759539236861051,
"eval_logits/chosen": -2.4138081073760986,
"eval_logits/rejected": -2.282156229019165,
"eval_logps/chosen": -5.275580406188965,
"eval_logps/rejected": -6.555542469024658,
"eval_loss": 18.547019958496094,
"eval_rewards/accuracies": 0.7342733144760132,
"eval_rewards/chosen": -0.32735785841941833,
"eval_rewards/margins": 0.1278507560491562,
"eval_rewards/rejected": -0.4552086293697357,
"eval_runtime": 42.4478,
"eval_samples_per_second": 86.883,
"eval_steps_per_second": 21.721,
"step": 1200
},
{
"epoch": 0.5807535397168226,
"grad_norm": 454.0,
"learning_rate": 2.2322886768360874e-06,
"logits/chosen": -2.3059446811676025,
"logits/rejected": -2.2170162200927734,
"logps/chosen": -4.870773792266846,
"logps/rejected": -6.4870758056640625,
"loss": 16.5756,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2862984538078308,
"rewards/margins": 0.16298805177211761,
"rewards/rejected": -0.4492865204811096,
"step": 1210
},
{
"epoch": 0.5855531557475402,
"grad_norm": 482.0,
"learning_rate": 2.190658978949352e-06,
"logits/chosen": -2.423422336578369,
"logits/rejected": -2.2647502422332764,
"logps/chosen": -5.086521148681641,
"logps/rejected": -6.444764614105225,
"loss": 18.83,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.3160659670829773,
"rewards/margins": 0.13146895170211792,
"rewards/rejected": -0.4475349485874176,
"step": 1220
},
{
"epoch": 0.5903527717782577,
"grad_norm": 604.0,
"learning_rate": 2.149116214712943e-06,
"logits/chosen": -2.4074902534484863,
"logits/rejected": -2.2750191688537598,
"logps/chosen": -5.390463352203369,
"logps/rejected": -6.534971714019775,
"loss": 19.6129,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.34208056330680847,
"rewards/margins": 0.11322804540395737,
"rewards/rejected": -0.45530858635902405,
"step": 1230
},
{
"epoch": 0.5951523878089753,
"grad_norm": 1088.0,
"learning_rate": 2.107672058828544e-06,
"logits/chosen": -2.4263134002685547,
"logits/rejected": -2.294344186782837,
"logps/chosen": -5.470616817474365,
"logps/rejected": -6.507209777832031,
"loss": 20.2223,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.34126371145248413,
"rewards/margins": 0.11158865690231323,
"rewards/rejected": -0.452852338552475,
"step": 1240
},
{
"epoch": 0.5999520038396928,
"grad_norm": 824.0,
"learning_rate": 2.066338158286083e-06,
"logits/chosen": -2.417226552963257,
"logits/rejected": -2.3527023792266846,
"logps/chosen": -5.317389488220215,
"logps/rejected": -6.782002925872803,
"loss": 17.742,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.3306230902671814,
"rewards/margins": 0.14506593346595764,
"rewards/rejected": -0.47568902373313904,
"step": 1250
},
{
"epoch": 0.6047516198704104,
"grad_norm": 520.0,
"learning_rate": 2.025126129090588e-06,
"logits/chosen": -2.4989140033721924,
"logits/rejected": -2.3209874629974365,
"logps/chosen": -5.407619476318359,
"logps/rejected": -6.895826816558838,
"loss": 16.1524,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.3376930356025696,
"rewards/margins": 0.14856931567192078,
"rewards/rejected": -0.48626232147216797,
"step": 1260
},
{
"epoch": 0.6095512359011279,
"grad_norm": 474.0,
"learning_rate": 1.9840475529977655e-06,
"logits/chosen": -2.383535385131836,
"logits/rejected": -2.271467685699463,
"logps/chosen": -5.673216342926025,
"logps/rejected": -6.774017333984375,
"loss": 20.3021,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.36859917640686035,
"rewards/margins": 0.11497589200735092,
"rewards/rejected": -0.4835750460624695,
"step": 1270
},
{
"epoch": 0.6143508519318455,
"grad_norm": 628.0,
"learning_rate": 1.9431139742591897e-06,
"logits/chosen": -2.399413585662842,
"logits/rejected": -2.2823705673217773,
"logps/chosen": -5.129962921142578,
"logps/rejected": -6.771612644195557,
"loss": 16.3546,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.3149269223213196,
"rewards/margins": 0.1614220291376114,
"rewards/rejected": -0.47634896636009216,
"step": 1280
},
{
"epoch": 0.619150467962563,
"grad_norm": 466.0,
"learning_rate": 1.9023368963780458e-06,
"logits/chosen": -2.3983473777770996,
"logits/rejected": -2.2607951164245605,
"logps/chosen": -5.099265098571777,
"logps/rejected": -6.167372703552246,
"loss": 19.8386,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.3082767724990845,
"rewards/margins": 0.10987050831317902,
"rewards/rejected": -0.4181472659111023,
"step": 1290
},
{
"epoch": 0.6239500839932806,
"grad_norm": 576.0,
"learning_rate": 1.861727778876314e-06,
"logits/chosen": -2.3845856189727783,
"logits/rejected": -2.2755587100982666,
"logps/chosen": -5.35369873046875,
"logps/rejected": -6.725785255432129,
"loss": 17.3317,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.33213597536087036,
"rewards/margins": 0.15116584300994873,
"rewards/rejected": -0.4833018183708191,
"step": 1300
},
{
"epoch": 0.6287497000239981,
"grad_norm": 652.0,
"learning_rate": 1.8212980340743152e-06,
"logits/chosen": -2.4343972206115723,
"logits/rejected": -2.3320324420928955,
"logps/chosen": -5.641356468200684,
"logps/rejected": -6.386562347412109,
"loss": 22.8785,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.36470332741737366,
"rewards/margins": 0.07602353394031525,
"rewards/rejected": -0.4407268464565277,
"step": 1310
},
{
"epoch": 0.6335493160547156,
"grad_norm": 748.0,
"learning_rate": 1.7810590238835279e-06,
"logits/chosen": -2.328641653060913,
"logits/rejected": -2.3151795864105225,
"logps/chosen": -5.246110439300537,
"logps/rejected": -6.596944332122803,
"loss": 18.2564,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.32314419746398926,
"rewards/margins": 0.1345185935497284,
"rewards/rejected": -0.45766276121139526,
"step": 1320
},
{
"epoch": 0.6383489320854332,
"grad_norm": 588.0,
"learning_rate": 1.7410220566135605e-06,
"logits/chosen": -2.448850393295288,
"logits/rejected": -2.3165359497070312,
"logps/chosen": -5.31040096282959,
"logps/rejected": -6.448628902435303,
"loss": 18.7647,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.32558658719062805,
"rewards/margins": 0.11407657712697983,
"rewards/rejected": -0.4396631717681885,
"step": 1330
},
{
"epoch": 0.6431485481161507,
"grad_norm": 548.0,
"learning_rate": 1.7011983837942023e-06,
"logits/chosen": -2.4035420417785645,
"logits/rejected": -2.276169776916504,
"logps/chosen": -4.991080284118652,
"logps/rejected": -6.5472259521484375,
"loss": 16.5535,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.2946924865245819,
"rewards/margins": 0.15590792894363403,
"rewards/rejected": -0.45060038566589355,
"step": 1340
},
{
"epoch": 0.6479481641468683,
"grad_norm": 648.0,
"learning_rate": 1.661599197013416e-06,
"logits/chosen": -2.433659076690674,
"logits/rejected": -2.290144205093384,
"logps/chosen": -5.426926612854004,
"logps/rejected": -6.584383964538574,
"loss": 18.6522,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.3438794016838074,
"rewards/margins": 0.11993012577295303,
"rewards/rejected": -0.4638095498085022,
"step": 1350
},
{
"epoch": 0.6527477801775858,
"grad_norm": 712.0,
"learning_rate": 1.6222356247721831e-06,
"logits/chosen": -2.4088618755340576,
"logits/rejected": -2.264636516571045,
"logps/chosen": -5.1082048416137695,
"logps/rejected": -6.308968544006348,
"loss": 18.5336,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.3122280240058899,
"rewards/margins": 0.11769269406795502,
"rewards/rejected": -0.4299207329750061,
"step": 1360
},
{
"epoch": 0.6575473962083034,
"grad_norm": 960.0,
"learning_rate": 1.5831187293570826e-06,
"logits/chosen": -2.4132819175720215,
"logits/rejected": -2.275599241256714,
"logps/chosen": -5.336572647094727,
"logps/rejected": -6.749258995056152,
"loss": 18.7735,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.33027634024620056,
"rewards/margins": 0.13894790410995483,
"rewards/rejected": -0.4692242741584778,
"step": 1370
},
{
"epoch": 0.6623470122390209,
"grad_norm": 848.0,
"learning_rate": 1.544259503731465e-06,
"logits/chosen": -2.416928291320801,
"logits/rejected": -2.2550978660583496,
"logps/chosen": -5.133327007293701,
"logps/rejected": -6.72185754776001,
"loss": 15.7633,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.313863605260849,
"rewards/margins": 0.16045403480529785,
"rewards/rejected": -0.47431764006614685,
"step": 1380
},
{
"epoch": 0.6671466282697385,
"grad_norm": 536.0,
"learning_rate": 1.5056688684461235e-06,
"logits/chosen": -2.404127597808838,
"logits/rejected": -2.252338409423828,
"logps/chosen": -5.645106315612793,
"logps/rejected": -6.854499816894531,
"loss": 20.0842,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.36415529251098633,
"rewards/margins": 0.11795148998498917,
"rewards/rejected": -0.4821067452430725,
"step": 1390
},
{
"epoch": 0.6719462443004559,
"grad_norm": 494.0,
"learning_rate": 1.4673576685703027e-06,
"logits/chosen": -2.3945698738098145,
"logits/rejected": -2.292322874069214,
"logps/chosen": -5.456864356994629,
"logps/rejected": -6.6700286865234375,
"loss": 19.3603,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3404294550418854,
"rewards/margins": 0.1247081533074379,
"rewards/rejected": -0.4651375412940979,
"step": 1400
},
{
"epoch": 0.6719462443004559,
"eval_logits/chosen": -2.4091436862945557,
"eval_logits/rejected": -2.276984930038452,
"eval_logps/chosen": -5.36878776550293,
"eval_logps/rejected": -6.672922611236572,
"eval_loss": 18.452621459960938,
"eval_rewards/accuracies": 0.7367136478424072,
"eval_rewards/chosen": -0.3366785943508148,
"eval_rewards/margins": 0.13026797771453857,
"eval_rewards/rejected": -0.466946542263031,
"eval_runtime": 42.3773,
"eval_samples_per_second": 87.028,
"eval_steps_per_second": 21.757,
"step": 1400
},
{
"epoch": 0.6767458603311735,
"grad_norm": 528.0,
"learning_rate": 1.4293366706439293e-06,
"logits/chosen": -2.3648762702941895,
"logits/rejected": -2.2613933086395264,
"logps/chosen": -5.728005886077881,
"logps/rejected": -7.000042915344238,
"loss": 20.4836,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.37347841262817383,
"rewards/margins": 0.12528380751609802,
"rewards/rejected": -0.49876219034194946,
"step": 1410
},
{
"epoch": 0.681545476361891,
"grad_norm": 736.0,
"learning_rate": 1.3916165596519015e-06,
"logits/chosen": -2.439873218536377,
"logits/rejected": -2.2366085052490234,
"logps/chosen": -5.539605617523193,
"logps/rejected": -6.8966546058654785,
"loss": 16.7958,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.3576871454715729,
"rewards/margins": 0.1379992663860321,
"rewards/rejected": -0.49568644165992737,
"step": 1420
},
{
"epoch": 0.6863450923926085,
"grad_norm": 556.0,
"learning_rate": 1.3542079360213089e-06,
"logits/chosen": -2.429901599884033,
"logits/rejected": -2.2471795082092285,
"logps/chosen": -5.207893371582031,
"logps/rejected": -6.384365081787109,
"loss": 18.2659,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.32029086351394653,
"rewards/margins": 0.12673424184322357,
"rewards/rejected": -0.4470250606536865,
"step": 1430
},
{
"epoch": 0.6911447084233261,
"grad_norm": 400.0,
"learning_rate": 1.317121312642406e-06,
"logits/chosen": -2.412353515625,
"logits/rejected": -2.237959623336792,
"logps/chosen": -5.456640720367432,
"logps/rejected": -7.014123439788818,
"loss": 17.618,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.3444870114326477,
"rewards/margins": 0.1576370894908905,
"rewards/rejected": -0.5021240711212158,
"step": 1440
},
{
"epoch": 0.6959443244540436,
"grad_norm": 704.0,
"learning_rate": 1.2803671119141953e-06,
"logits/chosen": -2.3219709396362305,
"logits/rejected": -2.228998899459839,
"logps/chosen": -5.75357723236084,
"logps/rejected": -6.837206840515137,
"loss": 20.7317,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3743920922279358,
"rewards/margins": 0.10638390481472015,
"rewards/rejected": -0.48077592253685,
"step": 1450
},
{
"epoch": 0.7007439404847612,
"grad_norm": 1080.0,
"learning_rate": 1.2439556628154293e-06,
"logits/chosen": -2.392932653427124,
"logits/rejected": -2.2123451232910156,
"logps/chosen": -5.481171607971191,
"logps/rejected": -7.101468086242676,
"loss": 17.3517,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.35039958357810974,
"rewards/margins": 0.1585194319486618,
"rewards/rejected": -0.5089190006256104,
"step": 1460
},
{
"epoch": 0.7055435565154787,
"grad_norm": 880.0,
"learning_rate": 1.207897198001878e-06,
"logits/chosen": -2.4182324409484863,
"logits/rejected": -2.30369234085083,
"logps/chosen": -5.496799468994141,
"logps/rejected": -6.734198093414307,
"loss": 18.8115,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3505345582962036,
"rewards/margins": 0.12324990332126617,
"rewards/rejected": -0.4737844467163086,
"step": 1470
},
{
"epoch": 0.7103431725461963,
"grad_norm": 648.0,
"learning_rate": 1.1722018509306587e-06,
"logits/chosen": -2.4155890941619873,
"logits/rejected": -2.2205352783203125,
"logps/chosen": -5.449498176574707,
"logps/rejected": -6.962679386138916,
"loss": 17.6053,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.3423009514808655,
"rewards/margins": 0.148566335439682,
"rewards/rejected": -0.4908672869205475,
"step": 1480
},
{
"epoch": 0.7151427885769138,
"grad_norm": 502.0,
"learning_rate": 1.1368796530124442e-06,
"logits/chosen": -2.3607187271118164,
"logits/rejected": -2.206583023071289,
"logps/chosen": -5.186774253845215,
"logps/rejected": -6.510534763336182,
"loss": 17.1114,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.3175831735134125,
"rewards/margins": 0.1387149691581726,
"rewards/rejected": -0.45629817247390747,
"step": 1490
},
{
"epoch": 0.7199424046076314,
"grad_norm": 580.0,
"learning_rate": 1.101940530792356e-06,
"logits/chosen": -2.4134833812713623,
"logits/rejected": -2.2638492584228516,
"logps/chosen": -5.019357204437256,
"logps/rejected": -6.50915002822876,
"loss": 17.2364,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.30193233489990234,
"rewards/margins": 0.15041805803775787,
"rewards/rejected": -0.452350378036499,
"step": 1500
},
{
"epoch": 0.7247420206383489,
"grad_norm": 418.0,
"learning_rate": 1.0673943031603134e-06,
"logits/chosen": -2.404956340789795,
"logits/rejected": -2.308595657348633,
"logps/chosen": -5.355484962463379,
"logps/rejected": -6.544129848480225,
"loss": 18.7578,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.3347395062446594,
"rewards/margins": 0.1166982650756836,
"rewards/rejected": -0.4514378011226654,
"step": 1510
},
{
"epoch": 0.7295416366690665,
"grad_norm": 536.0,
"learning_rate": 1.0332506785916524e-06,
"logits/chosen": -2.3823771476745605,
"logits/rejected": -2.271340847015381,
"logps/chosen": -5.479881763458252,
"logps/rejected": -6.646791934967041,
"loss": 19.1971,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.34438061714172363,
"rewards/margins": 0.12062084674835205,
"rewards/rejected": -0.46500149369239807,
"step": 1520
},
{
"epoch": 0.734341252699784,
"grad_norm": 716.0,
"learning_rate": 9.995192524187639e-07,
"logits/chosen": -2.326873302459717,
"logits/rejected": -2.2747578620910645,
"logps/chosen": -5.4852070808410645,
"logps/rejected": -6.537548065185547,
"loss": 21.3827,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3506949841976166,
"rewards/margins": 0.10114116966724396,
"rewards/rejected": -0.45183616876602173,
"step": 1530
},
{
"epoch": 0.7391408687305016,
"grad_norm": 888.0,
"learning_rate": 9.662095041345318e-07,
"logits/chosen": -2.3540749549865723,
"logits/rejected": -2.216001510620117,
"logps/chosen": -5.573788642883301,
"logps/rejected": -6.802327632904053,
"loss": 18.3834,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.3563232123851776,
"rewards/margins": 0.12263502925634384,
"rewards/rejected": -0.47895827889442444,
"step": 1540
},
{
"epoch": 0.7439404847612191,
"grad_norm": 808.0,
"learning_rate": 9.333307947283258e-07,
"logits/chosen": -2.375044822692871,
"logits/rejected": -2.289389133453369,
"logps/chosen": -5.813979148864746,
"logps/rejected": -6.759008884429932,
"loss": 20.425,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3850165009498596,
"rewards/margins": 0.09783297777175903,
"rewards/rejected": -0.48284950852394104,
"step": 1550
},
{
"epoch": 0.7487401007919366,
"grad_norm": 504.0,
"learning_rate": 9.00892364055298e-07,
"logits/chosen": -2.392712116241455,
"logits/rejected": -2.283189058303833,
"logps/chosen": -5.542091369628906,
"logps/rejected": -6.62581729888916,
"loss": 18.9382,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3540818393230438,
"rewards/margins": 0.11669294536113739,
"rewards/rejected": -0.47077473998069763,
"step": 1560
},
{
"epoch": 0.7535397168226542,
"grad_norm": 628.0,
"learning_rate": 8.689033282397166e-07,
"logits/chosen": -2.3972506523132324,
"logits/rejected": -2.2452807426452637,
"logps/chosen": -5.5895209312438965,
"logps/rejected": -6.914453983306885,
"loss": 17.9813,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.35753870010375977,
"rewards/margins": 0.13575167953968048,
"rewards/rejected": -0.49329036474227905,
"step": 1570
},
{
"epoch": 0.7583393328533717,
"grad_norm": 600.0,
"learning_rate": 8.373726771130769e-07,
"logits/chosen": -2.412078619003296,
"logits/rejected": -2.266860008239746,
"logps/chosen": -5.364583492279053,
"logps/rejected": -6.602282524108887,
"loss": 19.0929,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.33540457487106323,
"rewards/margins": 0.12357480823993683,
"rewards/rejected": -0.45897936820983887,
"step": 1580
},
{
"epoch": 0.7631389488840893,
"grad_norm": 480.0,
"learning_rate": 8.063092716877016e-07,
"logits/chosen": -2.338613986968994,
"logits/rejected": -2.211350440979004,
"logps/chosen": -5.179539680480957,
"logps/rejected": -6.339354991912842,
"loss": 18.8355,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3131060004234314,
"rewards/margins": 0.12318776547908783,
"rewards/rejected": -0.4362937808036804,
"step": 1590
},
{
"epoch": 0.7679385649148068,
"grad_norm": 688.0,
"learning_rate": 7.757218416665446e-07,
"logits/chosen": -2.4351515769958496,
"logits/rejected": -2.2595877647399902,
"logps/chosen": -5.679512977600098,
"logps/rejected": -7.2098388671875,
"loss": 16.8747,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.3652041256427765,
"rewards/margins": 0.1489575207233429,
"rewards/rejected": -0.5141616463661194,
"step": 1600
},
{
"epoch": 0.7679385649148068,
"eval_logits/chosen": -2.4042322635650635,
"eval_logits/rejected": -2.271738290786743,
"eval_logps/chosen": -5.411951065063477,
"eval_logps/rejected": -6.727044582366943,
"eval_loss": 18.418222427368164,
"eval_rewards/accuracies": 0.7383405566215515,
"eval_rewards/chosen": -0.3409949839115143,
"eval_rewards/margins": 0.13136383891105652,
"eval_rewards/rejected": -0.4723588228225708,
"eval_runtime": 42.3003,
"eval_samples_per_second": 87.186,
"eval_steps_per_second": 21.797,
"step": 1600
},
{
"epoch": 0.7727381809455244,
"grad_norm": 486.0,
"learning_rate": 7.456189829898955e-07,
"logits/chosen": -2.4270052909851074,
"logits/rejected": -2.2367396354675293,
"logps/chosen": -5.292275428771973,
"logps/rejected": -6.929448127746582,
"loss": 15.9241,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.33168870210647583,
"rewards/margins": 0.1642860472202301,
"rewards/rejected": -0.49597471952438354,
"step": 1610
},
{
"epoch": 0.7775377969762419,
"grad_norm": 516.0,
"learning_rate": 7.160091554196732e-07,
"logits/chosen": -2.4502692222595215,
"logits/rejected": -2.2896740436553955,
"logps/chosen": -5.418766498565674,
"logps/rejected": -6.900321006774902,
"loss": 17.7239,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.34466320276260376,
"rewards/margins": 0.14509515464305878,
"rewards/rejected": -0.48975834250450134,
"step": 1620
},
{
"epoch": 0.7823374130069595,
"grad_norm": 840.0,
"learning_rate": 6.869006801619941e-07,
"logits/chosen": -2.369670867919922,
"logits/rejected": -2.294945240020752,
"logps/chosen": -5.324276924133301,
"logps/rejected": -6.590291500091553,
"loss": 19.8398,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3284462094306946,
"rewards/margins": 0.12793660163879395,
"rewards/rejected": -0.4563828408718109,
"step": 1630
},
{
"epoch": 0.787137029037677,
"grad_norm": 988.0,
"learning_rate": 6.583017375286726e-07,
"logits/chosen": -2.4032444953918457,
"logits/rejected": -2.264955759048462,
"logps/chosen": -5.57873010635376,
"logps/rejected": -6.539863586425781,
"loss": 20.8332,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.35667234659194946,
"rewards/margins": 0.10117383301258087,
"rewards/rejected": -0.45784610509872437,
"step": 1640
},
{
"epoch": 0.7919366450683946,
"grad_norm": 564.0,
"learning_rate": 6.30220364638324e-07,
"logits/chosen": -2.4060983657836914,
"logits/rejected": -2.290734052658081,
"logps/chosen": -5.397780895233154,
"logps/rejected": -6.521121025085449,
"loss": 19.2479,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3351799547672272,
"rewards/margins": 0.11636437475681305,
"rewards/rejected": -0.45154428482055664,
"step": 1650
},
{
"epoch": 0.7967362610991121,
"grad_norm": 708.0,
"learning_rate": 6.02664453157703e-07,
"logits/chosen": -2.4713082313537598,
"logits/rejected": -2.3311212062835693,
"logps/chosen": -5.263553619384766,
"logps/rejected": -6.440840721130371,
"loss": 19.1745,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.3277600407600403,
"rewards/margins": 0.11137590557336807,
"rewards/rejected": -0.43913593888282776,
"step": 1660
},
{
"epoch": 0.8015358771298297,
"grad_norm": 824.0,
"learning_rate": 5.756417470839195e-07,
"logits/chosen": -2.42775297164917,
"logits/rejected": -2.2976040840148926,
"logps/chosen": -5.576961040496826,
"logps/rejected": -6.82025146484375,
"loss": 18.3447,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.3563191294670105,
"rewards/margins": 0.13009047508239746,
"rewards/rejected": -0.48640960454940796,
"step": 1670
},
{
"epoch": 0.8063354931605472,
"grad_norm": 728.0,
"learning_rate": 5.491598405681559e-07,
"logits/chosen": -2.47291898727417,
"logits/rejected": -2.2564706802368164,
"logps/chosen": -5.675844669342041,
"logps/rejected": -6.974596977233887,
"loss": 17.8208,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.36322662234306335,
"rewards/margins": 0.13114073872566223,
"rewards/rejected": -0.49436742067337036,
"step": 1680
},
{
"epoch": 0.8111351091912647,
"grad_norm": 544.0,
"learning_rate": 5.232261757814924e-07,
"logits/chosen": -2.3544998168945312,
"logits/rejected": -2.1982970237731934,
"logps/chosen": -5.333452224731445,
"logps/rejected": -6.894648551940918,
"loss": 17.2108,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.3354581296443939,
"rewards/margins": 0.16147710382938385,
"rewards/rejected": -0.49693527817726135,
"step": 1690
},
{
"epoch": 0.8159347252219823,
"grad_norm": 1032.0,
"learning_rate": 4.978480408234465e-07,
"logits/chosen": -2.306948184967041,
"logits/rejected": -2.2744388580322266,
"logps/chosen": -5.288256645202637,
"logps/rejected": -6.248696327209473,
"loss": 20.4558,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3333366811275482,
"rewards/margins": 0.09241757541894913,
"rewards/rejected": -0.42575424909591675,
"step": 1700
},
{
"epoch": 0.8207343412526998,
"grad_norm": 976.0,
"learning_rate": 4.73032567673809e-07,
"logits/chosen": -2.4197311401367188,
"logits/rejected": -2.288029193878174,
"logps/chosen": -5.471469402313232,
"logps/rejected": -6.522982120513916,
"loss": 19.3212,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.34092700481414795,
"rewards/margins": 0.11506900936365128,
"rewards/rejected": -0.45599597692489624,
"step": 1710
},
{
"epoch": 0.8255339572834173,
"grad_norm": 478.0,
"learning_rate": 4.487867301883528e-07,
"logits/chosen": -2.3192758560180664,
"logits/rejected": -2.225492000579834,
"logps/chosen": -5.3679704666137695,
"logps/rejected": -6.774080753326416,
"loss": 19.9218,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.336590051651001,
"rewards/margins": 0.13415463268756866,
"rewards/rejected": -0.47074466943740845,
"step": 1720
},
{
"epoch": 0.8303335733141348,
"grad_norm": 532.0,
"learning_rate": 4.2511734213898093e-07,
"logits/chosen": -2.4426679611206055,
"logits/rejected": -2.2684130668640137,
"logps/chosen": -5.739571571350098,
"logps/rejected": -6.8670654296875,
"loss": 20.5216,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.36932170391082764,
"rewards/margins": 0.11277206242084503,
"rewards/rejected": -0.48209381103515625,
"step": 1730
},
{
"epoch": 0.8351331893448524,
"grad_norm": 394.0,
"learning_rate": 4.020310552988632e-07,
"logits/chosen": -2.4151153564453125,
"logits/rejected": -2.201359987258911,
"logps/chosen": -5.472858905792236,
"logps/rejected": -7.020216464996338,
"loss": 16.3484,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.3507462441921234,
"rewards/margins": 0.15251484513282776,
"rewards/rejected": -0.5032610297203064,
"step": 1740
},
{
"epoch": 0.8399328053755699,
"grad_norm": 544.0,
"learning_rate": 3.7953435757309756e-07,
"logits/chosen": -2.4227585792541504,
"logits/rejected": -2.263484239578247,
"logps/chosen": -5.642158508300781,
"logps/rejected": -6.891043186187744,
"loss": 19.026,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.3608216643333435,
"rewards/margins": 0.1325806826353073,
"rewards/rejected": -0.4934023320674896,
"step": 1750
},
{
"epoch": 0.8447324214062875,
"grad_norm": 624.0,
"learning_rate": 3.5763357117542364e-07,
"logits/chosen": -2.419222116470337,
"logits/rejected": -2.3370187282562256,
"logps/chosen": -5.40153169631958,
"logps/rejected": -6.698393821716309,
"loss": 18.4071,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.3343198299407959,
"rewards/margins": 0.1330634355545044,
"rewards/rejected": -0.4673832952976227,
"step": 1760
},
{
"epoch": 0.849532037437005,
"grad_norm": 816.0,
"learning_rate": 3.363348508515016e-07,
"logits/chosen": -2.4378068447113037,
"logits/rejected": -2.2770543098449707,
"logps/chosen": -5.5027971267700195,
"logps/rejected": -6.707958221435547,
"loss": 19.1375,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.34939098358154297,
"rewards/margins": 0.12113787233829498,
"rewards/rejected": -0.47052890062332153,
"step": 1770
},
{
"epoch": 0.8543316534677226,
"grad_norm": 684.0,
"learning_rate": 3.156441821492506e-07,
"logits/chosen": -2.379523277282715,
"logits/rejected": -2.269045352935791,
"logps/chosen": -5.7871599197387695,
"logps/rejected": -7.179323673248291,
"loss": 18.729,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.38277268409729004,
"rewards/margins": 0.1358826756477356,
"rewards/rejected": -0.5186554193496704,
"step": 1780
},
{
"epoch": 0.8591312694984401,
"grad_norm": 824.0,
"learning_rate": 2.9556737973674117e-07,
"logits/chosen": -2.4030494689941406,
"logits/rejected": -2.2455122470855713,
"logps/chosen": -5.486414909362793,
"logps/rejected": -6.7095842361450195,
"loss": 19.8424,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.3436943292617798,
"rewards/margins": 0.12356676161289215,
"rewards/rejected": -0.46726107597351074,
"step": 1790
},
{
"epoch": 0.8639308855291576,
"grad_norm": 668.0,
"learning_rate": 2.761100857681068e-07,
"logits/chosen": -2.418900728225708,
"logits/rejected": -2.3005001544952393,
"logps/chosen": -5.6175031661987305,
"logps/rejected": -6.8584747314453125,
"loss": 20.3795,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.36307087540626526,
"rewards/margins": 0.11948154121637344,
"rewards/rejected": -0.4825524687767029,
"step": 1800
},
{
"epoch": 0.8639308855291576,
"eval_logits/chosen": -2.403691530227661,
"eval_logits/rejected": -2.2711212635040283,
"eval_logps/chosen": -5.420097827911377,
"eval_logps/rejected": -6.736894130706787,
"eval_loss": 18.413162231445312,
"eval_rewards/accuracies": 0.7369847893714905,
"eval_rewards/chosen": -0.3418095111846924,
"eval_rewards/margins": 0.13153418898582458,
"eval_rewards/rejected": -0.47334375977516174,
"eval_runtime": 42.3869,
"eval_samples_per_second": 87.008,
"eval_steps_per_second": 21.752,
"step": 1800
},
{
"epoch": 0.8687305015598752,
"grad_norm": 452.0,
"learning_rate": 2.5727776829793774e-07,
"logits/chosen": -2.4535417556762695,
"logits/rejected": -2.252277374267578,
"logps/chosen": -5.440467357635498,
"logps/rejected": -6.776512145996094,
"loss": 16.8697,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3419545590877533,
"rewards/margins": 0.1435195952653885,
"rewards/rejected": -0.485474169254303,
"step": 1810
},
{
"epoch": 0.8735301175905927,
"grad_norm": 624.0,
"learning_rate": 2.3907571974460255e-07,
"logits/chosen": -2.400630235671997,
"logits/rejected": -2.220672369003296,
"logps/chosen": -5.255295753479004,
"logps/rejected": -6.702149868011475,
"loss": 16.3895,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -0.3260168433189392,
"rewards/margins": 0.14902091026306152,
"rewards/rejected": -0.47503766417503357,
"step": 1820
},
{
"epoch": 0.8783297336213103,
"grad_norm": 636.0,
"learning_rate": 2.2150905540292589e-07,
"logits/chosen": -2.4029934406280518,
"logits/rejected": -2.251993179321289,
"logps/chosen": -5.307319164276123,
"logps/rejected": -6.41394567489624,
"loss": 20.257,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.33146747946739197,
"rewards/margins": 0.11496575176715851,
"rewards/rejected": -0.4464332163333893,
"step": 1830
},
{
"epoch": 0.8831293496520278,
"grad_norm": 1104.0,
"learning_rate": 2.0458271200664626e-07,
"logits/chosen": -2.3530220985412598,
"logits/rejected": -2.3088059425354004,
"logps/chosen": -5.657820224761963,
"logps/rejected": -6.9473066329956055,
"loss": 18.6253,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3602226972579956,
"rewards/margins": 0.1361093521118164,
"rewards/rejected": -0.4963320791721344,
"step": 1840
},
{
"epoch": 0.8879289656827454,
"grad_norm": 764.0,
"learning_rate": 1.8830144634105206e-07,
"logits/chosen": -2.3955461978912354,
"logits/rejected": -2.2103283405303955,
"logps/chosen": -5.34378719329834,
"logps/rejected": -6.975235939025879,
"loss": 15.1707,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.32622575759887695,
"rewards/margins": 0.16939938068389893,
"rewards/rejected": -0.49562519788742065,
"step": 1850
},
{
"epoch": 0.8927285817134629,
"grad_norm": 512.0,
"learning_rate": 1.7266983390618997e-07,
"logits/chosen": -2.3531813621520996,
"logits/rejected": -2.204556465148926,
"logps/chosen": -5.405613899230957,
"logps/rejected": -6.852238655090332,
"loss": 16.7754,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3349411189556122,
"rewards/margins": 0.15225782990455627,
"rewards/rejected": -0.48719897866249084,
"step": 1860
},
{
"epoch": 0.8975281977441805,
"grad_norm": 1004.0,
"learning_rate": 1.5769226763101887e-07,
"logits/chosen": -2.3103063106536865,
"logits/rejected": -2.249112606048584,
"logps/chosen": -5.4530744552612305,
"logps/rejected": -6.606575965881348,
"loss": 20.5694,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3367684781551361,
"rewards/margins": 0.11924157291650772,
"rewards/rejected": -0.456010103225708,
"step": 1870
},
{
"epoch": 0.902327813774898,
"grad_norm": 652.0,
"learning_rate": 1.4337295663887086e-07,
"logits/chosen": -2.479418992996216,
"logits/rejected": -2.300482988357544,
"logps/chosen": -5.243393421173096,
"logps/rejected": -6.76312255859375,
"loss": 16.1228,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.3213045597076416,
"rewards/margins": 0.157609760761261,
"rewards/rejected": -0.4789143204689026,
"step": 1880
},
{
"epoch": 0.9071274298056156,
"grad_norm": 3776.0,
"learning_rate": 1.2971592506456799e-07,
"logits/chosen": -2.3731493949890137,
"logits/rejected": -2.271982431411743,
"logps/chosen": -5.576071262359619,
"logps/rejected": -7.10996150970459,
"loss": 17.8524,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.3627855181694031,
"rewards/margins": 0.14737771451473236,
"rewards/rejected": -0.5101632475852966,
"step": 1890
},
{
"epoch": 0.9119270458363331,
"grad_norm": 768.0,
"learning_rate": 1.1672501092352545e-07,
"logits/chosen": -2.416693687438965,
"logits/rejected": -2.273679256439209,
"logps/chosen": -5.268587589263916,
"logps/rejected": -6.44775915145874,
"loss": 18.9423,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.32210221886634827,
"rewards/margins": 0.11919095367193222,
"rewards/rejected": -0.4412931501865387,
"step": 1900
},
{
"epoch": 0.9167266618670507,
"grad_norm": 486.0,
"learning_rate": 1.0440386503315969e-07,
"logits/chosen": -2.2994284629821777,
"logits/rejected": -2.2361371517181396,
"logps/chosen": -5.450763702392578,
"logps/rejected": -6.711311340332031,
"loss": 18.5158,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.3464415669441223,
"rewards/margins": 0.12386713922023773,
"rewards/rejected": -0.47030869126319885,
"step": 1910
},
{
"epoch": 0.9215262778977682,
"grad_norm": 1248.0,
"learning_rate": 9.275594998690574e-08,
"logits/chosen": -2.3635449409484863,
"logits/rejected": -2.1750545501708984,
"logps/chosen": -5.208844184875488,
"logps/rejected": -6.700813293457031,
"loss": 16.5264,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.3230743110179901,
"rewards/margins": 0.14837422966957092,
"rewards/rejected": -0.47144851088523865,
"step": 1920
},
{
"epoch": 0.9263258939284857,
"grad_norm": 608.0,
"learning_rate": 8.178453918112783e-08,
"logits/chosen": -2.373126268386841,
"logits/rejected": -2.2234952449798584,
"logps/chosen": -5.264535427093506,
"logps/rejected": -6.703366279602051,
"loss": 17.3548,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.32319965958595276,
"rewards/margins": 0.1474277526140213,
"rewards/rejected": -0.4706273674964905,
"step": 1930
},
{
"epoch": 0.9311255099592033,
"grad_norm": 784.0,
"learning_rate": 7.149271589520167e-08,
"logits/chosen": -2.36932110786438,
"logits/rejected": -2.214989185333252,
"logps/chosen": -5.2449631690979,
"logps/rejected": -6.579471588134766,
"loss": 19.2316,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.32618004083633423,
"rewards/margins": 0.1317538321018219,
"rewards/rejected": -0.45793384313583374,
"step": 1940
},
{
"epoch": 0.9359251259899208,
"grad_norm": 616.0,
"learning_rate": 6.188337242502784e-08,
"logits/chosen": -2.3810677528381348,
"logits/rejected": -2.2315850257873535,
"logps/chosen": -5.400321006774902,
"logps/rejected": -6.946858882904053,
"loss": 17.7003,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.3430989384651184,
"rewards/margins": 0.1488824486732483,
"rewards/rejected": -0.4919814169406891,
"step": 1950
},
{
"epoch": 0.9407247420206384,
"grad_norm": 1056.0,
"learning_rate": 5.295920927021109e-08,
"logits/chosen": -2.438217878341675,
"logits/rejected": -2.2882134914398193,
"logps/chosen": -5.346053123474121,
"logps/rejected": -6.5962233543396,
"loss": 18.0238,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3248424530029297,
"rewards/margins": 0.1324876993894577,
"rewards/rejected": -0.4573301672935486,
"step": 1960
},
{
"epoch": 0.9455243580513559,
"grad_norm": 572.0,
"learning_rate": 4.472273437514357e-08,
"logits/chosen": -2.427647829055786,
"logits/rejected": -2.2796761989593506,
"logps/chosen": -5.448599815368652,
"logps/rejected": -6.87127685546875,
"loss": 18.4709,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3391854166984558,
"rewards/margins": 0.14958731830120087,
"rewards/rejected": -0.48877277970314026,
"step": 1970
},
{
"epoch": 0.9503239740820735,
"grad_norm": 472.0,
"learning_rate": 3.717626242420252e-08,
"logits/chosen": -2.3831515312194824,
"logits/rejected": -2.2835147380828857,
"logps/chosen": -5.475564956665039,
"logps/rejected": -6.520347595214844,
"loss": 20.1635,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3528803288936615,
"rewards/margins": 0.10237631946802139,
"rewards/rejected": -0.4552566409111023,
"step": 1980
},
{
"epoch": 0.955123590112791,
"grad_norm": 584.0,
"learning_rate": 3.03219141912553e-08,
"logits/chosen": -2.370147228240967,
"logits/rejected": -2.2424869537353516,
"logps/chosen": -5.509493350982666,
"logps/rejected": -6.655867099761963,
"loss": 19.492,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.34782880544662476,
"rewards/margins": 0.11750034987926483,
"rewards/rejected": -0.465329110622406,
"step": 1990
},
{
"epoch": 0.9599232061435086,
"grad_norm": 608.0,
"learning_rate": 2.4161615943664174e-08,
"logits/chosen": -2.410212993621826,
"logits/rejected": -2.3204267024993896,
"logps/chosen": -5.513731479644775,
"logps/rejected": -6.744071960449219,
"loss": 18.8851,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.35477545857429504,
"rewards/margins": 0.11902030557394028,
"rewards/rejected": -0.47379574179649353,
"step": 2000
},
{
"epoch": 0.9599232061435086,
"eval_logits/chosen": -2.403712511062622,
"eval_logits/rejected": -2.2712814807891846,
"eval_logps/chosen": -5.4227728843688965,
"eval_logps/rejected": -6.740372657775879,
"eval_loss": 18.409746170043945,
"eval_rewards/accuracies": 0.7380694150924683,
"eval_rewards/chosen": -0.3420771062374115,
"eval_rewards/margins": 0.131614550948143,
"eval_rewards/rejected": -0.47369158267974854,
"eval_runtime": 42.4596,
"eval_samples_per_second": 86.859,
"eval_steps_per_second": 21.715,
"step": 2000
},
{
"epoch": 0.9647228221742261,
"grad_norm": 466.0,
"learning_rate": 1.8697098900948285e-08,
"logits/chosen": -2.34519362449646,
"logits/rejected": -2.2365708351135254,
"logps/chosen": -5.304901123046875,
"logps/rejected": -6.641317844390869,
"loss": 17.3234,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.32782405614852905,
"rewards/margins": 0.13560935854911804,
"rewards/rejected": -0.4634334444999695,
"step": 2010
},
{
"epoch": 0.9695224382049437,
"grad_norm": 804.0,
"learning_rate": 1.392989874826195e-08,
"logits/chosen": -2.447765827178955,
"logits/rejected": -2.289475202560425,
"logps/chosen": -5.604885578155518,
"logps/rejected": -6.676812648773193,
"loss": 20.7508,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.36262819170951843,
"rewards/margins": 0.10569081455469131,
"rewards/rejected": -0.46831902861595154,
"step": 2020
},
{
"epoch": 0.9743220542356611,
"grad_norm": 552.0,
"learning_rate": 9.861355204825173e-09,
"logits/chosen": -2.4042015075683594,
"logits/rejected": -2.26240873336792,
"logps/chosen": -5.389679431915283,
"logps/rejected": -6.621462821960449,
"loss": 18.0717,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.3353418707847595,
"rewards/margins": 0.1277102828025818,
"rewards/rejected": -0.4630521237850189,
"step": 2030
},
{
"epoch": 0.9791216702663786,
"grad_norm": 532.0,
"learning_rate": 6.492611647420932e-09,
"logits/chosen": -2.400986671447754,
"logits/rejected": -2.2538914680480957,
"logps/chosen": -5.181879997253418,
"logps/rejected": -6.5074639320373535,
"loss": 17.3962,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.3201446533203125,
"rewards/margins": 0.13040900230407715,
"rewards/rejected": -0.45055365562438965,
"step": 2040
},
{
"epoch": 0.9839212862970962,
"grad_norm": 864.0,
"learning_rate": 3.8246147890763645e-09,
"logits/chosen": -2.442328453063965,
"logits/rejected": -2.2787184715270996,
"logps/chosen": -5.620997428894043,
"logps/rejected": -6.637421607971191,
"loss": 22.1172,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3605797588825226,
"rewards/margins": 0.09213758260011673,
"rewards/rejected": -0.4527173638343811,
"step": 2050
},
{
"epoch": 0.9887209023278137,
"grad_norm": 724.0,
"learning_rate": 1.8581144130089269e-09,
"logits/chosen": -2.357362985610962,
"logits/rejected": -2.289252519607544,
"logps/chosen": -5.391598701477051,
"logps/rejected": -6.834118843078613,
"loss": 19.477,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3401409983634949,
"rewards/margins": 0.14060457050800323,
"rewards/rejected": -0.4807455539703369,
"step": 2060
},
{
"epoch": 0.9935205183585313,
"grad_norm": 536.0,
"learning_rate": 5.936631619152256e-10,
"logits/chosen": -2.417983293533325,
"logits/rejected": -2.3095850944519043,
"logps/chosen": -5.147790908813477,
"logps/rejected": -6.479160308837891,
"loss": 17.8655,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.31320589780807495,
"rewards/margins": 0.13332203030586243,
"rewards/rejected": -0.446527898311615,
"step": 2070
},
{
"epoch": 0.9983201343892488,
"grad_norm": 728.0,
"learning_rate": 3.161638266302447e-11,
"logits/chosen": -2.452502727508545,
"logits/rejected": -2.293308973312378,
"logps/chosen": -5.578939914703369,
"logps/rejected": -6.6824493408203125,
"loss": 19.633,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3561863601207733,
"rewards/margins": 0.10789221525192261,
"rewards/rejected": -0.4640785753726959,
"step": 2080
},
{
"epoch": 0.9997600191984641,
"step": 2083,
"total_flos": 0.0,
"train_loss": 20.556188849835113,
"train_runtime": 1584.05,
"train_samples_per_second": 21.044,
"train_steps_per_second": 1.315
}
],
"logging_steps": 10,
"max_steps": 2083,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}