{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.26721891909947226, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005344378381989445, "grad_norm": 31.37949562072754, "learning_rate": 0.0, "logits/chosen": -0.2252655029296875, "logits/rejected": -0.192626953125, "logps/chosen": -110.828125, "logps/rejected": -115.515625, "logps/weighted_chosen": -0.31903076171875, "logps/weighted_rejected": -0.333709716796875, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "rewards/weighted_accuracies": 0.0, "rewards/weighted_chosen": 0.0, "rewards/weighted_margins": 0.0, "rewards/weighted_rejected": 0.0, "step": 1 }, { "epoch": 0.005344378381989445, "grad_norm": 75.76042938232422, "learning_rate": 4.787234042553191e-08, "logits/chosen": -0.3284708559513092, "logits/rejected": -0.3214448392391205, "logps/chosen": -134.46006774902344, "logps/rejected": -138.06597900390625, "logps/weighted_chosen": -0.3890923261642456, "logps/weighted_rejected": -0.4067721962928772, "loss": 0.6953, "rewards/accuracies": 0.2951388955116272, "rewards/chosen": -0.2072482705116272, "rewards/margins": -0.1790364533662796, "rewards/rejected": -0.0282118059694767, "rewards/weighted_accuracies": 0.3472222089767456, "rewards/weighted_chosen": -0.0032717387657612562, "rewards/weighted_margins": -0.0047516291961073875, "rewards/weighted_rejected": 0.0014813741436228156, "step": 10 }, { "epoch": 0.01068875676397889, "grad_norm": 24.94420623779297, "learning_rate": 1.0106382978723404e-07, "logits/chosen": -0.2780090272426605, "logits/rejected": -0.2689048647880554, "logps/chosen": -115.3070297241211, "logps/rejected": -114.8101577758789, "logps/weighted_chosen": -0.354583740234375, "logps/weighted_rejected": -0.36929017305374146, "loss": 0.6925, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": 0.0087890625, "rewards/margins": -0.02734375, "rewards/rejected": 0.0361328125, "rewards/weighted_accuracies": 0.4312500059604645, "rewards/weighted_chosen": 0.0016719817649573088, "rewards/weighted_margins": 0.0011638641590252519, "rewards/weighted_rejected": 0.0005052566411904991, "step": 20 }, { "epoch": 0.016033135145968335, "grad_norm": 26.90618133544922, "learning_rate": 1.5425531914893615e-07, "logits/chosen": -0.26707762479782104, "logits/rejected": -0.2697288393974304, "logps/chosen": -122.49687194824219, "logps/rejected": -128.2218780517578, "logps/weighted_chosen": -0.364663690328598, "logps/weighted_rejected": -0.40430909395217896, "loss": 0.6919, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": 0.099609375, "rewards/margins": 0.2177734375, "rewards/rejected": -0.1181640625, "rewards/weighted_accuracies": 0.4468750059604645, "rewards/weighted_chosen": 0.0011037830263376236, "rewards/weighted_margins": 0.0029600143898278475, "rewards/weighted_rejected": -0.0018524170154705644, "step": 30 }, { "epoch": 0.02137751352795778, "grad_norm": 19.056455612182617, "learning_rate": 2.074468085106383e-07, "logits/chosen": -0.31552428007125854, "logits/rejected": -0.309671014547348, "logps/chosen": -126.5132827758789, "logps/rejected": -127.7515640258789, "logps/weighted_chosen": -0.3717803955078125, "logps/weighted_rejected": -0.36720579862594604, "loss": 0.6927, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.041015625, "rewards/margins": 0.150390625, "rewards/rejected": -0.109375, "rewards/weighted_accuracies": 0.44062501192092896, "rewards/weighted_chosen": 0.0009314537164755166, "rewards/weighted_margins": 0.0009433746454305947, "rewards/weighted_rejected": -1.831054760259576e-05, "step": 40 }, { "epoch": 0.026721891909947223, "grad_norm": 94.1146469116211, "learning_rate": 2.6063829787234044e-07, "logits/chosen": -0.2799697816371918, "logits/rejected": -0.2664199769496918, "logps/chosen": -120.34375, "logps/rejected": -120.19062805175781, "logps/weighted_chosen": -0.367788702249527, "logps/weighted_rejected": -0.37299805879592896, "loss": 0.6934, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.01806640625, "rewards/margins": -0.11865234375, "rewards/rejected": 0.1005859375, "rewards/weighted_accuracies": 0.421875, "rewards/weighted_chosen": 0.0011091709602624178, "rewards/weighted_margins": -0.0005058288807049394, "rewards/weighted_rejected": 0.00161571498028934, "step": 50 }, { "epoch": 0.03206627029193667, "grad_norm": 47.161922454833984, "learning_rate": 3.1382978723404253e-07, "logits/chosen": -0.22172394394874573, "logits/rejected": -0.2157600373029709, "logps/chosen": -115.8382797241211, "logps/rejected": -118.75859069824219, "logps/weighted_chosen": -0.36602783203125, "logps/weighted_rejected": -0.369253545999527, "loss": 0.6908, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.10546875, "rewards/margins": -0.1259765625, "rewards/rejected": 0.0205078125, "rewards/weighted_accuracies": 0.4000000059604645, "rewards/weighted_chosen": 0.008862781338393688, "rewards/weighted_margins": 0.00811080913990736, "rewards/weighted_rejected": 0.0007405281066894531, "step": 60 }, { "epoch": 0.037410648673926114, "grad_norm": 42.38877868652344, "learning_rate": 3.6702127659574467e-07, "logits/chosen": -0.299722284078598, "logits/rejected": -0.29665374755859375, "logps/chosen": -114.1656265258789, "logps/rejected": -118.0765609741211, "logps/weighted_chosen": -0.35313719511032104, "logps/weighted_rejected": -0.3739013671875, "loss": 0.6949, "rewards/accuracies": 0.375, "rewards/chosen": -0.0693359375, "rewards/margins": -0.0458984375, "rewards/rejected": -0.0234375, "rewards/weighted_accuracies": 0.390625, "rewards/weighted_chosen": 0.0011390686267986894, "rewards/weighted_margins": -0.0028884888160973787, "rewards/weighted_rejected": 0.0040260315872728825, "step": 70 }, { "epoch": 0.04275502705591556, "grad_norm": 35.451927185058594, "learning_rate": 4.202127659574468e-07, "logits/chosen": -0.30262452363967896, "logits/rejected": -0.24024733901023865, "logps/chosen": -112.3812484741211, "logps/rejected": -111.8375015258789, "logps/weighted_chosen": -0.37481385469436646, "logps/weighted_rejected": -0.38435667753219604, "loss": 0.694, "rewards/accuracies": 0.390625, "rewards/chosen": -0.1552734375, "rewards/margins": -0.0341796875, "rewards/rejected": -0.12109375, "rewards/weighted_accuracies": 0.48750001192092896, "rewards/weighted_chosen": 0.01006317138671875, "rewards/weighted_margins": 0.0036018372047692537, "rewards/weighted_rejected": 0.006462156772613525, "step": 80 }, { "epoch": 0.048099405437905, "grad_norm": 42.264678955078125, "learning_rate": 4.734042553191489e-07, "logits/chosen": -0.2891853451728821, "logits/rejected": -0.23835448920726776, "logps/chosen": -118.0296859741211, "logps/rejected": -116.484375, "logps/weighted_chosen": -0.376077264547348, "logps/weighted_rejected": -0.3848114013671875, "loss": 0.6935, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.0458984375, "rewards/margins": 0.267578125, "rewards/rejected": -0.2216796875, "rewards/weighted_accuracies": 0.46562498807907104, "rewards/weighted_chosen": 0.016840171068906784, "rewards/weighted_margins": 0.014923477545380592, "rewards/weighted_rejected": 0.00187511439435184, "step": 90 }, { "epoch": 0.053443783819894446, "grad_norm": 36.87267303466797, "learning_rate": 5.26595744680851e-07, "logits/chosen": -0.3333755433559418, "logits/rejected": -0.28821104764938354, "logps/chosen": -124.0367202758789, "logps/rejected": -124.33906555175781, "logps/weighted_chosen": -0.39268797636032104, "logps/weighted_rejected": -0.4093261659145355, "loss": 0.6875, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": -0.0302734375, "rewards/margins": 0.36835938692092896, "rewards/rejected": -0.39863282442092896, "rewards/weighted_accuracies": 0.484375, "rewards/weighted_chosen": 0.012582575902342796, "rewards/weighted_margins": 0.026942063122987747, "rewards/weighted_rejected": -0.014329910278320312, "step": 100 }, { "epoch": 0.058788162201883896, "grad_norm": 17.8848876953125, "learning_rate": 5.797872340425531e-07, "logits/chosen": -0.3335327208042145, "logits/rejected": -0.32384032011032104, "logps/chosen": -117.6968765258789, "logps/rejected": -119.85859680175781, "logps/weighted_chosen": -0.35866087675094604, "logps/weighted_rejected": -0.37585145235061646, "loss": 0.7015, "rewards/accuracies": 0.3968749940395355, "rewards/chosen": -0.524609386920929, "rewards/margins": -0.01328125037252903, "rewards/rejected": -0.511523425579071, "rewards/weighted_accuracies": 0.4281249940395355, "rewards/weighted_chosen": -0.0057319640181958675, "rewards/weighted_margins": 0.0012493133544921875, "rewards/weighted_rejected": -0.0070056915283203125, "step": 110 }, { "epoch": 0.06413254058387334, "grad_norm": 37.190059661865234, "learning_rate": 6.329787234042553e-07, "logits/chosen": -0.29607391357421875, "logits/rejected": -0.2735137939453125, "logps/chosen": -120.81718444824219, "logps/rejected": -127.04219055175781, "logps/weighted_chosen": -0.41831666231155396, "logps/weighted_rejected": -0.42036741971969604, "loss": 0.7443, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.43964844942092896, "rewards/margins": 0.690234363079071, "rewards/rejected": -1.1298828125, "rewards/weighted_accuracies": 0.45625001192092896, "rewards/weighted_chosen": -0.07387389987707138, "rewards/weighted_margins": -0.04258232191205025, "rewards/weighted_rejected": -0.031409453600645065, "step": 120 }, { "epoch": 0.06947691896586278, "grad_norm": 22.76742172241211, "learning_rate": 6.861702127659574e-07, "logits/chosen": -0.30709609389305115, "logits/rejected": -0.29381561279296875, "logps/chosen": -120.3140640258789, "logps/rejected": -121.73515319824219, "logps/weighted_chosen": -0.38916015625, "logps/weighted_rejected": -0.38067322969436646, "loss": 0.6906, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.8837890625, "rewards/margins": 0.4273437559604645, "rewards/rejected": -1.310937523841858, "rewards/weighted_accuracies": 0.46875, "rewards/weighted_chosen": -0.0019147873390465975, "rewards/weighted_margins": 0.025261688977479935, "rewards/weighted_rejected": -0.027143806219100952, "step": 130 }, { "epoch": 0.07482129734785223, "grad_norm": 25.72498321533203, "learning_rate": 7.393617021276596e-07, "logits/chosen": -0.2590804994106293, "logits/rejected": -0.25146180391311646, "logps/chosen": -114.2992172241211, "logps/rejected": -118.26094055175781, "logps/weighted_chosen": -0.376434326171875, "logps/weighted_rejected": -0.39961546659469604, "loss": 0.6968, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": -1.7428710460662842, "rewards/margins": 1.043554663658142, "rewards/rejected": -2.7867188453674316, "rewards/weighted_accuracies": 0.46562498807907104, "rewards/weighted_chosen": -0.01796722412109375, "rewards/weighted_margins": 0.044054411351680756, "rewards/weighted_rejected": -0.062059782445430756, "step": 140 }, { "epoch": 0.08016567572984168, "grad_norm": 27.819217681884766, "learning_rate": 7.925531914893616e-07, "logits/chosen": -0.260824590921402, "logits/rejected": -0.2456924468278885, "logps/chosen": -118.3109359741211, "logps/rejected": -116.1898422241211, "logps/weighted_chosen": -0.3584175109863281, "logps/weighted_rejected": -0.389230340719223, "loss": 0.693, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.796679735183716, "rewards/margins": 1.470312476158142, "rewards/rejected": -4.267773628234863, "rewards/weighted_accuracies": 0.5406249761581421, "rewards/weighted_chosen": 0.02875671349465847, "rewards/weighted_margins": 0.08138389885425568, "rewards/weighted_rejected": -0.05276889726519585, "step": 150 }, { "epoch": 0.08551005411183112, "grad_norm": 20.792280197143555, "learning_rate": 8.457446808510637e-07, "logits/chosen": -0.27181702852249146, "logits/rejected": -0.26198044419288635, "logps/chosen": -119.7906265258789, "logps/rejected": -120.09687805175781, "logps/weighted_chosen": -0.3836608827114105, "logps/weighted_rejected": -0.40611571073532104, "loss": 0.6795, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -3.6748046875, "rewards/margins": 1.8689453601837158, "rewards/rejected": -5.542382717132568, "rewards/weighted_accuracies": 0.543749988079071, "rewards/weighted_chosen": -0.013747024349868298, "rewards/weighted_margins": 0.10787200927734375, "rewards/weighted_rejected": -0.12159118801355362, "step": 160 }, { "epoch": 0.09085443249382057, "grad_norm": 28.162086486816406, "learning_rate": 8.989361702127659e-07, "logits/chosen": -0.30354803800582886, "logits/rejected": -0.28291016817092896, "logps/chosen": -118.81172180175781, "logps/rejected": -123.3851547241211, "logps/weighted_chosen": -0.364785760641098, "logps/weighted_rejected": -0.40638428926467896, "loss": 0.7104, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -4.450781345367432, "rewards/margins": 1.540624976158142, "rewards/rejected": -5.989843845367432, "rewards/weighted_accuracies": 0.49687498807907104, "rewards/weighted_chosen": -0.05272483825683594, "rewards/weighted_margins": 0.03959999233484268, "rewards/weighted_rejected": -0.0922950729727745, "step": 170 }, { "epoch": 0.09619881087581, "grad_norm": 62.7450065612793, "learning_rate": 9.52127659574468e-07, "logits/chosen": -0.3086685240268707, "logits/rejected": -0.29756468534469604, "logps/chosen": -120.9000015258789, "logps/rejected": -121.6031265258789, "logps/weighted_chosen": -0.4021057188510895, "logps/weighted_rejected": -0.43016356229782104, "loss": 0.6902, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -5.364843845367432, "rewards/margins": 0.95654296875, "rewards/rejected": -6.323437690734863, "rewards/weighted_accuracies": 0.515625, "rewards/weighted_chosen": -0.07762374728918076, "rewards/weighted_margins": 0.042090605944395065, "rewards/weighted_rejected": -0.11983337253332138, "step": 180 }, { "epoch": 0.10154318925779945, "grad_norm": 107.93684387207031, "learning_rate": 9.99999129927477e-07, "logits/chosen": -0.2998809814453125, "logits/rejected": -0.2729034423828125, "logps/chosen": -121.68046569824219, "logps/rejected": -122.90547180175781, "logps/weighted_chosen": -0.4000488221645355, "logps/weighted_rejected": -0.49153441190719604, "loss": 0.6846, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": -7.084570407867432, "rewards/margins": 1.763671875, "rewards/rejected": -8.850390434265137, "rewards/weighted_accuracies": 0.5625, "rewards/weighted_chosen": -0.06708984076976776, "rewards/weighted_margins": 0.19685058295726776, "rewards/weighted_rejected": -0.2636512815952301, "step": 190 }, { "epoch": 0.10688756763978889, "grad_norm": 42.62810516357422, "learning_rate": 9.99894724888679e-07, "logits/chosen": -0.2994216978549957, "logits/rejected": -0.2775813937187195, "logps/chosen": -129.10311889648438, "logps/rejected": -127.71015930175781, "logps/weighted_chosen": -0.40317994356155396, "logps/weighted_rejected": -0.4718689024448395, "loss": 0.7052, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -8.422460556030273, "rewards/margins": 1.215234398841858, "rewards/rejected": -9.638280868530273, "rewards/weighted_accuracies": 0.518750011920929, "rewards/weighted_chosen": -0.10227356106042862, "rewards/weighted_margins": 0.08837012946605682, "rewards/weighted_rejected": -0.19074249267578125, "step": 200 }, { "epoch": 0.11223194602177834, "grad_norm": 56.36786651611328, "learning_rate": 9.996163469793475e-07, "logits/chosen": -0.3200393617153168, "logits/rejected": -0.28080445528030396, "logps/chosen": -127.65625, "logps/rejected": -122.5406265258789, "logps/weighted_chosen": -0.402487188577652, "logps/weighted_rejected": -0.4784179627895355, "loss": 0.6855, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -9.696874618530273, "rewards/margins": 3.138671875, "rewards/rejected": -12.8359375, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.07228164374828339, "rewards/weighted_margins": 0.1660926789045334, "rewards/weighted_rejected": -0.23847046494483948, "step": 210 }, { "epoch": 0.11757632440376779, "grad_norm": 24.45851707458496, "learning_rate": 9.991640930802883e-07, "logits/chosen": -0.30699461698532104, "logits/rejected": -0.3066558837890625, "logps/chosen": -125.8734359741211, "logps/rejected": -129.6999969482422, "logps/weighted_chosen": -0.41710203886032104, "logps/weighted_rejected": -0.4827117919921875, "loss": 0.6884, "rewards/accuracies": 0.546875, "rewards/chosen": -12.277539253234863, "rewards/margins": 2.571484327316284, "rewards/rejected": -14.851171493530273, "rewards/weighted_accuracies": 0.5218750238418579, "rewards/weighted_chosen": -0.12371826171875, "rewards/weighted_margins": 0.1390731781721115, "rewards/weighted_rejected": -0.26273268461227417, "step": 220 }, { "epoch": 0.12292070278575723, "grad_norm": 27.215944290161133, "learning_rate": 9.98538120584459e-07, "logits/chosen": -0.3107505738735199, "logits/rejected": -0.283193975687027, "logps/chosen": -134.8015594482422, "logps/rejected": -138.5890655517578, "logps/weighted_chosen": -0.4462524354457855, "logps/weighted_rejected": -0.511853039264679, "loss": 0.6939, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -15.346875190734863, "rewards/margins": 2.830273389816284, "rewards/rejected": -18.179492950439453, "rewards/weighted_accuracies": 0.5562499761581421, "rewards/weighted_chosen": -0.1530204713344574, "rewards/weighted_margins": 0.1789344847202301, "rewards/weighted_rejected": -0.3320491909980774, "step": 230 }, { "epoch": 0.12826508116774668, "grad_norm": 17.173702239990234, "learning_rate": 9.977386473421917e-07, "logits/chosen": -0.27986279129981995, "logits/rejected": -0.2772073745727539, "logps/chosen": -121.0453109741211, "logps/rejected": -124.65312194824219, "logps/weighted_chosen": -0.4528869688510895, "logps/weighted_rejected": -0.501416027545929, "loss": 0.7222, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -18.59375, "rewards/margins": 2.5132813453674316, "rewards/rejected": -21.106054306030273, "rewards/weighted_accuracies": 0.578125, "rewards/weighted_chosen": -0.2343955934047699, "rewards/weighted_margins": 0.07334060966968536, "rewards/weighted_rejected": -0.30777662992477417, "step": 240 }, { "epoch": 0.13360945954973613, "grad_norm": 24.67556381225586, "learning_rate": 9.96765951585378e-07, "logits/chosen": -0.262664794921875, "logits/rejected": -0.24544373154640198, "logps/chosen": -121.7421875, "logps/rejected": -126.42500305175781, "logps/weighted_chosen": -0.46705931425094604, "logps/weighted_rejected": -0.547576904296875, "loss": 0.6683, "rewards/accuracies": 0.5625, "rewards/chosen": -21.9228515625, "rewards/margins": 2.5238280296325684, "rewards/rejected": -24.447071075439453, "rewards/weighted_accuracies": 0.5687500238418579, "rewards/weighted_chosen": -0.2455238401889801, "rewards/weighted_margins": 0.1756332367658615, "rewards/weighted_rejected": -0.4210983216762543, "step": 250 }, { "epoch": 0.13895383793172555, "grad_norm": 33.55352783203125, "learning_rate": 9.956203718306388e-07, "logits/chosen": -0.18781813979148865, "logits/rejected": -0.156982421875, "logps/chosen": -127.8578109741211, "logps/rejected": -132.35546875, "logps/weighted_chosen": -0.4968322813510895, "logps/weighted_rejected": -0.5148254632949829, "loss": 0.7213, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -26.109960556030273, "rewards/margins": 2.660937547683716, "rewards/rejected": -28.761327743530273, "rewards/weighted_accuracies": 0.534375011920929, "rewards/weighted_chosen": -0.29717254638671875, "rewards/weighted_margins": 0.04170074313879013, "rewards/weighted_rejected": -0.3386779725551605, "step": 260 }, { "epoch": 0.144298216313715, "grad_norm": 242.56521606445312, "learning_rate": 9.943023067615136e-07, "logits/chosen": -0.17297974228858948, "logits/rejected": -0.1584724485874176, "logps/chosen": -139.23046875, "logps/rejected": -142.3390655517578, "logps/weighted_chosen": -0.48270875215530396, "logps/weighted_rejected": -0.560772716999054, "loss": 0.7059, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -33.55195236206055, "rewards/margins": 3.075976610183716, "rewards/rejected": -36.6328125, "rewards/weighted_accuracies": 0.4937500059604645, "rewards/weighted_chosen": -0.321615606546402, "rewards/weighted_margins": 0.13364562392234802, "rewards/weighted_rejected": -0.45517730712890625, "step": 270 }, { "epoch": 0.14964259469570446, "grad_norm": 16.8142147064209, "learning_rate": 9.928122150897112e-07, "logits/chosen": -0.21183013916015625, "logits/rejected": -0.168986514210701, "logps/chosen": -130.66250610351562, "logps/rejected": -132.0812530517578, "logps/weighted_chosen": -0.48672789335250854, "logps/weighted_rejected": -0.578961193561554, "loss": 0.6614, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -34.4287109375, "rewards/margins": 3.7144532203674316, "rewards/rejected": -38.127342224121094, "rewards/weighted_accuracies": 0.5687500238418579, "rewards/weighted_chosen": -0.28246229887008667, "rewards/weighted_margins": 0.23281364142894745, "rewards/weighted_rejected": -0.5149310827255249, "step": 280 }, { "epoch": 0.1549869730776939, "grad_norm": 17.110448837280273, "learning_rate": 9.9115061539547e-07, "logits/chosen": -0.20588979125022888, "logits/rejected": -0.18258285522460938, "logps/chosen": -141.0203094482422, "logps/rejected": -142.50625610351562, "logps/weighted_chosen": -0.4897003173828125, "logps/weighted_rejected": -0.5554351806640625, "loss": 0.7051, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -38.395896911621094, "rewards/margins": 3.002734422683716, "rewards/rejected": -41.394920349121094, "rewards/weighted_accuracies": 0.5406249761581421, "rewards/weighted_chosen": -0.3098343014717102, "rewards/weighted_margins": 0.1397857666015625, "rewards/weighted_rejected": -0.4498863220214844, "step": 290 }, { "epoch": 0.16033135145968336, "grad_norm": 21.960878372192383, "learning_rate": 9.893180859470818e-07, "logits/chosen": -0.1905662566423416, "logits/rejected": -0.16956177353858948, "logps/chosen": -131.609375, "logps/rejected": -133.328125, "logps/weighted_chosen": -0.5007995367050171, "logps/weighted_rejected": -0.5999816656112671, "loss": 0.6676, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -38.5810546875, "rewards/margins": 5.082226753234863, "rewards/rejected": -43.662498474121094, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": -0.27146607637405396, "rewards/weighted_margins": 0.2570602297782898, "rewards/weighted_rejected": -0.5287536382675171, "step": 300 }, { "epoch": 0.16567572984167278, "grad_norm": 45.954952239990234, "learning_rate": 9.873152644996424e-07, "logits/chosen": -0.23566055297851562, "logits/rejected": -0.23574523627758026, "logps/chosen": -134.5734405517578, "logps/rejected": -136.5500030517578, "logps/weighted_chosen": -0.565338134765625, "logps/weighted_rejected": -0.620849609375, "loss": 0.7314, "rewards/accuracies": 0.59375, "rewards/chosen": -44.349021911621094, "rewards/margins": 6.559765815734863, "rewards/rejected": -50.908592224121094, "rewards/weighted_accuracies": 0.606249988079071, "rewards/weighted_chosen": -0.44298553466796875, "rewards/weighted_margins": 0.1432647705078125, "rewards/weighted_rejected": -0.5857940912246704, "step": 310 }, { "epoch": 0.17102010822366223, "grad_norm": 22.280086517333984, "learning_rate": 9.85142848073103e-07, "logits/chosen": -0.2385093718767166, "logits/rejected": -0.21721191704273224, "logps/chosen": -138.27499389648438, "logps/rejected": -138.13516235351562, "logps/weighted_chosen": -0.5242675542831421, "logps/weighted_rejected": -0.591705322265625, "loss": 0.6903, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -48.820899963378906, "rewards/margins": 1.562109351158142, "rewards/rejected": -50.394920349121094, "rewards/weighted_accuracies": 0.5718749761581421, "rewards/weighted_chosen": -0.3802032470703125, "rewards/weighted_margins": 0.14343567192554474, "rewards/weighted_rejected": -0.523608386516571, "step": 320 }, { "epoch": 0.17636448660565168, "grad_norm": 19.125673294067383, "learning_rate": 9.828015927096914e-07, "logits/chosen": -0.2693939208984375, "logits/rejected": -0.23780974745750427, "logps/chosen": -148.56875610351562, "logps/rejected": -147.24453735351562, "logps/weighted_chosen": -0.521636962890625, "logps/weighted_rejected": -0.5547729730606079, "loss": 0.6703, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -50.613868713378906, "rewards/margins": 3.5126953125, "rewards/rejected": -54.113670349121094, "rewards/weighted_accuracies": 0.550000011920929, "rewards/weighted_chosen": -0.321145623922348, "rewards/weighted_margins": 0.18524780869483948, "rewards/weighted_rejected": -0.5064395666122437, "step": 330 }, { "epoch": 0.18170886498764113, "grad_norm": 14.343570709228516, "learning_rate": 9.802923132107968e-07, "logits/chosen": -0.25108033418655396, "logits/rejected": -0.2313240021467209, "logps/chosen": -149.78671264648438, "logps/rejected": -153.48046875, "logps/weighted_chosen": -0.562329113483429, "logps/weighted_rejected": -0.583233654499054, "loss": 0.741, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -58.06855392456055, "rewards/margins": 4.334374904632568, "rewards/rejected": -62.40312576293945, "rewards/weighted_accuracies": 0.512499988079071, "rewards/weighted_chosen": -0.47095948457717896, "rewards/weighted_margins": 0.02723388746380806, "rewards/weighted_rejected": -0.498382568359375, "step": 340 }, { "epoch": 0.18705324336963056, "grad_norm": 66.20745849609375, "learning_rate": 9.776158828534024e-07, "logits/chosen": -0.2837265133857727, "logits/rejected": -0.2537124752998352, "logps/chosen": -149.5359344482422, "logps/rejected": -194.4765625, "logps/weighted_chosen": -0.540771484375, "logps/weighted_rejected": -0.589801013469696, "loss": 0.6779, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -59.75273513793945, "rewards/margins": 7.256640434265137, "rewards/rejected": -66.99531555175781, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.4245468080043793, "rewards/weighted_margins": 0.146717831492424, "rewards/weighted_rejected": -0.571148693561554, "step": 350 }, { "epoch": 0.19239762175162, "grad_norm": 13.690327644348145, "learning_rate": 9.747732330861695e-07, "logits/chosen": -0.18397827446460724, "logits/rejected": -0.15533828735351562, "logps/chosen": -143.4562530517578, "logps/rejected": -145.44686889648438, "logps/weighted_chosen": -0.5499817132949829, "logps/weighted_rejected": -0.6820312738418579, "loss": 0.6644, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -63.58203125, "rewards/margins": 7.447851657867432, "rewards/rejected": -71.0308609008789, "rewards/weighted_accuracies": 0.6031249761581421, "rewards/weighted_chosen": -0.43936461210250854, "rewards/weighted_margins": 0.3239502012729645, "rewards/weighted_rejected": -0.7634918093681335, "step": 360 }, { "epoch": 0.19774200013360946, "grad_norm": 20.364688873291016, "learning_rate": 9.717653532052742e-07, "logits/chosen": -0.16991272568702698, "logits/rejected": -0.16076354682445526, "logps/chosen": -137.11172485351562, "logps/rejected": -146.09375, "logps/weighted_chosen": -0.609234631061554, "logps/weighted_rejected": -0.658831775188446, "loss": 0.7108, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -63.02363204956055, "rewards/margins": 9.1494140625, "rewards/rejected": -72.1488265991211, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": -0.5135604739189148, "rewards/weighted_margins": 0.1833236664533615, "rewards/weighted_rejected": -0.6966766119003296, "step": 370 }, { "epoch": 0.2030863785155989, "grad_norm": 15.459892272949219, "learning_rate": 9.685932900101146e-07, "logits/chosen": -0.17396697402000427, "logits/rejected": -0.1525276154279709, "logps/chosen": -143.62655639648438, "logps/rejected": -147.03515625, "logps/weighted_chosen": -0.584460437297821, "logps/weighted_rejected": -0.642120361328125, "loss": 0.7159, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -68.6537094116211, "rewards/margins": 5.928124904632568, "rewards/rejected": -74.57109069824219, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": -0.528277575969696, "rewards/weighted_margins": 0.11352996528148651, "rewards/weighted_rejected": -0.6417190432548523, "step": 380 }, { "epoch": 0.20843075689758836, "grad_norm": 25.710723876953125, "learning_rate": 9.652581474390043e-07, "logits/chosen": -0.17167052626609802, "logits/rejected": -0.14335784316062927, "logps/chosen": -147.3156280517578, "logps/rejected": -151.1484375, "logps/weighted_chosen": -0.5953735113143921, "logps/weighted_rejected": -0.649151623249054, "loss": 0.7014, "rewards/accuracies": 0.59375, "rewards/chosen": -70.93769836425781, "rewards/margins": 9.441015243530273, "rewards/rejected": -80.35664367675781, "rewards/weighted_accuracies": 0.612500011920929, "rewards/weighted_chosen": -0.5777953863143921, "rewards/weighted_margins": 0.10624237358570099, "rewards/weighted_rejected": -0.6836212277412415, "step": 390 }, { "epoch": 0.21377513527957778, "grad_norm": 42.97126007080078, "learning_rate": 9.61761086184981e-07, "logits/chosen": -0.192851260304451, "logits/rejected": -0.16070251166820526, "logps/chosen": -148.39688110351562, "logps/rejected": -149.6046905517578, "logps/weighted_chosen": -0.6315368413925171, "logps/weighted_rejected": -0.7087768316268921, "loss": 0.7113, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -73.91288757324219, "rewards/margins": 6.164453029632568, "rewards/rejected": -80.05976867675781, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.6400848627090454, "rewards/weighted_margins": 0.11194305121898651, "rewards/weighted_rejected": -0.752105712890625, "step": 400 }, { "epoch": 0.21911951366156723, "grad_norm": 17.99481773376465, "learning_rate": 9.581033232918629e-07, "logits/chosen": -0.14135894179344177, "logits/rejected": -0.11229457706212997, "logps/chosen": -145.88827514648438, "logps/rejected": -149.74063110351562, "logps/weighted_chosen": -0.6018310785293579, "logps/weighted_rejected": -0.7620849609375, "loss": 0.6764, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -74.9345703125, "rewards/margins": 8.443944931030273, "rewards/rejected": -83.36601257324219, "rewards/weighted_accuracies": 0.578125, "rewards/weighted_chosen": -0.578625500202179, "rewards/weighted_margins": 0.35536497831344604, "rewards/weighted_rejected": -0.9342681765556335, "step": 410 }, { "epoch": 0.22446389204355668, "grad_norm": 19.236024856567383, "learning_rate": 9.542861317306952e-07, "logits/chosen": -0.1445457488298416, "logits/rejected": -0.1324237883090973, "logps/chosen": -148.76718139648438, "logps/rejected": -150.97811889648438, "logps/weighted_chosen": -0.5991576910018921, "logps/weighted_rejected": -0.674072265625, "loss": 0.6735, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -80.28242492675781, "rewards/margins": 4.427929878234863, "rewards/rejected": -84.7109375, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.543652355670929, "rewards/weighted_margins": 0.184315487742424, "rewards/weighted_rejected": -0.727569580078125, "step": 420 }, { "epoch": 0.22980827042554614, "grad_norm": 14.300553321838379, "learning_rate": 9.503108399567308e-07, "logits/chosen": -0.14830398559570312, "logits/rejected": -0.09484557807445526, "logps/chosen": -162.40625, "logps/rejected": -167.7195281982422, "logps/weighted_chosen": -0.5840820074081421, "logps/weighted_rejected": -0.647021472454071, "loss": 0.672, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -89.32051086425781, "rewards/margins": 8.4365234375, "rewards/rejected": -97.79023742675781, "rewards/weighted_accuracies": 0.596875011920929, "rewards/weighted_chosen": -0.545318603515625, "rewards/weighted_margins": 0.13585510849952698, "rewards/weighted_rejected": -0.681243896484375, "step": 430 }, { "epoch": 0.23515264880753559, "grad_norm": 21.087541580200195, "learning_rate": 9.461788314471034e-07, "logits/chosen": -0.10236664116382599, "logits/rejected": -0.05696678161621094, "logps/chosen": -155.13827514648438, "logps/rejected": -159.828125, "logps/weighted_chosen": -0.6568237543106079, "logps/weighted_rejected": -0.7309814691543579, "loss": 0.6898, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -92.36328125, "rewards/margins": 7.942968845367432, "rewards/rejected": -100.3109359741211, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.6760101318359375, "rewards/weighted_margins": 0.16096191108226776, "rewards/weighted_rejected": -0.836883544921875, "step": 440 }, { "epoch": 0.240497027189525, "grad_norm": 19.655607223510742, "learning_rate": 9.418915442193509e-07, "logits/chosen": -0.10150299221277237, "logits/rejected": -0.05734825134277344, "logps/chosen": -152.95858764648438, "logps/rejected": -165.30078125, "logps/weighted_chosen": -0.6426635980606079, "logps/weighted_rejected": -0.69622802734375, "loss": 0.7073, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -89.14433288574219, "rewards/margins": 14.668359756469727, "rewards/rejected": -103.8412094116211, "rewards/weighted_accuracies": 0.565625011920929, "rewards/weighted_chosen": -0.694122314453125, "rewards/weighted_margins": 0.11862488090991974, "rewards/weighted_rejected": -0.8125030398368835, "step": 450 }, { "epoch": 0.24584140557151446, "grad_norm": 35.64816665649414, "learning_rate": 9.374504703309579e-07, "logits/chosen": -0.1730697602033615, "logits/rejected": -0.1376514434814453, "logps/chosen": -149.97811889648438, "logps/rejected": -152.8171844482422, "logps/weighted_chosen": -0.638531506061554, "logps/weighted_rejected": -0.77386474609375, "loss": 0.6563, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -86.3251953125, "rewards/margins": 8.056055068969727, "rewards/rejected": -94.4281234741211, "rewards/weighted_accuracies": 0.5625, "rewards/weighted_chosen": -0.6749175786972046, "rewards/weighted_margins": 0.30900877714157104, "rewards/weighted_rejected": -0.983563244342804, "step": 460 }, { "epoch": 0.2511857839535039, "grad_norm": 20.12373161315918, "learning_rate": 9.328571553600915e-07, "logits/chosen": -0.14519290626049042, "logits/rejected": -0.11081619560718536, "logps/chosen": -154.73828125, "logps/rejected": -158.703125, "logps/weighted_chosen": -0.656390368938446, "logps/weighted_rejected": -0.7307983636856079, "loss": 0.7214, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -92.1263656616211, "rewards/margins": 7.731054782867432, "rewards/rejected": -99.86836242675781, "rewards/weighted_accuracies": 0.565625011920929, "rewards/weighted_chosen": -0.6984283328056335, "rewards/weighted_margins": 0.14166870713233948, "rewards/weighted_rejected": -0.8403259515762329, "step": 470 }, { "epoch": 0.25653016233549336, "grad_norm": 19.16153907775879, "learning_rate": 9.281131978677106e-07, "logits/chosen": -0.1819503754377365, "logits/rejected": -0.14701232314109802, "logps/chosen": -160.57968139648438, "logps/rejected": -164.2414093017578, "logps/weighted_chosen": -0.609344482421875, "logps/weighted_rejected": -0.750903308391571, "loss": 0.6525, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -93.19140625, "rewards/margins": 9.908788681030273, "rewards/rejected": -103.0589828491211, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": -0.6482604742050171, "rewards/weighted_margins": 0.2539626955986023, "rewards/weighted_rejected": -0.9027160406112671, "step": 480 }, { "epoch": 0.2618745407174828, "grad_norm": 47.42090606689453, "learning_rate": 9.232202488412361e-07, "logits/chosen": -0.18560639023780823, "logits/rejected": -0.152149960398674, "logps/chosen": -150.15078735351562, "logps/rejected": -156.22109985351562, "logps/weighted_chosen": -0.6249145269393921, "logps/weighted_rejected": -0.755505383014679, "loss": 0.6793, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -89.3853530883789, "rewards/margins": 8.602734565734863, "rewards/rejected": -97.9830093383789, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.6580413579940796, "rewards/weighted_margins": 0.2508483827114105, "rewards/weighted_rejected": -0.909197986125946, "step": 490 }, { "epoch": 0.26721891909947226, "grad_norm": 21.92582893371582, "learning_rate": 9.181800111199766e-07, "logits/chosen": -0.2139892578125, "logits/rejected": -0.185211181640625, "logps/chosen": -153.05624389648438, "logps/rejected": -155.6671905517578, "logps/weighted_chosen": -0.660186767578125, "logps/weighted_rejected": -0.740765392780304, "loss": 0.6631, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -91.49101257324219, "rewards/margins": 12.721094131469727, "rewards/rejected": -104.1937484741211, "rewards/weighted_accuracies": 0.6156250238418579, "rewards/weighted_chosen": -0.679455578327179, "rewards/weighted_margins": 0.2384185791015625, "rewards/weighted_rejected": -0.9178100824356079, "step": 500 }, { "epoch": 0.26721891909947226, "eval_logits/chosen": -0.27163267135620117, "eval_logits/rejected": -0.24348750710487366, "eval_logps/chosen": -159.72760009765625, "eval_logps/rejected": -165.05091857910156, "eval_logps/weighted_chosen": -0.6352449059486389, "eval_logps/weighted_rejected": -0.7405111789703369, "eval_loss": 0.6863088607788086, "eval_rewards/accuracies": 0.5992871522903442, "eval_rewards/chosen": -97.6285629272461, "eval_rewards/margins": 10.883528709411621, "eval_rewards/rejected": -108.50712585449219, "eval_rewards/weighted_accuracies": 0.5972505211830139, "eval_rewards/weighted_chosen": -0.6794247031211853, "eval_rewards/weighted_margins": 0.20837070047855377, "eval_rewards/weighted_rejected": -0.8877954483032227, "eval_runtime": 1137.126, "eval_samples_per_second": 1.725, "eval_steps_per_second": 0.432, "step": 500 } ], "logging_steps": 10, "max_steps": 1872, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }