{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.26171159382360637, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005234231876472127, "grad_norm": 124.9239730834961, "learning_rate": 0.0, "logits/chosen": -0.4007568359375, "logits/rejected": -0.41790771484375, "logps/chosen": -297.796875, "logps/rejected": -248.078125, "logps/weighted_chosen": -4.70263671875, "logps/weighted_rejected": -3.35546875, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "rewards/weighted_accuracies": 0.0, "rewards/weighted_chosen": 0.0, "rewards/weighted_margins": 0.0, "rewards/weighted_rejected": 0.0, "step": 1 }, { "epoch": 0.005234231876472127, "grad_norm": 265.8483581542969, "learning_rate": 4.6875e-08, "logits/chosen": -0.3176540732383728, "logits/rejected": -0.3532511293888092, "logps/chosen": -275.5720520019531, "logps/rejected": -255.88714599609375, "logps/weighted_chosen": -2.5362956523895264, "logps/weighted_rejected": -2.774468421936035, "loss": 0.6903, "rewards/accuracies": 0.2743055522441864, "rewards/chosen": 0.0078125, "rewards/margins": 0.02973090298473835, "rewards/rejected": -0.02191840298473835, "rewards/weighted_accuracies": 0.3055555522441864, "rewards/weighted_chosen": 0.0025702582206577063, "rewards/weighted_margins": 0.0017564562149345875, "rewards/weighted_rejected": 0.0008138020639307797, "step": 10 }, { "epoch": 0.010468463752944255, "grad_norm": 17.298032760620117, "learning_rate": 9.895833333333332e-08, "logits/chosen": -0.2979484498500824, "logits/rejected": -0.3138488829135895, "logps/chosen": -294.40625, "logps/rejected": -272.69061279296875, "logps/weighted_chosen": -2.3424315452575684, "logps/weighted_rejected": -2.307080030441284, "loss": 0.6912, "rewards/accuracies": 0.296875, "rewards/chosen": 0.007617187686264515, "rewards/margins": 0.05781250074505806, "rewards/rejected": -0.05019531399011612, "rewards/weighted_accuracies": 0.3531250059604645, "rewards/weighted_chosen": 0.0003913879336323589, "rewards/weighted_margins": 0.00116729736328125, "rewards/weighted_rejected": -0.0007759094005450606, "step": 20 }, { "epoch": 0.015702695629416383, "grad_norm": 79.90486907958984, "learning_rate": 1.5104166666666664e-07, "logits/chosen": -0.2918685972690582, "logits/rejected": -0.3376312255859375, "logps/chosen": -298.02032470703125, "logps/rejected": -268.0210876464844, "logps/weighted_chosen": -2.290698289871216, "logps/weighted_rejected": -2.578540086746216, "loss": 0.6921, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": 0.0751953125, "rewards/margins": 0.10107421875, "rewards/rejected": -0.02587890625, "rewards/weighted_accuracies": 0.375, "rewards/weighted_chosen": 0.0023864745162427425, "rewards/weighted_margins": -9.1552734375e-05, "rewards/weighted_rejected": 0.0024780272506177425, "step": 30 }, { "epoch": 0.02093692750588851, "grad_norm": 32.818607330322266, "learning_rate": 2.03125e-07, "logits/chosen": -0.3010299801826477, "logits/rejected": -0.3435005247592926, "logps/chosen": -278.60235595703125, "logps/rejected": -253.9250030517578, "logps/weighted_chosen": -2.3670654296875, "logps/weighted_rejected": -2.760937452316284, "loss": 0.6923, "rewards/accuracies": 0.34375, "rewards/chosen": 0.12041015923023224, "rewards/margins": 0.13857421278953552, "rewards/rejected": -0.01816406287252903, "rewards/weighted_accuracies": 0.4156250059604645, "rewards/weighted_chosen": 0.0007911681896075606, "rewards/weighted_margins": -0.00026035308837890625, "rewards/weighted_rejected": 0.0010513305896893144, "step": 40 }, { "epoch": 0.02617115938236064, "grad_norm": 17.478111267089844, "learning_rate": 2.552083333333333e-07, "logits/chosen": -0.2825515866279602, "logits/rejected": -0.3209354281425476, "logps/chosen": -280.23126220703125, "logps/rejected": -267.45001220703125, "logps/weighted_chosen": -2.329760789871216, "logps/weighted_rejected": -2.414306640625, "loss": 0.6877, "rewards/accuracies": 0.30937498807907104, "rewards/chosen": 0.01835937425494194, "rewards/margins": -0.06035156175494194, "rewards/rejected": 0.07871093600988388, "rewards/weighted_accuracies": 0.41874998807907104, "rewards/weighted_chosen": 0.0070404051803052425, "rewards/weighted_margins": 0.00885009765625, "rewards/weighted_rejected": -0.0018066406482830644, "step": 50 }, { "epoch": 0.031405391258832765, "grad_norm": 39.04142761230469, "learning_rate": 3.0729166666666665e-07, "logits/chosen": -0.3157821595668793, "logits/rejected": -0.30972975492477417, "logps/chosen": -277.58123779296875, "logps/rejected": -261.6187438964844, "logps/weighted_chosen": -2.753100633621216, "logps/weighted_rejected": -2.6282715797424316, "loss": 0.6846, "rewards/accuracies": 0.34687501192092896, "rewards/chosen": 0.13593749701976776, "rewards/margins": 0.02968749962747097, "rewards/rejected": 0.10625000298023224, "rewards/weighted_accuracies": 0.4937500059604645, "rewards/weighted_chosen": 0.02093353308737278, "rewards/weighted_margins": 0.015999604016542435, "rewards/weighted_rejected": 0.0049346922896802425, "step": 60 }, { "epoch": 0.036639623135304895, "grad_norm": 73.14978790283203, "learning_rate": 3.59375e-07, "logits/chosen": -0.32021790742874146, "logits/rejected": -0.32699280977249146, "logps/chosen": -289.6148376464844, "logps/rejected": -244.8367156982422, "logps/weighted_chosen": -2.221527099609375, "logps/weighted_rejected": -2.581958055496216, "loss": 0.6775, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 0.43046873807907104, "rewards/margins": 0.5244140625, "rewards/rejected": -0.09394530951976776, "rewards/weighted_accuracies": 0.53125, "rewards/weighted_chosen": 0.030600737780332565, "rewards/weighted_margins": 0.034238435328006744, "rewards/weighted_rejected": -0.0036373138427734375, "step": 70 }, { "epoch": 0.04187385501177702, "grad_norm": 65.60453796386719, "learning_rate": 4.114583333333333e-07, "logits/chosen": -0.28860169649124146, "logits/rejected": -0.3099655210971832, "logps/chosen": -289.33984375, "logps/rejected": -270.3453063964844, "logps/weighted_chosen": -2.456127882003784, "logps/weighted_rejected": -2.6986327171325684, "loss": 0.6733, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.509765625, "rewards/margins": 0.56396484375, "rewards/rejected": -0.05419921875, "rewards/weighted_accuracies": 0.5625, "rewards/weighted_chosen": 0.04326324537396431, "rewards/weighted_margins": 0.05183257907629013, "rewards/weighted_rejected": -0.008570862002670765, "step": 80 }, { "epoch": 0.04710808688824915, "grad_norm": 30.41583824157715, "learning_rate": 4.6354166666666664e-07, "logits/chosen": -0.3197769224643707, "logits/rejected": -0.3126419186592102, "logps/chosen": -280.0874938964844, "logps/rejected": -257.95782470703125, "logps/weighted_chosen": -2.6199097633361816, "logps/weighted_rejected": -2.705517530441284, "loss": 0.6743, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.5191406011581421, "rewards/margins": 0.6358398199081421, "rewards/rejected": -0.11669921875, "rewards/weighted_accuracies": 0.5718749761581421, "rewards/weighted_chosen": 0.07943420112133026, "rewards/weighted_margins": 0.07568969577550888, "rewards/weighted_rejected": 0.003753662109375, "step": 90 }, { "epoch": 0.05234231876472128, "grad_norm": 61.700382232666016, "learning_rate": 5.156249999999999e-07, "logits/chosen": -0.2895751893520355, "logits/rejected": -0.3621200621128082, "logps/chosen": -290.88592529296875, "logps/rejected": -286.73126220703125, "logps/weighted_chosen": -2.083447217941284, "logps/weighted_rejected": -2.6861205101013184, "loss": 0.6862, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 0.10849609225988388, "rewards/margins": 0.94482421875, "rewards/rejected": -0.8363281488418579, "rewards/weighted_accuracies": 0.574999988079071, "rewards/weighted_chosen": 0.05786743015050888, "rewards/weighted_margins": 0.058258056640625, "rewards/weighted_rejected": -0.0002998351992573589, "step": 100 }, { "epoch": 0.05757655064119341, "grad_norm": 33.45664978027344, "learning_rate": 5.677083333333333e-07, "logits/chosen": -0.3412017822265625, "logits/rejected": -0.3312934935092926, "logps/chosen": -296.8671875, "logps/rejected": -262.3460998535156, "logps/weighted_chosen": -2.776904344558716, "logps/weighted_rejected": -2.8023438453674316, "loss": 0.6835, "rewards/accuracies": 0.546875, "rewards/chosen": -0.4873046875, "rewards/margins": 1.0051758289337158, "rewards/rejected": -1.4924805164337158, "rewards/weighted_accuracies": 0.5062500238418579, "rewards/weighted_chosen": 0.01709899865090847, "rewards/weighted_margins": 0.05614929273724556, "rewards/weighted_rejected": -0.03909149020910263, "step": 110 }, { "epoch": 0.06281078251766553, "grad_norm": 61.74055862426758, "learning_rate": 6.197916666666666e-07, "logits/chosen": -0.3482359051704407, "logits/rejected": -0.3736562728881836, "logps/chosen": -295.49530029296875, "logps/rejected": -256.08282470703125, "logps/weighted_chosen": -2.066760301589966, "logps/weighted_rejected": -2.2858643531799316, "loss": 0.6833, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.8194335699081421, "rewards/margins": 1.502539038658142, "rewards/rejected": -2.321972608566284, "rewards/weighted_accuracies": 0.5874999761581421, "rewards/weighted_chosen": 0.01521911658346653, "rewards/weighted_margins": 0.05682678148150444, "rewards/weighted_rejected": -0.04169006273150444, "step": 120 }, { "epoch": 0.06804501439413765, "grad_norm": 45.78140640258789, "learning_rate": 6.718749999999999e-07, "logits/chosen": -0.3164505064487457, "logits/rejected": -0.3121093809604645, "logps/chosen": -306.4761657714844, "logps/rejected": -279.87030029296875, "logps/weighted_chosen": -2.1032347679138184, "logps/weighted_rejected": -2.432788133621216, "loss": 0.6718, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4637207090854645, "rewards/margins": 1.8840820789337158, "rewards/rejected": -2.347851514816284, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": 0.01874847337603569, "rewards/weighted_margins": 0.07195129245519638, "rewards/weighted_rejected": -0.05319824069738388, "step": 130 }, { "epoch": 0.07327924627060979, "grad_norm": 47.97172546386719, "learning_rate": 7.239583333333333e-07, "logits/chosen": -0.36130523681640625, "logits/rejected": -0.3767889142036438, "logps/chosen": -300.2398376464844, "logps/rejected": -276.47344970703125, "logps/weighted_chosen": -2.402172803878784, "logps/weighted_rejected": -2.704345703125, "loss": 0.6601, "rewards/accuracies": 0.625, "rewards/chosen": -1.6025390625, "rewards/margins": 2.2777342796325684, "rewards/rejected": -3.8802733421325684, "rewards/weighted_accuracies": 0.628125011920929, "rewards/weighted_chosen": 0.0066894530318677425, "rewards/weighted_margins": 0.10413207858800888, "rewards/weighted_rejected": -0.0973968505859375, "step": 140 }, { "epoch": 0.07851347814708191, "grad_norm": 18.48399543762207, "learning_rate": 7.760416666666666e-07, "logits/chosen": -0.3260757327079773, "logits/rejected": -0.33597564697265625, "logps/chosen": -286.6578063964844, "logps/rejected": -255.8078155517578, "logps/weighted_chosen": -2.939697265625, "logps/weighted_rejected": -2.9384522438049316, "loss": 0.6693, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.329296827316284, "rewards/margins": 2.532031297683716, "rewards/rejected": -4.861328125, "rewards/weighted_accuracies": 0.625, "rewards/weighted_chosen": 0.01940460130572319, "rewards/weighted_margins": 0.12819823622703552, "rewards/weighted_rejected": -0.10869445651769638, "step": 150 }, { "epoch": 0.08374771002355404, "grad_norm": 16.847576141357422, "learning_rate": 8.28125e-07, "logits/chosen": -0.37299805879592896, "logits/rejected": -0.4207763671875, "logps/chosen": -306.01873779296875, "logps/rejected": -279.0289001464844, "logps/weighted_chosen": -2.4432616233825684, "logps/weighted_rejected": -2.4922118186950684, "loss": 0.631, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -2.1587891578674316, "rewards/margins": 3.69091796875, "rewards/rejected": -5.849707126617432, "rewards/weighted_accuracies": 0.637499988079071, "rewards/weighted_chosen": 0.04796142503619194, "rewards/weighted_margins": 0.21007385849952698, "rewards/weighted_rejected": -0.1621856689453125, "step": 160 }, { "epoch": 0.08898194190002617, "grad_norm": 237.514404296875, "learning_rate": 8.802083333333333e-07, "logits/chosen": -0.42098236083984375, "logits/rejected": -0.4231323301792145, "logps/chosen": -306.859375, "logps/rejected": -266.03125, "logps/weighted_chosen": -2.6180663108825684, "logps/weighted_rejected": -2.9256348609924316, "loss": 0.709, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -3.73828125, "rewards/margins": 4.098242282867432, "rewards/rejected": -7.836328029632568, "rewards/weighted_accuracies": 0.6031249761581421, "rewards/weighted_chosen": 0.06289978325366974, "rewards/weighted_margins": 0.186981201171875, "rewards/weighted_rejected": -0.12393493950366974, "step": 170 }, { "epoch": 0.0942161737764983, "grad_norm": 52.46952819824219, "learning_rate": 9.322916666666666e-07, "logits/chosen": -0.38860780000686646, "logits/rejected": -0.40771445631980896, "logps/chosen": -280.2789001464844, "logps/rejected": -267.69219970703125, "logps/weighted_chosen": -2.579449415206909, "logps/weighted_rejected": -3.0037841796875, "loss": 0.6853, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -6.609179496765137, "rewards/margins": 4.819726467132568, "rewards/rejected": -11.427831649780273, "rewards/weighted_accuracies": 0.6031249761581421, "rewards/weighted_chosen": -0.04149780422449112, "rewards/weighted_margins": 0.20274047553539276, "rewards/weighted_rejected": -0.24451598525047302, "step": 180 }, { "epoch": 0.09945040565297043, "grad_norm": 37.741477966308594, "learning_rate": 9.84375e-07, "logits/chosen": -0.4061637818813324, "logits/rejected": -0.4410812258720398, "logps/chosen": -314.60626220703125, "logps/rejected": -278.87811279296875, "logps/weighted_chosen": -2.378369092941284, "logps/weighted_rejected": -2.7929444313049316, "loss": 0.6517, "rewards/accuracies": 0.625, "rewards/chosen": -8.988476753234863, "rewards/margins": 6.061327934265137, "rewards/rejected": -15.050390243530273, "rewards/weighted_accuracies": 0.621874988079071, "rewards/weighted_chosen": -0.11425475776195526, "rewards/weighted_margins": 0.22973021864891052, "rewards/weighted_rejected": -0.3441314697265625, "step": 190 }, { "epoch": 0.10468463752944256, "grad_norm": 76.11390686035156, "learning_rate": 9.99959085414323e-07, "logits/chosen": -0.4347488284111023, "logits/rejected": -0.46932679414749146, "logps/chosen": -319.34686279296875, "logps/rejected": -274.1890563964844, "logps/weighted_chosen": -2.6759276390075684, "logps/weighted_rejected": -3.110583543777466, "loss": 0.6539, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -10.064160346984863, "rewards/margins": 6.332421779632568, "rewards/rejected": -16.395898818969727, "rewards/weighted_accuracies": 0.612500011920929, "rewards/weighted_chosen": -0.08293457329273224, "rewards/weighted_margins": 0.25919800996780396, "rewards/weighted_rejected": -0.34211426973342896, "step": 200 }, { "epoch": 0.10991886940591468, "grad_norm": 19.709537506103516, "learning_rate": 9.997587035630105e-07, "logits/chosen": -0.4373016357421875, "logits/rejected": -0.47842711210250854, "logps/chosen": -298.6578063964844, "logps/rejected": -305.07733154296875, "logps/weighted_chosen": -2.4120116233825684, "logps/weighted_rejected": -3.137524366378784, "loss": 0.6448, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -11.547070503234863, "rewards/margins": 8.876855850219727, "rewards/rejected": -20.425390243530273, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -0.12418060004711151, "rewards/weighted_margins": 0.24833527207374573, "rewards/weighted_rejected": -0.37247925996780396, "step": 210 }, { "epoch": 0.11515310128238682, "grad_norm": 39.1947135925293, "learning_rate": 9.99391406364405e-07, "logits/chosen": -0.424050897359848, "logits/rejected": -0.4263656735420227, "logps/chosen": -306.5453186035156, "logps/rejected": -291.21405029296875, "logps/weighted_chosen": -2.8793702125549316, "logps/weighted_rejected": -3.2673583030700684, "loss": 0.6969, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -14.446874618530273, "rewards/margins": 7.897851467132568, "rewards/rejected": -22.343358993530273, "rewards/weighted_accuracies": 0.606249988079071, "rewards/weighted_chosen": -0.2553466856479645, "rewards/weighted_margins": 0.21260376274585724, "rewards/weighted_rejected": -0.46800535917282104, "step": 220 }, { "epoch": 0.12038733315885894, "grad_norm": 437.028564453125, "learning_rate": 9.988573164927884e-07, "logits/chosen": -0.3824310302734375, "logits/rejected": -0.4174056947231293, "logps/chosen": -282.2710876464844, "logps/rejected": -277.2734375, "logps/weighted_chosen": -2.547900438308716, "logps/weighted_rejected": -2.85693359375, "loss": 0.7367, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -13.438867568969727, "rewards/margins": 12.128320693969727, "rewards/rejected": -25.565820693969727, "rewards/weighted_accuracies": 0.621874988079071, "rewards/weighted_chosen": -0.24434813857078552, "rewards/weighted_margins": 0.2164306640625, "rewards/weighted_rejected": -0.4607299864292145, "step": 230 }, { "epoch": 0.12562156503533106, "grad_norm": 139.88975524902344, "learning_rate": 9.98156612329838e-07, "logits/chosen": -0.4804244935512543, "logits/rejected": -0.527056872844696, "logps/chosen": -279.6773376464844, "logps/rejected": -310.64532470703125, "logps/weighted_chosen": -2.464794874191284, "logps/weighted_rejected": -3.058520555496216, "loss": 0.6735, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -14.966699600219727, "rewards/margins": 13.450976371765137, "rewards/rejected": -28.419336318969727, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.11341552436351776, "rewards/weighted_margins": 0.26569825410842896, "rewards/weighted_rejected": -0.37919312715530396, "step": 240 }, { "epoch": 0.13085579691180318, "grad_norm": 939.5078125, "learning_rate": 9.97289527905053e-07, "logits/chosen": -0.4899856448173523, "logits/rejected": -0.4979141354560852, "logps/chosen": -281.9039001464844, "logps/rejected": -282.93438720703125, "logps/weighted_chosen": -2.8607420921325684, "logps/weighted_rejected": -3.0235595703125, "loss": 0.7231, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -17.384668350219727, "rewards/margins": 9.600194931030273, "rewards/rejected": -26.986328125, "rewards/weighted_accuracies": 0.5843750238418579, "rewards/weighted_chosen": -0.20379944145679474, "rewards/weighted_margins": 0.18174438178539276, "rewards/weighted_rejected": -0.3856750428676605, "step": 250 }, { "epoch": 0.1360900287882753, "grad_norm": 24.300647735595703, "learning_rate": 9.962563528175875e-07, "logits/chosen": -0.4584365785121918, "logits/rejected": -0.4911254942417145, "logps/chosen": -317.06329345703125, "logps/rejected": -289.06561279296875, "logps/weighted_chosen": -2.4532713890075684, "logps/weighted_rejected": -3.330212354660034, "loss": 0.6636, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -18.153711318969727, "rewards/margins": 10.150195121765137, "rewards/rejected": -28.3046875, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": -0.18357238173484802, "rewards/weighted_margins": 0.269927978515625, "rewards/weighted_rejected": -0.4535202085971832, "step": 260 }, { "epoch": 0.14132426066474746, "grad_norm": 228.46092224121094, "learning_rate": 9.950574321395277e-07, "logits/chosen": -0.4871993958950043, "logits/rejected": -0.5071777105331421, "logps/chosen": -310.2124938964844, "logps/rejected": -291.5687561035156, "logps/weighted_chosen": -2.747607469558716, "logps/weighted_rejected": -3.0499024391174316, "loss": 0.6525, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -20.7822265625, "rewards/margins": 7.598730564117432, "rewards/rejected": -28.382030487060547, "rewards/weighted_accuracies": 0.6499999761581421, "rewards/weighted_chosen": -0.2510986328125, "rewards/weighted_margins": 0.2596893310546875, "rewards/weighted_rejected": -0.5105804204940796, "step": 270 }, { "epoch": 0.14655849254121958, "grad_norm": 713.1409912109375, "learning_rate": 9.936931663006413e-07, "logits/chosen": -0.4856887757778168, "logits/rejected": -0.4791274964809418, "logps/chosen": -323.875, "logps/rejected": -312.1109313964844, "logps/weighted_chosen": -2.7660155296325684, "logps/weighted_rejected": -3.293591260910034, "loss": 0.6155, "rewards/accuracies": 0.671875, "rewards/chosen": -20.981639862060547, "rewards/margins": 11.869140625, "rewards/rejected": -32.850975036621094, "rewards/weighted_accuracies": 0.675000011920929, "rewards/weighted_chosen": -0.13939818739891052, "rewards/weighted_margins": 0.34244996309280396, "rewards/weighted_rejected": -0.48178404569625854, "step": 280 }, { "epoch": 0.1517927244176917, "grad_norm": 10845.4873046875, "learning_rate": 9.921640109546357e-07, "logits/chosen": -0.433615118265152, "logits/rejected": -0.5038871765136719, "logps/chosen": -285.6937561035156, "logps/rejected": -280.76251220703125, "logps/weighted_chosen": -2.63427734375, "logps/weighted_rejected": -3.711669921875, "loss": 0.65, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -18.829492568969727, "rewards/margins": 10.345703125, "rewards/rejected": -29.1728515625, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": -0.12302856147289276, "rewards/weighted_margins": 0.3319152891635895, "rewards/weighted_rejected": -0.45494383573532104, "step": 290 }, { "epoch": 0.15702695629416383, "grad_norm": 47.70283889770508, "learning_rate": 9.90470476826975e-07, "logits/chosen": -0.4355606138706207, "logits/rejected": -0.4278717041015625, "logps/chosen": -288.78045654296875, "logps/rejected": -296.9765625, "logps/weighted_chosen": -2.532849073410034, "logps/weighted_rejected": -3.0221924781799316, "loss": 0.6674, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -19.346094131469727, "rewards/margins": 10.255078315734863, "rewards/rejected": -29.59765625, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": -0.21412047743797302, "rewards/weighted_margins": 0.2544006407260895, "rewards/weighted_rejected": -0.4681999087333679, "step": 300 }, { "epoch": 0.16226118817063595, "grad_norm": 644.2051391601562, "learning_rate": 9.886131295443002e-07, "logits/chosen": -0.462362676858902, "logits/rejected": -0.5182906985282898, "logps/chosen": -301.2421875, "logps/rejected": -280.125, "logps/weighted_chosen": -2.5293211936950684, "logps/weighted_rejected": -2.9417481422424316, "loss": 0.6339, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -19.940235137939453, "rewards/margins": 9.952539443969727, "rewards/rejected": -29.892187118530273, "rewards/weighted_accuracies": 0.65625, "rewards/weighted_chosen": 0.0126800537109375, "rewards/weighted_margins": 0.4334716796875, "rewards/weighted_rejected": -0.42060548067092896, "step": 310 }, { "epoch": 0.16749542004710807, "grad_norm": 42.43592834472656, "learning_rate": 9.865925894455166e-07, "logits/chosen": -0.538470447063446, "logits/rejected": -0.5473419427871704, "logps/chosen": -323.84686279296875, "logps/rejected": -290.7406311035156, "logps/weighted_chosen": -2.8351807594299316, "logps/weighted_rejected": -3.2470703125, "loss": 0.6762, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -23.906444549560547, "rewards/margins": 9.430078506469727, "rewards/rejected": -33.3359375, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": -0.23797301948070526, "rewards/weighted_margins": 0.272064208984375, "rewards/weighted_rejected": -0.5102294683456421, "step": 320 }, { "epoch": 0.17272965192358022, "grad_norm": 46.229000091552734, "learning_rate": 9.84409531374603e-07, "logits/chosen": -0.5096954107284546, "logits/rejected": -0.4907272458076477, "logps/chosen": -332.18280029296875, "logps/rejected": -298.5718688964844, "logps/weighted_chosen": -2.775341749191284, "logps/weighted_rejected": -3.165026903152466, "loss": 0.6741, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -27.408203125, "rewards/margins": 8.212109565734863, "rewards/rejected": -35.61640548706055, "rewards/weighted_accuracies": 0.6312500238418579, "rewards/weighted_chosen": -0.17764969170093536, "rewards/weighted_margins": 0.241455078125, "rewards/weighted_rejected": -0.41897353529930115, "step": 330 }, { "epoch": 0.17796388380005235, "grad_norm": 70.134521484375, "learning_rate": 9.820646844552219e-07, "logits/chosen": -0.5079803466796875, "logits/rejected": -0.5622657537460327, "logps/chosen": -301.3257751464844, "logps/rejected": -303.1468811035156, "logps/weighted_chosen": -2.852612257003784, "logps/weighted_rejected": -3.0445556640625, "loss": 0.6485, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -24.6875, "rewards/margins": 12.933984756469727, "rewards/rejected": -37.61992263793945, "rewards/weighted_accuracies": 0.65625, "rewards/weighted_chosen": -0.19311293959617615, "rewards/weighted_margins": 0.3020080626010895, "rewards/weighted_rejected": -0.4951171875, "step": 340 }, { "epoch": 0.18319811567652447, "grad_norm": 37.5682373046875, "learning_rate": 9.795588318471964e-07, "logits/chosen": -0.5787566900253296, "logits/rejected": -0.5869846343994141, "logps/chosen": -283.0796813964844, "logps/rejected": -310.00469970703125, "logps/weighted_chosen": -2.5658936500549316, "logps/weighted_rejected": -2.95947265625, "loss": 0.6307, "rewards/accuracies": 0.625, "rewards/chosen": -23.71875, "rewards/margins": 9.896288871765137, "rewards/rejected": -33.61249923706055, "rewards/weighted_accuracies": 0.6468750238418579, "rewards/weighted_chosen": -0.1531631499528885, "rewards/weighted_margins": 0.32664793729782104, "rewards/weighted_rejected": -0.479653924703598, "step": 350 }, { "epoch": 0.1884323475529966, "grad_norm": 145.69015502929688, "learning_rate": 9.768928104849415e-07, "logits/chosen": -0.6053832769393921, "logits/rejected": -0.6036437749862671, "logps/chosen": -306.9140625, "logps/rejected": -283.77655029296875, "logps/weighted_chosen": -2.835888624191284, "logps/weighted_rejected": -2.9885010719299316, "loss": 0.6741, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -24.237499237060547, "rewards/margins": 10.624218940734863, "rewards/rejected": -34.85859298706055, "rewards/weighted_accuracies": 0.640625, "rewards/weighted_chosen": -0.1663818359375, "rewards/weighted_margins": 0.3080078065395355, "rewards/weighted_rejected": -0.4744934141635895, "step": 360 }, { "epoch": 0.19366657942946872, "grad_norm": 1347.0242919921875, "learning_rate": 9.740675107979355e-07, "logits/chosen": -0.5699676275253296, "logits/rejected": -0.5878998041152954, "logps/chosen": -342.36248779296875, "logps/rejected": -311.7203063964844, "logps/weighted_chosen": -2.303784132003784, "logps/weighted_rejected": -3.164013624191284, "loss": 0.6624, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -27.253124237060547, "rewards/margins": 9.823046684265137, "rewards/rejected": -37.07597732543945, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -0.24745483696460724, "rewards/weighted_margins": 0.27247315645217896, "rewards/weighted_rejected": -0.5200088620185852, "step": 370 }, { "epoch": 0.19890081130594087, "grad_norm": 1152.541748046875, "learning_rate": 9.71083876413323e-07, "logits/chosen": -0.5426742434501648, "logits/rejected": -0.5510879755020142, "logps/chosen": -331.25390625, "logps/rejected": -310.33123779296875, "logps/weighted_chosen": -2.4026856422424316, "logps/weighted_rejected": -3.172229051589966, "loss": 0.6676, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -28.869922637939453, "rewards/margins": 11.865234375, "rewards/rejected": -40.73320388793945, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": -0.24112549424171448, "rewards/weighted_margins": 0.2662292420864105, "rewards/weighted_rejected": -0.5070465207099915, "step": 380 }, { "epoch": 0.204135043182413, "grad_norm": 69.06751251220703, "learning_rate": 9.67942903840751e-07, "logits/chosen": -0.5844222903251648, "logits/rejected": -0.6357940435409546, "logps/chosen": -331.53436279296875, "logps/rejected": -316.50469970703125, "logps/weighted_chosen": -2.523181200027466, "logps/weighted_rejected": -3.170581102371216, "loss": 0.6281, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -27.312108993530273, "rewards/margins": 14.989062309265137, "rewards/rejected": -42.309181213378906, "rewards/weighted_accuracies": 0.675000011920929, "rewards/weighted_chosen": -0.17550353705883026, "rewards/weighted_margins": 0.357177734375, "rewards/weighted_rejected": -0.532818615436554, "step": 390 }, { "epoch": 0.2093692750588851, "grad_norm": 64.58200073242188, "learning_rate": 9.646456421395447e-07, "logits/chosen": -0.573455810546875, "logits/rejected": -0.5838836431503296, "logps/chosen": -348.1484375, "logps/rejected": -349.8687438964844, "logps/weighted_chosen": -2.479809522628784, "logps/weighted_rejected": -3.226489305496216, "loss": 0.6551, "rewards/accuracies": 0.65625, "rewards/chosen": -29.957422256469727, "rewards/margins": 16.478321075439453, "rewards/rejected": -46.43476486206055, "rewards/weighted_accuracies": 0.640625, "rewards/weighted_chosen": -0.203724667429924, "rewards/weighted_margins": 0.254974365234375, "rewards/weighted_rejected": -0.4586502015590668, "step": 400 }, { "epoch": 0.21460350693535724, "grad_norm": 19.509164810180664, "learning_rate": 9.611931925683266e-07, "logits/chosen": -0.5067977905273438, "logits/rejected": -0.5407257080078125, "logps/chosen": -337.97967529296875, "logps/rejected": -311.12811279296875, "logps/weighted_chosen": -2.369091749191284, "logps/weighted_rejected": -2.994799852371216, "loss": 0.5919, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -31.741016387939453, "rewards/margins": 15.541796684265137, "rewards/rejected": -47.293357849121094, "rewards/weighted_accuracies": 0.6937500238418579, "rewards/weighted_chosen": -0.17744140326976776, "rewards/weighted_margins": 0.386505126953125, "rewards/weighted_rejected": -0.563507080078125, "step": 410 }, { "epoch": 0.21983773881182936, "grad_norm": 29.50520133972168, "learning_rate": 9.575867082172085e-07, "logits/chosen": -0.5286194086074829, "logits/rejected": -0.5688308477401733, "logps/chosen": -344.96563720703125, "logps/rejected": -328.64532470703125, "logps/weighted_chosen": -2.8730225563049316, "logps/weighted_rejected": -2.930432081222534, "loss": 0.6442, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -40.091796875, "rewards/margins": 18.596485137939453, "rewards/rejected": -58.685935974121094, "rewards/weighted_accuracies": 0.628125011920929, "rewards/weighted_chosen": -0.38405150175094604, "rewards/weighted_margins": 0.39207762479782104, "rewards/weighted_rejected": -0.7763046026229858, "step": 420 }, { "epoch": 0.22507197068830148, "grad_norm": 15.081392288208008, "learning_rate": 9.538273936226673e-07, "logits/chosen": -0.5923309326171875, "logits/rejected": -0.625653088092804, "logps/chosen": -301.640625, "logps/rejected": -313.77032470703125, "logps/weighted_chosen": -3.0785889625549316, "logps/weighted_rejected": -3.5872559547424316, "loss": 0.668, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -36.12675857543945, "rewards/margins": 13.111328125, "rewards/rejected": -49.236328125, "rewards/weighted_accuracies": 0.6312500238418579, "rewards/weighted_chosen": -0.28452759981155396, "rewards/weighted_margins": 0.3635620176792145, "rewards/weighted_rejected": -0.64788818359375, "step": 430 }, { "epoch": 0.23030620256477363, "grad_norm": 23.0412540435791, "learning_rate": 9.499165043652391e-07, "logits/chosen": -0.5760772824287415, "logits/rejected": -0.5774017572402954, "logps/chosen": -328.1015625, "logps/rejected": -318.84454345703125, "logps/weighted_chosen": -3.0999755859375, "logps/weighted_rejected": -3.371142625808716, "loss": 0.6452, "rewards/accuracies": 0.640625, "rewards/chosen": -36.97431564331055, "rewards/margins": 15.057812690734863, "rewards/rejected": -52.0390625, "rewards/weighted_accuracies": 0.628125011920929, "rewards/weighted_chosen": -0.39061737060546875, "rewards/weighted_margins": 0.3595825135707855, "rewards/weighted_rejected": -0.7500244379043579, "step": 440 }, { "epoch": 0.23554043444124576, "grad_norm": 22.85993003845215, "learning_rate": 9.458553466501665e-07, "logits/chosen": -0.6301513910293579, "logits/rejected": -0.6610687375068665, "logps/chosen": -324.77032470703125, "logps/rejected": -298.57501220703125, "logps/weighted_chosen": -3.1078124046325684, "logps/weighted_rejected": -3.38037109375, "loss": 0.6501, "rewards/accuracies": 0.640625, "rewards/chosen": -38.617774963378906, "rewards/margins": 16.141407012939453, "rewards/rejected": -54.763671875, "rewards/weighted_accuracies": 0.653124988079071, "rewards/weighted_chosen": -0.4693542420864105, "rewards/weighted_margins": 0.4438629150390625, "rewards/weighted_rejected": -0.9127838015556335, "step": 450 }, { "epoch": 0.24077466631771788, "grad_norm": 11.338366508483887, "learning_rate": 9.416452768711366e-07, "logits/chosen": -0.6437957882881165, "logits/rejected": -0.674633800983429, "logps/chosen": -333.87188720703125, "logps/rejected": -318.10467529296875, "logps/weighted_chosen": -2.8603515625, "logps/weighted_rejected": -3.472119092941284, "loss": 0.6881, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -41.862693786621094, "rewards/margins": 16.437694549560547, "rewards/rejected": -58.305076599121094, "rewards/weighted_accuracies": 0.653124988079071, "rewards/weighted_chosen": -0.43403321504592896, "rewards/weighted_margins": 0.4345336854457855, "rewards/weighted_rejected": -0.8688415288925171, "step": 460 }, { "epoch": 0.24600889819419, "grad_norm": 35.14228057861328, "learning_rate": 9.372877011572557e-07, "logits/chosen": -0.5738860964775085, "logits/rejected": -0.5953949093818665, "logps/chosen": -350.23046875, "logps/rejected": -329.484375, "logps/weighted_chosen": -2.7630372047424316, "logps/weighted_rejected": -3.2092041969299316, "loss": 0.6424, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -39.640037536621094, "rewards/margins": 15.155468940734863, "rewards/rejected": -54.796485900878906, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -0.338470458984375, "rewards/weighted_margins": 0.4430099427700043, "rewards/weighted_rejected": -0.781768798828125, "step": 470 }, { "epoch": 0.2512431300706621, "grad_norm": 55.78861618041992, "learning_rate": 9.327840749034141e-07, "logits/chosen": -0.6301849484443665, "logits/rejected": -0.6710245013237, "logps/chosen": -324.8578186035156, "logps/rejected": -336.734375, "logps/weighted_chosen": -2.5625243186950684, "logps/weighted_rejected": -3.774975538253784, "loss": 0.636, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -37.658203125, "rewards/margins": 22.687694549560547, "rewards/rejected": -60.357421875, "rewards/weighted_accuracies": 0.6812499761581421, "rewards/weighted_chosen": -0.2594970762729645, "rewards/weighted_margins": 0.48378294706344604, "rewards/weighted_rejected": -0.7435302734375, "step": 480 }, { "epoch": 0.2564773619471343, "grad_norm": 38.52030563354492, "learning_rate": 9.281359022841965e-07, "logits/chosen": -0.5341835021972656, "logits/rejected": -0.550585925579071, "logps/chosen": -320.76171875, "logps/rejected": -314.70391845703125, "logps/weighted_chosen": -2.91943359375, "logps/weighted_rejected": -4.169653415679932, "loss": 0.6375, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -45.831443786621094, "rewards/margins": 24.026952743530273, "rewards/rejected": -69.88359069824219, "rewards/weighted_accuracies": 0.668749988079071, "rewards/weighted_chosen": -0.516644299030304, "rewards/weighted_margins": 0.518634021282196, "rewards/weighted_rejected": -1.0354125499725342, "step": 490 }, { "epoch": 0.26171159382360637, "grad_norm": 15.318764686584473, "learning_rate": 9.233447357514989e-07, "logits/chosen": -0.5592590570449829, "logits/rejected": -0.6022475957870483, "logps/chosen": -347.77423095703125, "logps/rejected": -342.21875, "logps/weighted_chosen": -3.2605223655700684, "logps/weighted_rejected": -3.667773485183716, "loss": 0.6494, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -49.72968673706055, "rewards/margins": 22.665430068969727, "rewards/rejected": -72.4097671508789, "rewards/weighted_accuracies": 0.65625, "rewards/weighted_chosen": -0.4633804261684418, "rewards/weighted_margins": 0.527294933795929, "rewards/weighted_rejected": -0.990936279296875, "step": 500 }, { "epoch": 0.26171159382360637, "eval_logits/chosen": -0.6649160385131836, "eval_logits/rejected": -0.682785153388977, "eval_logps/chosen": -345.551513671875, "eval_logps/rejected": -345.9989929199219, "eval_logps/weighted_chosen": -2.992426872253418, "eval_logps/weighted_rejected": -3.637266159057617, "eval_loss": 0.6663314700126648, "eval_rewards/accuracies": 0.6305000185966492, "eval_rewards/chosen": -56.88743591308594, "eval_rewards/margins": 21.448062896728516, "eval_rewards/rejected": -78.32637786865234, "eval_rewards/weighted_accuracies": 0.6480000019073486, "eval_rewards/weighted_chosen": -0.5315031409263611, "eval_rewards/weighted_margins": 0.43671876192092896, "eval_rewards/weighted_rejected": -0.9682218432426453, "eval_runtime": 1455.1048, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.344, "step": 500 } ], "logging_steps": 10, "max_steps": 1911, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }