{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8037508372404555, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005358338914936369, "grad_norm": 31.966846466064453, "learning_rate": 0.0, "logits/chosen": -2.52783203125, "logits/rejected": -2.498046875, "logps/chosen": -277.71875, "logps/rejected": -223.125, "logps/weighted_chosen": -0.533447265625, "logps/weighted_rejected": -0.5252685546875, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "rewards/weighted_accuracies": 0.0, "rewards/weighted_chosen": 0.0, "rewards/weighted_margins": 0.0, "rewards/weighted_rejected": 0.0, "step": 1 }, { "epoch": 0.0053583389149363695, "grad_norm": 31.4305419921875, "learning_rate": 4.81283422459893e-08, "logits/chosen": -2.42626953125, "logits/rejected": -2.3970539569854736, "logps/chosen": -195.6822967529297, "logps/rejected": -195.890625, "logps/weighted_chosen": -0.5433688759803772, "logps/weighted_rejected": -0.5479600429534912, "loss": 0.6923, "rewards/accuracies": 0.2951388955116272, "rewards/chosen": -0.0735677108168602, "rewards/margins": 0.00434027798473835, "rewards/rejected": -0.0779079869389534, "rewards/weighted_accuracies": 0.34375, "rewards/weighted_chosen": 0.00025431314134038985, "rewards/weighted_margins": 0.0005137125845067203, "rewards/weighted_rejected": -0.0002593994140625, "step": 10 }, { "epoch": 0.010716677829872739, "grad_norm": 17.30971908569336, "learning_rate": 1.0160427807486631e-07, "logits/chosen": -2.400830030441284, "logits/rejected": -2.399658203125, "logps/chosen": -199.91366577148438, "logps/rejected": -202.21328735351562, "logps/weighted_chosen": -0.537811279296875, "logps/weighted_rejected": -0.548992931842804, "loss": 0.6926, "rewards/accuracies": 0.359375, "rewards/chosen": -0.04570312425494194, "rewards/margins": 0.02001953125, "rewards/rejected": -0.06572265923023224, "rewards/weighted_accuracies": 0.3656249940395355, "rewards/weighted_chosen": -0.00011291504051769152, "rewards/weighted_margins": 0.00011787414405262098, "rewards/weighted_rejected": -0.00023076534853316844, "step": 20 }, { "epoch": 0.016075016744809108, "grad_norm": 30.744884490966797, "learning_rate": 1.5508021390374333e-07, "logits/chosen": -2.3249268531799316, "logits/rejected": -2.299609422683716, "logps/chosen": -198.44375610351562, "logps/rejected": -212.16641235351562, "logps/weighted_chosen": -0.555804431438446, "logps/weighted_rejected": -0.547528088092804, "loss": 0.6927, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0771484375, "rewards/margins": 0.18007811903953552, "rewards/rejected": -0.10292968899011612, "rewards/weighted_accuracies": 0.3656249940395355, "rewards/weighted_chosen": 0.0004203796270303428, "rewards/weighted_margins": -0.0001464843808207661, "rewards/weighted_rejected": 0.0005667686345987022, "step": 30 }, { "epoch": 0.021433355659745478, "grad_norm": 51.52167510986328, "learning_rate": 2.085561497326203e-07, "logits/chosen": -2.3595948219299316, "logits/rejected": -2.375, "logps/chosen": -192.6171875, "logps/rejected": -207.1945343017578, "logps/weighted_chosen": -0.544903576374054, "logps/weighted_rejected": -0.588610827922821, "loss": 0.6926, "rewards/accuracies": 0.31562501192092896, "rewards/chosen": -0.08066406100988388, "rewards/margins": -0.11210937798023224, "rewards/rejected": 0.03144531324505806, "rewards/weighted_accuracies": 0.40937501192092896, "rewards/weighted_chosen": 0.0021041869185864925, "rewards/weighted_margins": 0.0001922607480082661, "rewards/weighted_rejected": 0.0019119263160973787, "step": 40 }, { "epoch": 0.02679169457468185, "grad_norm": 29.50680160522461, "learning_rate": 2.620320855614973e-07, "logits/chosen": -2.3987059593200684, "logits/rejected": -2.3811278343200684, "logps/chosen": -213.0945281982422, "logps/rejected": -228.8328094482422, "logps/weighted_chosen": -0.5421508550643921, "logps/weighted_rejected": -0.557452380657196, "loss": 0.6925, "rewards/accuracies": 0.390625, "rewards/chosen": 0.06367187201976776, "rewards/margins": 0.06484375149011612, "rewards/rejected": -0.0011718750465661287, "rewards/weighted_accuracies": 0.45625001192092896, "rewards/weighted_chosen": 0.0056938170455396175, "rewards/weighted_margins": 0.000339508056640625, "rewards/weighted_rejected": 0.005351257510483265, "step": 50 }, { "epoch": 0.032150033489618215, "grad_norm": 30.643386840820312, "learning_rate": 3.155080213903743e-07, "logits/chosen": -2.347216844558716, "logits/rejected": -2.3227295875549316, "logps/chosen": -211.7421875, "logps/rejected": -218.0234375, "logps/weighted_chosen": -0.536822497844696, "logps/weighted_rejected": -0.568676769733429, "loss": 0.6923, "rewards/accuracies": 0.421875, "rewards/chosen": 0.09257812798023224, "rewards/margins": 0.10820312798023224, "rewards/rejected": -0.015625, "rewards/weighted_accuracies": 0.45625001192092896, "rewards/weighted_chosen": 0.009661102667450905, "rewards/weighted_margins": 0.000804901123046875, "rewards/weighted_rejected": 0.00885620154440403, "step": 60 }, { "epoch": 0.03750837240455459, "grad_norm": 27.744924545288086, "learning_rate": 3.689839572192513e-07, "logits/chosen": -2.442822217941284, "logits/rejected": -2.4500975608825684, "logps/chosen": -194.47891235351562, "logps/rejected": -200.30624389648438, "logps/weighted_chosen": -0.5030456781387329, "logps/weighted_rejected": -0.5135132074356079, "loss": 0.6906, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 0.03203124925494194, "rewards/margins": 0.20937499403953552, "rewards/rejected": -0.17734375596046448, "rewards/weighted_accuracies": 0.5218750238418579, "rewards/weighted_chosen": 0.01783904992043972, "rewards/weighted_margins": 0.0045639039017260075, "rewards/weighted_rejected": 0.013275146484375, "step": 70 }, { "epoch": 0.042866711319490956, "grad_norm": 38.061683654785156, "learning_rate": 4.2245989304812833e-07, "logits/chosen": -2.403271436691284, "logits/rejected": -2.430908203125, "logps/chosen": -219.08438110351562, "logps/rejected": -239.91561889648438, "logps/weighted_chosen": -0.529132068157196, "logps/weighted_rejected": -0.5560547113418579, "loss": 0.6903, "rewards/accuracies": 0.43437498807907104, "rewards/chosen": 0.29023438692092896, "rewards/margins": 0.27910155057907104, "rewards/rejected": 0.01113281212747097, "rewards/weighted_accuracies": 0.518750011920929, "rewards/weighted_chosen": 0.022306060418486595, "rewards/weighted_margins": 0.005640792660415173, "rewards/weighted_rejected": 0.016665268689393997, "step": 80 }, { "epoch": 0.04822505023442733, "grad_norm": 28.250593185424805, "learning_rate": 4.7593582887700533e-07, "logits/chosen": -2.392260789871216, "logits/rejected": -2.4065184593200684, "logps/chosen": -195.1953125, "logps/rejected": -207.7804718017578, "logps/weighted_chosen": -0.51409912109375, "logps/weighted_rejected": -0.526226818561554, "loss": 0.6906, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.10097656399011612, "rewards/margins": 0.724414050579071, "rewards/rejected": -0.6234375238418579, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": 0.025403594598174095, "rewards/weighted_margins": 0.006229400634765625, "rewards/weighted_rejected": 0.01916809007525444, "step": 90 }, { "epoch": 0.0535833891493637, "grad_norm": 18.92244529724121, "learning_rate": 5.294117647058823e-07, "logits/chosen": -2.3767333030700684, "logits/rejected": -2.3611817359924316, "logps/chosen": -224.2859344482422, "logps/rejected": -229.6570281982422, "logps/weighted_chosen": -0.534497082233429, "logps/weighted_rejected": -0.5674392580986023, "loss": 0.6903, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.8511718511581421, "rewards/margins": 1.123437523841858, "rewards/rejected": -1.974609375, "rewards/weighted_accuracies": 0.53125, "rewards/weighted_chosen": 0.023969268426299095, "rewards/weighted_margins": 0.00682754535228014, "rewards/weighted_rejected": 0.01714172400534153, "step": 100 }, { "epoch": 0.058941728064300064, "grad_norm": 50.693965911865234, "learning_rate": 5.828877005347593e-07, "logits/chosen": -2.3811888694763184, "logits/rejected": -2.382458448410034, "logps/chosen": -217.30703735351562, "logps/rejected": -225.4343719482422, "logps/weighted_chosen": -0.54705810546875, "logps/weighted_rejected": -0.5594238042831421, "loss": 0.6951, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": -2.3193359375, "rewards/margins": 0.624218761920929, "rewards/rejected": -2.942578077316284, "rewards/weighted_accuracies": 0.4781250059604645, "rewards/weighted_chosen": 0.022613525390625, "rewards/weighted_margins": -0.0009094238048419356, "rewards/weighted_rejected": 0.02351989783346653, "step": 110 }, { "epoch": 0.06430006697923643, "grad_norm": 27.526296615600586, "learning_rate": 6.363636363636363e-07, "logits/chosen": -2.459423780441284, "logits/rejected": -2.4720702171325684, "logps/chosen": -187.80313110351562, "logps/rejected": -226.0906219482422, "logps/weighted_chosen": -0.533001720905304, "logps/weighted_rejected": -0.605639636516571, "loss": 0.6903, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -3.5208983421325684, "rewards/margins": 1.0193359851837158, "rewards/rejected": -4.540820121765137, "rewards/weighted_accuracies": 0.5062500238418579, "rewards/weighted_chosen": 0.005961227230727673, "rewards/weighted_margins": 0.009808349423110485, "rewards/weighted_rejected": -0.0038513182662427425, "step": 120 }, { "epoch": 0.06965840589417281, "grad_norm": 26.822662353515625, "learning_rate": 6.898395721925134e-07, "logits/chosen": -2.565673828125, "logits/rejected": -2.564648389816284, "logps/chosen": -231.10000610351562, "logps/rejected": -240.86093139648438, "logps/weighted_chosen": -0.537158191204071, "logps/weighted_rejected": -0.5712524652481079, "loss": 0.6953, "rewards/accuracies": 0.53125, "rewards/chosen": -2.906054735183716, "rewards/margins": 1.6062500476837158, "rewards/rejected": -4.512109279632568, "rewards/weighted_accuracies": 0.49687498807907104, "rewards/weighted_chosen": 0.00572967529296875, "rewards/weighted_margins": -0.00041961669921875, "rewards/weighted_rejected": 0.0061210631392896175, "step": 130 }, { "epoch": 0.07501674480910918, "grad_norm": 19.93072509765625, "learning_rate": 7.433155080213903e-07, "logits/chosen": -2.53515625, "logits/rejected": -2.5311522483825684, "logps/chosen": -198.80313110351562, "logps/rejected": -211.29061889648438, "logps/weighted_chosen": -0.5693206787109375, "logps/weighted_rejected": -0.592010498046875, "loss": 0.694, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.0048828125, "rewards/margins": 2.3238282203674316, "rewards/rejected": -6.327929496765137, "rewards/weighted_accuracies": 0.518750011920929, "rewards/weighted_chosen": -0.017972564324736595, "rewards/weighted_margins": 0.005100250244140625, "rewards/weighted_rejected": -0.02306365966796875, "step": 140 }, { "epoch": 0.08037508372404555, "grad_norm": 26.39690399169922, "learning_rate": 7.967914438502673e-07, "logits/chosen": -2.509326219558716, "logits/rejected": -2.508056640625, "logps/chosen": -214.25, "logps/rejected": -237.84530639648438, "logps/weighted_chosen": -0.5889526605606079, "logps/weighted_rejected": -0.585980236530304, "loss": 0.7007, "rewards/accuracies": 0.546875, "rewards/chosen": -3.867382764816284, "rewards/margins": 2.546581983566284, "rewards/rejected": -6.4140625, "rewards/weighted_accuracies": 0.550000011920929, "rewards/weighted_chosen": -0.041257478296756744, "rewards/weighted_margins": -0.005215453915297985, "rewards/weighted_rejected": -0.03600044175982475, "step": 150 }, { "epoch": 0.08573342263898191, "grad_norm": 25.74460792541504, "learning_rate": 8.502673796791443e-07, "logits/chosen": -2.433154344558716, "logits/rejected": -2.414794921875, "logps/chosen": -200.64999389648438, "logps/rejected": -229.2624969482422, "logps/weighted_chosen": -0.589752197265625, "logps/weighted_rejected": -0.612139880657196, "loss": 0.6933, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": -6.800390720367432, "rewards/margins": 0.5577148199081421, "rewards/rejected": -7.36279296875, "rewards/weighted_accuracies": 0.5218750238418579, "rewards/weighted_chosen": -0.05205688625574112, "rewards/weighted_margins": 0.006626891903579235, "rewards/weighted_rejected": -0.05869255214929581, "step": 160 }, { "epoch": 0.09109176155391828, "grad_norm": 24.530479431152344, "learning_rate": 9.037433155080213e-07, "logits/chosen": -2.5284180641174316, "logits/rejected": -2.5411133766174316, "logps/chosen": -204.92031860351562, "logps/rejected": -217.046875, "logps/weighted_chosen": -0.600720226764679, "logps/weighted_rejected": -0.6278625726699829, "loss": 0.6787, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -7.408789157867432, "rewards/margins": 4.765625, "rewards/rejected": -12.178515434265137, "rewards/weighted_accuracies": 0.612500011920929, "rewards/weighted_chosen": -0.05335540696978569, "rewards/weighted_margins": 0.03420104831457138, "rewards/weighted_rejected": -0.08757324516773224, "step": 170 }, { "epoch": 0.09645010046885466, "grad_norm": 17.329570770263672, "learning_rate": 9.572192513368984e-07, "logits/chosen": -2.638134717941284, "logits/rejected": -2.6302247047424316, "logps/chosen": -219.87655639648438, "logps/rejected": -240.1531219482422, "logps/weighted_chosen": -0.637438952922821, "logps/weighted_rejected": -0.700207531452179, "loss": 0.682, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -10.272656440734863, "rewards/margins": 4.742578029632568, "rewards/rejected": -15.017578125, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -0.09931640326976776, "rewards/weighted_margins": 0.0342864990234375, "rewards/weighted_rejected": -0.13359375298023224, "step": 180 }, { "epoch": 0.10180843938379103, "grad_norm": 40.06588363647461, "learning_rate": 9.999965031204306e-07, "logits/chosen": -2.677978515625, "logits/rejected": -2.667724609375, "logps/chosen": -217.265625, "logps/rejected": -217.9968719482422, "logps/weighted_chosen": -0.713732898235321, "logps/weighted_rejected": -0.745983898639679, "loss": 0.6972, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -16.422557830810547, "rewards/margins": 3.5362305641174316, "rewards/rejected": -19.955078125, "rewards/weighted_accuracies": 0.578125, "rewards/weighted_chosen": -0.1743011474609375, "rewards/weighted_margins": 0.01756439171731472, "rewards/weighted_rejected": -0.19183501601219177, "step": 190 }, { "epoch": 0.1071667782987274, "grad_norm": 26.449254989624023, "learning_rate": 9.998741174712533e-07, "logits/chosen": -2.758007764816284, "logits/rejected": -2.752197265625, "logps/chosen": -251.4640655517578, "logps/rejected": -266.7734375, "logps/weighted_chosen": -0.721697986125946, "logps/weighted_rejected": -0.7427002191543579, "loss": 0.6956, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -23.8759765625, "rewards/margins": 3.2962889671325684, "rewards/rejected": -27.181835174560547, "rewards/weighted_accuracies": 0.5, "rewards/weighted_chosen": -0.14485931396484375, "rewards/weighted_margins": 0.00913848914206028, "rewards/weighted_rejected": -0.153900146484375, "step": 200 }, { "epoch": 0.11252511721366376, "grad_norm": 25.405826568603516, "learning_rate": 9.995769367531952e-07, "logits/chosen": -2.8843750953674316, "logits/rejected": -2.8621582984924316, "logps/chosen": -249.39999389648438, "logps/rejected": -249.5625, "logps/weighted_chosen": -0.6254638433456421, "logps/weighted_rejected": -0.677978515625, "loss": 0.6845, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -31.167186737060547, "rewards/margins": 2.1792969703674316, "rewards/rejected": -33.351173400878906, "rewards/weighted_accuracies": 0.528124988079071, "rewards/weighted_chosen": -0.11136627197265625, "rewards/weighted_margins": 0.02726135216653347, "rewards/weighted_rejected": -0.1386566162109375, "step": 210 }, { "epoch": 0.11788345612860013, "grad_norm": 22.92214012145996, "learning_rate": 9.991050648838675e-07, "logits/chosen": -3.027539014816284, "logits/rejected": -3.0234375, "logps/chosen": -249.0265655517578, "logps/rejected": -264.1703186035156, "logps/weighted_chosen": -0.72125244140625, "logps/weighted_rejected": -0.7577880620956421, "loss": 0.6887, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -44.36796951293945, "rewards/margins": 2.092968702316284, "rewards/rejected": -46.47265625, "rewards/weighted_accuracies": 0.5218750238418579, "rewards/weighted_chosen": -0.15605469048023224, "rewards/weighted_margins": 0.022979736328125, "rewards/weighted_rejected": -0.17911987006664276, "step": 220 }, { "epoch": 0.12324179504353651, "grad_norm": 18.913394927978516, "learning_rate": 9.98458666866564e-07, "logits/chosen": -2.95361328125, "logits/rejected": -2.9599609375, "logps/chosen": -252.1765594482422, "logps/rejected": -273.97186279296875, "logps/weighted_chosen": -0.7530273199081421, "logps/weighted_rejected": -0.7951415777206421, "loss": 0.6805, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -55.478126525878906, "rewards/margins": 9.685155868530273, "rewards/rejected": -65.15702819824219, "rewards/weighted_accuracies": 0.546875, "rewards/weighted_chosen": -0.202239990234375, "rewards/weighted_margins": 0.04668273776769638, "rewards/weighted_rejected": -0.248951718211174, "step": 230 }, { "epoch": 0.12860013395847286, "grad_norm": 18.727752685546875, "learning_rate": 9.97637968732563e-07, "logits/chosen": -2.9483885765075684, "logits/rejected": -2.934033155441284, "logps/chosen": -264.1937561035156, "logps/rejected": -281.26873779296875, "logps/weighted_chosen": -0.7586425542831421, "logps/weighted_rejected": -0.777172863483429, "loss": 0.7011, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -60.15625, "rewards/margins": 7.514062404632568, "rewards/rejected": -67.6640625, "rewards/weighted_accuracies": 0.528124988079071, "rewards/weighted_chosen": -0.22589111328125, "rewards/weighted_margins": 0.0014251709217205644, "rewards/weighted_rejected": -0.22734984755516052, "step": 240 }, { "epoch": 0.13395847287340926, "grad_norm": 17.17424774169922, "learning_rate": 9.966432574620906e-07, "logits/chosen": -2.994140625, "logits/rejected": -2.995654344558716, "logps/chosen": -250.4296875, "logps/rejected": -282.0859375, "logps/weighted_chosen": -0.707995593547821, "logps/weighted_rejected": -0.76885986328125, "loss": 0.6897, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -55.573829650878906, "rewards/margins": 7.142187595367432, "rewards/rejected": -62.73515701293945, "rewards/weighted_accuracies": 0.528124988079071, "rewards/weighted_chosen": -0.18481139838695526, "rewards/weighted_margins": 0.01998596265912056, "rewards/weighted_rejected": -0.20475539565086365, "step": 250 }, { "epoch": 0.13931681178834562, "grad_norm": 18.692733764648438, "learning_rate": 9.954748808839674e-07, "logits/chosen": -2.887255907058716, "logits/rejected": -2.898193359375, "logps/chosen": -280.46875, "logps/rejected": -280.6640625, "logps/weighted_chosen": -0.7401489019393921, "logps/weighted_rejected": -0.776611328125, "loss": 0.6869, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -53.8828125, "rewards/margins": 5.946484565734863, "rewards/rejected": -59.81757736206055, "rewards/weighted_accuracies": 0.53125, "rewards/weighted_chosen": -0.18103942275047302, "rewards/weighted_margins": 0.02960510179400444, "rewards/weighted_rejected": -0.21054688096046448, "step": 260 }, { "epoch": 0.144675150703282, "grad_norm": 16.58523941040039, "learning_rate": 9.941332475539824e-07, "logits/chosen": -2.8628907203674316, "logits/rejected": -2.87939453125, "logps/chosen": -267.26251220703125, "logps/rejected": -306.49688720703125, "logps/weighted_chosen": -0.7835937738418579, "logps/weighted_rejected": -0.8957885503768921, "loss": 0.6712, "rewards/accuracies": 0.625, "rewards/chosen": -57.08281326293945, "rewards/margins": 12.824609756469727, "rewards/rejected": -69.8984375, "rewards/weighted_accuracies": 0.596875011920929, "rewards/weighted_chosen": -0.2109939604997635, "rewards/weighted_margins": 0.07429657131433487, "rewards/weighted_rejected": -0.2853530943393707, "step": 270 }, { "epoch": 0.15003348961821836, "grad_norm": 23.606122970581055, "learning_rate": 9.926188266120295e-07, "logits/chosen": -2.9261717796325684, "logits/rejected": -2.9302000999450684, "logps/chosen": -252.35781860351562, "logps/rejected": -270.3921813964844, "logps/weighted_chosen": -0.757946789264679, "logps/weighted_rejected": -0.8466796875, "loss": 0.6669, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -56.718360900878906, "rewards/margins": 10.275781631469727, "rewards/rejected": -66.990234375, "rewards/weighted_accuracies": 0.606249988079071, "rewards/weighted_chosen": -0.2161209136247635, "rewards/weighted_margins": 0.08074493706226349, "rewards/weighted_rejected": -0.2967941164970398, "step": 280 }, { "epoch": 0.15539182853315472, "grad_norm": 25.32463836669922, "learning_rate": 9.909321476180591e-07, "logits/chosen": -2.9480957984924316, "logits/rejected": -2.9312500953674316, "logps/chosen": -281.33123779296875, "logps/rejected": -317.3374938964844, "logps/weighted_chosen": -0.870861828327179, "logps/weighted_rejected": -0.9639892578125, "loss": 0.6868, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -78.2476577758789, "rewards/margins": 12.921093940734863, "rewards/rejected": -91.15312194824219, "rewards/weighted_accuracies": 0.534375011920929, "rewards/weighted_chosen": -0.32458800077438354, "rewards/weighted_margins": 0.04551086574792862, "rewards/weighted_rejected": -0.37019044160842896, "step": 290 }, { "epoch": 0.1607501674480911, "grad_norm": 21.94011116027832, "learning_rate": 9.890738003669027e-07, "logits/chosen": -3.082812547683716, "logits/rejected": -3.089111328125, "logps/chosen": -288.62811279296875, "logps/rejected": -311.38751220703125, "logps/weighted_chosen": -0.955078125, "logps/weighted_rejected": -1.0396239757537842, "loss": 0.687, "rewards/accuracies": 0.5625, "rewards/chosen": -93.6109390258789, "rewards/margins": 9.185546875, "rewards/rejected": -102.78202819824219, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.40625, "rewards/weighted_margins": 0.07210388034582138, "rewards/weighted_rejected": -0.478240966796875, "step": 300 }, { "epoch": 0.16610850636302746, "grad_norm": 25.38623046875, "learning_rate": 9.870444346820348e-07, "logits/chosen": -3.173828125, "logits/rejected": -3.147998094558716, "logps/chosen": -297.15936279296875, "logps/rejected": -320.50311279296875, "logps/weighted_chosen": -0.89190673828125, "logps/weighted_rejected": -0.923266589641571, "loss": 0.6987, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -83.2046890258789, "rewards/margins": 12.489062309265137, "rewards/rejected": -95.6937484741211, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -0.33665162324905396, "rewards/weighted_margins": 0.00975952111184597, "rewards/weighted_rejected": -0.34637755155563354, "step": 310 }, { "epoch": 0.17146684527796383, "grad_norm": 17.789676666259766, "learning_rate": 9.848447601883433e-07, "logits/chosen": -3.236083984375, "logits/rejected": -3.2168946266174316, "logps/chosen": -267.09844970703125, "logps/rejected": -310.3031311035156, "logps/weighted_chosen": -0.8338623046875, "logps/weighted_rejected": -0.918957531452179, "loss": 0.6805, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -70.2816390991211, "rewards/margins": 19.578907012939453, "rewards/rejected": -89.81953430175781, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": -0.2871032655239105, "rewards/weighted_margins": 0.06427612155675888, "rewards/weighted_rejected": -0.3513946533203125, "step": 320 }, { "epoch": 0.1768251841929002, "grad_norm": 24.225061416625977, "learning_rate": 9.824755460639899e-07, "logits/chosen": -3.1796875, "logits/rejected": -3.179150342941284, "logps/chosen": -265.12030029296875, "logps/rejected": -308.19842529296875, "logps/weighted_chosen": -0.833850085735321, "logps/weighted_rejected": -0.8948730230331421, "loss": 0.6836, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -73.27030944824219, "rewards/margins": 16.651952743530273, "rewards/rejected": -89.87968444824219, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.31196290254592896, "rewards/weighted_margins": 0.04750366136431694, "rewards/weighted_rejected": -0.35954588651657104, "step": 330 }, { "epoch": 0.18218352310783656, "grad_norm": 19.119462966918945, "learning_rate": 9.799376207714444e-07, "logits/chosen": -3.1602540016174316, "logits/rejected": -3.154492139816284, "logps/chosen": -277.9624938964844, "logps/rejected": -296.7093811035156, "logps/weighted_chosen": -0.902087390422821, "logps/weighted_rejected": -0.974658191204071, "loss": 0.6727, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": -77.0367202758789, "rewards/margins": 9.926562309265137, "rewards/rejected": -86.9476547241211, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.334219366312027, "rewards/weighted_margins": 0.06939239799976349, "rewards/weighted_rejected": -0.4036361575126648, "step": 340 }, { "epoch": 0.18754186202277295, "grad_norm": 22.155990600585938, "learning_rate": 9.772318717677903e-07, "logits/chosen": -3.130908250808716, "logits/rejected": -3.132128953933716, "logps/chosen": -288.41094970703125, "logps/rejected": -313.0843811035156, "logps/weighted_chosen": -0.9267333745956421, "logps/weighted_rejected": -1.040771484375, "loss": 0.6683, "rewards/accuracies": 0.625, "rewards/chosen": -88.24609375, "rewards/margins": 18.800390243530273, "rewards/rejected": -107.046875, "rewards/weighted_accuracies": 0.6156250238418579, "rewards/weighted_chosen": -0.39622801542282104, "rewards/weighted_margins": 0.08176574856042862, "rewards/weighted_rejected": -0.47796326875686646, "step": 350 }, { "epoch": 0.19290020093770932, "grad_norm": 26.421598434448242, "learning_rate": 9.743592451943998e-07, "logits/chosen": -3.229687452316284, "logits/rejected": -3.228222608566284, "logps/chosen": -327.7328186035156, "logps/rejected": -361.0015563964844, "logps/weighted_chosen": -0.990966796875, "logps/weighted_rejected": -1.057519555091858, "loss": 0.6726, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -116.12812805175781, "rewards/margins": 18.506641387939453, "rewards/rejected": -134.654296875, "rewards/weighted_accuracies": 0.543749988079071, "rewards/weighted_chosen": -0.4190429747104645, "rewards/weighted_margins": 0.08066101372241974, "rewards/weighted_rejected": -0.499612420797348, "step": 360 }, { "epoch": 0.1982585398526457, "grad_norm": 19.50035858154297, "learning_rate": 9.713207455460892e-07, "logits/chosen": -3.184375047683716, "logits/rejected": -3.209912061691284, "logps/chosen": -326.4937438964844, "logps/rejected": -343.4593811035156, "logps/weighted_chosen": -1.00506591796875, "logps/weighted_rejected": -1.092797875404358, "loss": 0.6762, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -120.6937484741211, "rewards/margins": 6.901953220367432, "rewards/rejected": -127.60469055175781, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.4098266661167145, "rewards/weighted_margins": 0.08511047065258026, "rewards/weighted_rejected": -0.495004266500473, "step": 370 }, { "epoch": 0.20361687876758205, "grad_norm": 17.131114959716797, "learning_rate": 9.681174353198686e-07, "logits/chosen": -3.2510743141174316, "logits/rejected": -3.23828125, "logps/chosen": -330.140625, "logps/rejected": -368.10467529296875, "logps/weighted_chosen": -0.9594482183456421, "logps/weighted_rejected": -1.056616187095642, "loss": 0.6719, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -119.54219055175781, "rewards/margins": 20.350391387939453, "rewards/rejected": -139.82461547851562, "rewards/weighted_accuracies": 0.574999988079071, "rewards/weighted_chosen": -0.415090948343277, "rewards/weighted_margins": 0.086700439453125, "rewards/weighted_rejected": -0.501788318157196, "step": 380 }, { "epoch": 0.20897521768251842, "grad_norm": 38.318756103515625, "learning_rate": 9.647504346434103e-07, "logits/chosen": -3.445019483566284, "logits/rejected": -3.4564452171325684, "logps/chosen": -371.9156188964844, "logps/rejected": -389.43438720703125, "logps/weighted_chosen": -1.062597632408142, "logps/weighted_rejected": -1.165747046470642, "loss": 0.6656, "rewards/accuracies": 0.578125, "rewards/chosen": -150.7859344482422, "rewards/margins": 19.3828125, "rewards/rejected": -170.19686889648438, "rewards/weighted_accuracies": 0.578125, "rewards/weighted_chosen": -0.49369508028030396, "rewards/weighted_margins": 0.09916381537914276, "rewards/weighted_rejected": -0.5928589105606079, "step": 390 }, { "epoch": 0.2143335565974548, "grad_norm": 37.75257873535156, "learning_rate": 9.612209208833646e-07, "logits/chosen": -3.4891600608825684, "logits/rejected": -3.4913086891174316, "logps/chosen": -373.2046813964844, "logps/rejected": -387.3968811035156, "logps/weighted_chosen": -1.159387230873108, "logps/weighted_rejected": -1.2549560070037842, "loss": 0.6802, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -173.24063110351562, "rewards/margins": 12.66796875, "rewards/rejected": -185.9562530517578, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.619030773639679, "rewards/weighted_margins": 0.07501526176929474, "rewards/weighted_rejected": -0.694287121295929, "step": 400 }, { "epoch": 0.21969189551239116, "grad_norm": 62.04045486450195, "learning_rate": 9.5753012823366e-07, "logits/chosen": -3.5625, "logits/rejected": -3.555224657058716, "logps/chosen": -392.5843811035156, "logps/rejected": -429.76873779296875, "logps/weighted_chosen": -1.109643578529358, "logps/weighted_rejected": -1.2004883289337158, "loss": 0.6903, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -180.3562469482422, "rewards/margins": 26.413280487060547, "rewards/rejected": -206.75936889648438, "rewards/weighted_accuracies": 0.5375000238418579, "rewards/weighted_chosen": -0.5794036984443665, "rewards/weighted_margins": 0.08088378608226776, "rewards/weighted_rejected": -0.660137951374054, "step": 410 }, { "epoch": 0.22505023442732752, "grad_norm": 29.432497024536133, "learning_rate": 9.536793472839324e-07, "logits/chosen": -3.7582030296325684, "logits/rejected": -3.77783203125, "logps/chosen": -413.109375, "logps/rejected": -459.015625, "logps/weighted_chosen": -1.0903198719024658, "logps/weighted_rejected": -1.1765625476837158, "loss": 0.6833, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -182.9140625, "rewards/margins": 33.59453201293945, "rewards/rejected": -216.5343780517578, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": -0.528045654296875, "rewards/weighted_margins": 0.08157958835363388, "rewards/weighted_rejected": -0.609661877155304, "step": 420 }, { "epoch": 0.2304085733422639, "grad_norm": 34.61969757080078, "learning_rate": 9.496699245682351e-07, "logits/chosen": -3.876757860183716, "logits/rejected": -3.873339891433716, "logps/chosen": -411.23748779296875, "logps/rejected": -474.3687438964844, "logps/weighted_chosen": -1.104589819908142, "logps/weighted_rejected": -1.18817138671875, "loss": 0.6878, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -190.3468780517578, "rewards/margins": 43.072265625, "rewards/rejected": -233.43124389648438, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.543225109577179, "rewards/weighted_margins": 0.05898132175207138, "rewards/weighted_rejected": -0.6022491455078125, "step": 430 }, { "epoch": 0.23576691225720026, "grad_norm": 25.689876556396484, "learning_rate": 9.455032620941839e-07, "logits/chosen": -3.893261671066284, "logits/rejected": -3.866406202316284, "logps/chosen": -390.3531188964844, "logps/rejected": -474.80780029296875, "logps/weighted_chosen": -1.0908081531524658, "logps/weighted_rejected": -1.19012451171875, "loss": 0.6673, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -196.52188110351562, "rewards/margins": 52.310546875, "rewards/rejected": -248.7218780517578, "rewards/weighted_accuracies": 0.5718749761581421, "rewards/weighted_chosen": -0.555419921875, "rewards/weighted_margins": 0.09650878608226776, "rewards/weighted_rejected": -0.651898205280304, "step": 440 }, { "epoch": 0.24112525117213665, "grad_norm": 54.604705810546875, "learning_rate": 9.411808168527066e-07, "logits/chosen": -3.88916015625, "logits/rejected": -3.8941407203674316, "logps/chosen": -411.3031311035156, "logps/rejected": -472.203125, "logps/weighted_chosen": -1.082067847251892, "logps/weighted_rejected": -1.1839110851287842, "loss": 0.6816, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -205.703125, "rewards/margins": 46.81132888793945, "rewards/rejected": -252.5343780517578, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": -0.554046630859375, "rewards/weighted_margins": 0.07362671196460724, "rewards/weighted_rejected": -0.6277831792831421, "step": 450 }, { "epoch": 0.24648359008707302, "grad_norm": 26.44339942932129, "learning_rate": 9.367041003085648e-07, "logits/chosen": -3.8785157203674316, "logits/rejected": -3.866894483566284, "logps/chosen": -523.2593994140625, "logps/rejected": -523.2750244140625, "logps/weighted_chosen": -1.2444579601287842, "logps/weighted_rejected": -1.3485107421875, "loss": 0.6775, "rewards/accuracies": 0.5625, "rewards/chosen": -304.52813720703125, "rewards/margins": 4.764062404632568, "rewards/rejected": -309.3812561035156, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": -0.6716858148574829, "rewards/weighted_margins": 0.08544921875, "rewards/weighted_rejected": -0.757153332233429, "step": 460 }, { "epoch": 0.25184192900200936, "grad_norm": 28.781457901000977, "learning_rate": 9.320746778718274e-07, "logits/chosen": -4.042578220367432, "logits/rejected": -4.053515434265137, "logps/chosen": -444.046875, "logps/rejected": -514.0531005859375, "logps/weighted_chosen": -1.281274437904358, "logps/weighted_rejected": -1.4283936023712158, "loss": 0.6526, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -256.4828186035156, "rewards/margins": 53.646095275878906, "rewards/rejected": -310.15313720703125, "rewards/weighted_accuracies": 0.596875011920929, "rewards/weighted_chosen": -0.714306652545929, "rewards/weighted_margins": 0.13612060248851776, "rewards/weighted_rejected": -0.8504638671875, "step": 470 }, { "epoch": 0.2572002679169457, "grad_norm": 58.31265640258789, "learning_rate": 9.272941683504808e-07, "logits/chosen": -4.060449123382568, "logits/rejected": -4.083691596984863, "logps/chosen": -564.7312622070312, "logps/rejected": -582.9124755859375, "logps/weighted_chosen": -1.305688500404358, "logps/weighted_rejected": -1.470678687095642, "loss": 0.6627, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -346.97344970703125, "rewards/margins": 15.486719131469727, "rewards/rejected": -362.28594970703125, "rewards/weighted_accuracies": 0.596875011920929, "rewards/weighted_chosen": -0.7694152593612671, "rewards/weighted_margins": 0.14488831162452698, "rewards/weighted_rejected": -0.9143127202987671, "step": 480 }, { "epoch": 0.2625586068318821, "grad_norm": 28.284282684326172, "learning_rate": 9.223642433843679e-07, "logits/chosen": -3.8697266578674316, "logits/rejected": -3.857128858566284, "logps/chosen": -500.0718688964844, "logps/rejected": -534.6281127929688, "logps/weighted_chosen": -1.1939697265625, "logps/weighted_rejected": -1.323388695716858, "loss": 0.6567, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -298.09063720703125, "rewards/margins": 29.775781631469727, "rewards/rejected": -328.0640563964844, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": -0.631103515625, "rewards/weighted_margins": 0.13239136338233948, "rewards/weighted_rejected": -0.763378918170929, "step": 490 }, { "epoch": 0.2679169457468185, "grad_norm": 32.619327545166016, "learning_rate": 9.172866268606513e-07, "logits/chosen": -3.828320264816284, "logits/rejected": -3.82373046875, "logps/chosen": -527.2687377929688, "logps/rejected": -588.8343505859375, "logps/weighted_chosen": -1.291723608970642, "logps/weighted_rejected": -1.4711182117462158, "loss": 0.6559, "rewards/accuracies": 0.625, "rewards/chosen": -312.36248779296875, "rewards/margins": 48.47734451293945, "rewards/rejected": -360.8890686035156, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": -0.727709949016571, "rewards/weighted_margins": 0.145904541015625, "rewards/weighted_rejected": -0.8731445074081421, "step": 500 }, { "epoch": 0.2679169457468185, "eval_logits/chosen": -3.994471788406372, "eval_logits/rejected": -4.005757808685303, "eval_logps/chosen": -530.9920043945312, "eval_logps/rejected": -592.9425659179688, "eval_logps/weighted_chosen": -1.3731945753097534, "eval_logps/weighted_rejected": -1.5277278423309326, "eval_loss": 0.6660141348838806, "eval_rewards/accuracies": 0.5874499082565308, "eval_rewards/chosen": -328.0513916015625, "eval_rewards/margins": 48.42765426635742, "eval_rewards/rejected": -376.4005432128906, "eval_rewards/weighted_accuracies": 0.5901201367378235, "eval_rewards/weighted_chosen": -0.8177139759063721, "eval_rewards/weighted_margins": 0.13399113714694977, "eval_rewards/weighted_rejected": -0.9517050385475159, "eval_runtime": 1889.7494, "eval_samples_per_second": 1.584, "eval_steps_per_second": 0.396, "step": 500 }, { "epoch": 0.2732752846617549, "grad_norm": 36.31141662597656, "learning_rate": 9.120630943110077e-07, "logits/chosen": -4.06494140625, "logits/rejected": -4.080273628234863, "logps/chosen": -546.140625, "logps/rejected": -658.2093505859375, "logps/weighted_chosen": -1.464135766029358, "logps/weighted_rejected": -1.644873023033142, "loss": 0.6696, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -355.78594970703125, "rewards/margins": 91.36328125, "rewards/rejected": -447.06561279296875, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": -0.9257446527481079, "rewards/weighted_margins": 0.15970154106616974, "rewards/weighted_rejected": -1.085382103919983, "step": 510 }, { "epoch": 0.27863362357669125, "grad_norm": 35.842044830322266, "learning_rate": 9.066954722907638e-07, "logits/chosen": -4.306347846984863, "logits/rejected": -4.325781345367432, "logps/chosen": -809.96875, "logps/rejected": -865.5875244140625, "logps/weighted_chosen": -1.7161865234375, "logps/weighted_rejected": -1.847753882408142, "loss": 0.6806, "rewards/accuracies": 0.609375, "rewards/chosen": -589.8624877929688, "rewards/margins": 49.43046951293945, "rewards/rejected": -639.21875, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -1.1625487804412842, "rewards/weighted_margins": 0.126800537109375, "rewards/weighted_rejected": -1.289453148841858, "step": 520 }, { "epoch": 0.2839919624916276, "grad_norm": 35.471004486083984, "learning_rate": 9.01185637740189e-07, "logits/chosen": -4.103613376617432, "logits/rejected": -4.137109279632568, "logps/chosen": -668.6375122070312, "logps/rejected": -714.5843505859375, "logps/weighted_chosen": -1.6459472179412842, "logps/weighted_rejected": -1.787329077720642, "loss": 0.6869, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -462.6499938964844, "rewards/margins": 39.26484298706055, "rewards/rejected": -501.71875, "rewards/weighted_accuracies": 0.565625011920929, "rewards/weighted_chosen": -1.0964477062225342, "rewards/weighted_margins": 0.13413086533546448, "rewards/weighted_rejected": -1.2302672863006592, "step": 530 }, { "epoch": 0.289350301406564, "grad_norm": 22.4523868560791, "learning_rate": 8.955355173281707e-07, "logits/chosen": -3.6913084983825684, "logits/rejected": -3.6981444358825684, "logps/chosen": -521.2437744140625, "logps/rejected": -594.4593505859375, "logps/weighted_chosen": -1.2616455554962158, "logps/weighted_rejected": -1.4534423351287842, "loss": 0.6462, "rewards/accuracies": 0.625, "rewards/chosen": -323.74688720703125, "rewards/margins": 58.17578125, "rewards/rejected": -381.83282470703125, "rewards/weighted_accuracies": 0.612500011920929, "rewards/weighted_chosen": -0.7408813238143921, "rewards/weighted_margins": 0.15880127251148224, "rewards/weighted_rejected": -0.89971923828125, "step": 540 }, { "epoch": 0.29470864032150035, "grad_norm": 53.5711784362793, "learning_rate": 8.897470867785002e-07, "logits/chosen": -3.683789014816284, "logits/rejected": -3.6670899391174316, "logps/chosen": -669.2562255859375, "logps/rejected": -769.59375, "logps/weighted_chosen": -1.6165649890899658, "logps/weighted_rejected": -1.8085448741912842, "loss": 0.6561, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -457.6937561035156, "rewards/margins": 77.93281555175781, "rewards/rejected": -535.5343627929688, "rewards/weighted_accuracies": 0.606249988079071, "rewards/weighted_chosen": -1.055206298828125, "rewards/weighted_margins": 0.17585143446922302, "rewards/weighted_rejected": -1.230871558189392, "step": 550 }, { "epoch": 0.3000669792364367, "grad_norm": 49.29216384887695, "learning_rate": 8.838223701790055e-07, "logits/chosen": -3.755566358566284, "logits/rejected": -3.782177686691284, "logps/chosen": -865.2218627929688, "logps/rejected": -996.9562377929688, "logps/weighted_chosen": -1.795434594154358, "logps/weighted_rejected": -2.018505811691284, "loss": 0.648, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -615.8656005859375, "rewards/margins": 98.30000305175781, "rewards/rejected": -714.1875, "rewards/weighted_accuracies": 0.606249988079071, "rewards/weighted_chosen": -1.202539086341858, "rewards/weighted_margins": 0.18366089463233948, "rewards/weighted_rejected": -1.3866698741912842, "step": 560 }, { "epoch": 0.3054253181513731, "grad_norm": 50.555702209472656, "learning_rate": 8.777634392737718e-07, "logits/chosen": -3.892578125, "logits/rejected": -3.912402391433716, "logps/chosen": -780.0984497070312, "logps/rejected": -869.4921875, "logps/weighted_chosen": -1.9073486328125, "logps/weighted_rejected": -2.052197217941284, "loss": 0.6896, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -588.2203369140625, "rewards/margins": 76.50312805175781, "rewards/rejected": -664.6828002929688, "rewards/weighted_accuracies": 0.5687500238418579, "rewards/weighted_chosen": -1.3397338390350342, "rewards/weighted_margins": 0.13177490234375, "rewards/weighted_rejected": -1.472259521484375, "step": 570 }, { "epoch": 0.31078365706630945, "grad_norm": 29.3996524810791, "learning_rate": 8.71572412738697e-07, "logits/chosen": -3.8792967796325684, "logits/rejected": -3.884960889816284, "logps/chosen": -791.7625122070312, "logps/rejected": -904.4625244140625, "logps/weighted_chosen": -1.7471435070037842, "logps/weighted_rejected": -1.941796898841858, "loss": 0.662, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -584.3125, "rewards/margins": 103.31172180175781, "rewards/rejected": -687.5281372070312, "rewards/weighted_accuracies": 0.612500011920929, "rewards/weighted_chosen": -1.219970703125, "rewards/weighted_margins": 0.17452391982078552, "rewards/weighted_rejected": -1.3947632312774658, "step": 580 }, { "epoch": 0.3161419959812458, "grad_norm": 62.64840316772461, "learning_rate": 8.652514554406387e-07, "logits/chosen": -3.75537109375, "logits/rejected": -3.780566453933716, "logps/chosen": -766.2156372070312, "logps/rejected": -876.4124755859375, "logps/weighted_chosen": -1.5784912109375, "logps/weighted_rejected": -1.806738257408142, "loss": 0.6446, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -561.0531005859375, "rewards/margins": 91.5546875, "rewards/rejected": -652.6312255859375, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": -1.047125220298767, "rewards/weighted_margins": 0.18712158501148224, "rewards/weighted_rejected": -1.234130859375, "step": 590 }, { "epoch": 0.3215003348961822, "grad_norm": 37.892852783203125, "learning_rate": 8.588027776804058e-07, "logits/chosen": -4.111035346984863, "logits/rejected": -4.132616996765137, "logps/chosen": -954.3093872070312, "logps/rejected": -1287.2874755859375, "logps/weighted_chosen": -1.9705321788787842, "logps/weighted_rejected": -2.290820360183716, "loss": 0.6267, "rewards/accuracies": 0.65625, "rewards/chosen": -767.3687744140625, "rewards/margins": 301.2730407714844, "rewards/rejected": -1068.2515869140625, "rewards/weighted_accuracies": 0.6468750238418579, "rewards/weighted_chosen": -1.4273865222930908, "rewards/weighted_margins": 0.2945709228515625, "rewards/weighted_rejected": -1.72186279296875, "step": 600 }, { "epoch": 0.32685867381111855, "grad_norm": 65.1230239868164, "learning_rate": 8.522286344198657e-07, "logits/chosen": -4.340234279632568, "logits/rejected": -4.395800590515137, "logps/chosen": -1246.2874755859375, "logps/rejected": -1507.5562744140625, "logps/weighted_chosen": -2.40673828125, "logps/weighted_rejected": -2.769335985183716, "loss": 0.6572, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1049.7093505859375, "rewards/margins": 244.8625030517578, "rewards/rejected": -1294.578125, "rewards/weighted_accuracies": 0.628125011920929, "rewards/weighted_chosen": -1.853796362876892, "rewards/weighted_margins": 0.33940428495407104, "rewards/weighted_rejected": -2.1933226585388184, "step": 610 }, { "epoch": 0.3322170127260549, "grad_norm": 29.82529640197754, "learning_rate": 8.455313244934324e-07, "logits/chosen": -4.122754096984863, "logits/rejected": -4.150390625, "logps/chosen": -1042.171875, "logps/rejected": -1219.421875, "logps/weighted_chosen": -2.124218702316284, "logps/weighted_rejected": -2.362744092941284, "loss": 0.657, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -852.1281127929688, "rewards/margins": 169.6007843017578, "rewards/rejected": -1021.8499755859375, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -1.55322265625, "rewards/weighted_margins": 0.25230103731155396, "rewards/weighted_rejected": -1.8051025867462158, "step": 620 }, { "epoch": 0.3375753516409913, "grad_norm": 37.24652862548828, "learning_rate": 8.38713189804215e-07, "logits/chosen": -3.853710889816284, "logits/rejected": -3.8750977516174316, "logps/chosen": -865.640625, "logps/rejected": -959.234375, "logps/weighted_chosen": -1.747460961341858, "logps/weighted_rejected": -1.9114258289337158, "loss": 0.6618, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -657.8468627929688, "rewards/margins": 90.248046875, "rewards/rejected": -747.7999877929688, "rewards/weighted_accuracies": 0.606249988079071, "rewards/weighted_chosen": -1.1962401866912842, "rewards/weighted_margins": 0.14417724311351776, "rewards/weighted_rejected": -1.3405029773712158, "step": 630 }, { "epoch": 0.34293369055592765, "grad_norm": 121.3833999633789, "learning_rate": 8.317766145051057e-07, "logits/chosen": -4.014550685882568, "logits/rejected": -4.060400485992432, "logps/chosen": -1148.1968994140625, "logps/rejected": -1330.6624755859375, "logps/weighted_chosen": -2.155810594558716, "logps/weighted_rejected": -2.436279296875, "loss": 0.6494, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -948.5125122070312, "rewards/margins": 166.63320922851562, "rewards/rejected": -1115.5250244140625, "rewards/weighted_accuracies": 0.653124988079071, "rewards/weighted_chosen": -1.6124267578125, "rewards/weighted_margins": 0.25153809785842896, "rewards/weighted_rejected": -1.8647949695587158, "step": 640 }, { "epoch": 0.348292029470864, "grad_norm": 80.68321990966797, "learning_rate": 8.247240241650917e-07, "logits/chosen": -4.319043159484863, "logits/rejected": -4.347851753234863, "logps/chosen": -1620.6312255859375, "logps/rejected": -1847.887451171875, "logps/weighted_chosen": -2.8577637672424316, "logps/weighted_rejected": -3.252002000808716, "loss": 0.6113, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1411.9906005859375, "rewards/margins": 225.7062530517578, "rewards/rejected": -1637.65625, "rewards/weighted_accuracies": 0.675000011920929, "rewards/weighted_chosen": -2.3053221702575684, "rewards/weighted_margins": 0.40415650606155396, "rewards/weighted_rejected": -2.710131883621216, "step": 650 }, { "epoch": 0.3536503683858004, "grad_norm": 84.14534759521484, "learning_rate": 8.175578849210894e-07, "logits/chosen": -4.389941215515137, "logits/rejected": -4.449902534484863, "logps/chosen": -1601.6624755859375, "logps/rejected": -1866.512451171875, "logps/weighted_chosen": -2.7519774436950684, "logps/weighted_rejected": -3.2392578125, "loss": 0.5898, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -1391.625, "rewards/margins": 258.7890625, "rewards/rejected": -1651.15625, "rewards/weighted_accuracies": 0.6656249761581421, "rewards/weighted_chosen": -2.201342821121216, "rewards/weighted_margins": 0.4779724180698395, "rewards/weighted_rejected": -2.679443359375, "step": 660 }, { "epoch": 0.35900870730073675, "grad_norm": 39.30218505859375, "learning_rate": 8.102807026155873e-07, "logits/chosen": -4.323534965515137, "logits/rejected": -4.3671875, "logps/chosen": -1877.043701171875, "logps/rejected": -2115.753173828125, "logps/weighted_chosen": -3.111035108566284, "logps/weighted_rejected": -3.53955078125, "loss": 0.6449, "rewards/accuracies": 0.609375, "rewards/chosen": -1668.4375, "rewards/margins": 238.2890625, "rewards/rejected": -1906.921875, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -2.5547852516174316, "rewards/weighted_margins": 0.4420410096645355, "rewards/weighted_rejected": -2.9969725608825684, "step": 670 }, { "epoch": 0.3643670462156731, "grad_norm": 49.729007720947266, "learning_rate": 8.028950219204099e-07, "logits/chosen": -3.9090819358825684, "logits/rejected": -3.9154295921325684, "logps/chosen": -1154.637451171875, "logps/rejected": -1364.612548828125, "logps/weighted_chosen": -2.158496141433716, "logps/weighted_rejected": -2.4698243141174316, "loss": 0.6288, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -945.5218505859375, "rewards/margins": 190.38906860351562, "rewards/rejected": -1135.9156494140625, "rewards/weighted_accuracies": 0.6468750238418579, "rewards/weighted_chosen": -1.6334960460662842, "rewards/weighted_margins": 0.27833253145217896, "rewards/weighted_rejected": -1.9119141101837158, "step": 680 }, { "epoch": 0.3697253851306095, "grad_norm": 51.215213775634766, "learning_rate": 7.954034254469e-07, "logits/chosen": -3.8226561546325684, "logits/rejected": -3.847851514816284, "logps/chosen": -1110.840576171875, "logps/rejected": -1422.15625, "logps/weighted_chosen": -2.027099609375, "logps/weighted_rejected": -2.3323731422424316, "loss": 0.6364, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -902.6968994140625, "rewards/margins": 299.96954345703125, "rewards/rejected": -1202.6640625, "rewards/weighted_accuracies": 0.628125011920929, "rewards/weighted_chosen": -1.477014183998108, "rewards/weighted_margins": 0.2761291563510895, "rewards/weighted_rejected": -1.753381371498108, "step": 690 }, { "epoch": 0.3750837240455459, "grad_norm": 75.38801574707031, "learning_rate": 7.878085328428368e-07, "logits/chosen": -3.9351563453674316, "logits/rejected": -3.950878858566284, "logps/chosen": -1454.487548828125, "logps/rejected": -1619.846923828125, "logps/weighted_chosen": -2.549121141433716, "logps/weighted_rejected": -2.860668897628784, "loss": 0.6352, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1233.6937255859375, "rewards/margins": 161.18359375, "rewards/rejected": -1394.9234619140625, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -1.9811279773712158, "rewards/weighted_margins": 0.295562744140625, "rewards/weighted_rejected": -2.277050733566284, "step": 700 }, { "epoch": 0.3804420629604823, "grad_norm": 47.24467468261719, "learning_rate": 7.801129998764014e-07, "logits/chosen": -4.013671875, "logits/rejected": -4.046777248382568, "logps/chosen": -1444.081298828125, "logps/rejected": -1578.3359375, "logps/weighted_chosen": -2.5169920921325684, "logps/weighted_rejected": -2.897216796875, "loss": 0.6181, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1228.768798828125, "rewards/margins": 132.5593719482422, "rewards/rejected": -1361.543701171875, "rewards/weighted_accuracies": 0.653124988079071, "rewards/weighted_chosen": -1.987060546875, "rewards/weighted_margins": 0.3536315858364105, "rewards/weighted_rejected": -2.3398680686950684, "step": 710 }, { "epoch": 0.38580040187541864, "grad_norm": 48.87159729003906, "learning_rate": 7.723195175075135e-07, "logits/chosen": -3.8736329078674316, "logits/rejected": -3.909863233566284, "logps/chosen": -1098.1187744140625, "logps/rejected": -1359.378173828125, "logps/weighted_chosen": -2.243359327316284, "logps/weighted_rejected": -2.5437254905700684, "loss": 0.6502, "rewards/accuracies": 0.671875, "rewards/chosen": -911.5625, "rewards/margins": 230.42422485351562, "rewards/rejected": -1141.9156494140625, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -1.6927001476287842, "rewards/weighted_margins": 0.2898498475551605, "rewards/weighted_rejected": -1.982519507408142, "step": 720 }, { "epoch": 0.391158740790355, "grad_norm": 46.05818176269531, "learning_rate": 7.644308109468609e-07, "logits/chosen": -3.802734375, "logits/rejected": -3.804003953933716, "logps/chosen": -1067.456298828125, "logps/rejected": -1283.940673828125, "logps/weighted_chosen": -2.007983446121216, "logps/weighted_rejected": -2.337109327316284, "loss": 0.6166, "rewards/accuracies": 0.625, "rewards/chosen": -863.9593505859375, "rewards/margins": 216.0382843017578, "rewards/rejected": -1079.768798828125, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -1.4805786609649658, "rewards/weighted_margins": 0.3025146424770355, "rewards/weighted_rejected": -1.7832763195037842, "step": 730 }, { "epoch": 0.3965170797052914, "grad_norm": 55.81437683105469, "learning_rate": 7.564496387029531e-07, "logits/chosen": -4.071093559265137, "logits/rejected": -4.120214939117432, "logps/chosen": -1393.003173828125, "logps/rejected": -1683.3375244140625, "logps/weighted_chosen": -2.53515625, "logps/weighted_rejected": -2.929443359375, "loss": 0.6251, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1187.206298828125, "rewards/margins": 281.1695251464844, "rewards/rejected": -1468.278076171875, "rewards/weighted_accuracies": 0.621874988079071, "rewards/weighted_chosen": -1.9933350086212158, "rewards/weighted_margins": 0.38404542207717896, "rewards/weighted_rejected": -2.378063917160034, "step": 740 }, { "epoch": 0.40187541862022774, "grad_norm": 70.06423950195312, "learning_rate": 7.483787916175306e-07, "logits/chosen": -4.136034965515137, "logits/rejected": -4.169043064117432, "logps/chosen": -1402.487548828125, "logps/rejected": -1632.203125, "logps/weighted_chosen": -2.555224657058716, "logps/weighted_rejected": -2.929248094558716, "loss": 0.6341, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -1186.949951171875, "rewards/margins": 219.8984375, "rewards/rejected": -1406.706298828125, "rewards/weighted_accuracies": 0.65625, "rewards/weighted_chosen": -2.005810499191284, "rewards/weighted_margins": 0.36415404081344604, "rewards/weighted_rejected": -2.3702635765075684, "step": 750 }, { "epoch": 0.4072337575351641, "grad_norm": 70.14170837402344, "learning_rate": 7.402210918896689e-07, "logits/chosen": -4.0087890625, "logits/rejected": -4.039746284484863, "logps/chosen": -1279.356201171875, "logps/rejected": -1483.34375, "logps/weighted_chosen": -2.5243163108825684, "logps/weighted_rejected": -2.907763719558716, "loss": 0.624, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -1078.0062255859375, "rewards/margins": 200.91171264648438, "rewards/rejected": -1279.0718994140625, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -1.9707763195037842, "rewards/weighted_margins": 0.36932373046875, "rewards/weighted_rejected": -2.340039014816284, "step": 760 }, { "epoch": 0.4125920964501005, "grad_norm": 42.04954147338867, "learning_rate": 7.31979392088917e-07, "logits/chosen": -4.008203029632568, "logits/rejected": -4.041699409484863, "logps/chosen": -1906.106201171875, "logps/rejected": -2324.574951171875, "logps/weighted_chosen": -3.2344727516174316, "logps/weighted_rejected": -3.656445264816284, "loss": 0.6464, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -1681.8343505859375, "rewards/margins": 403.7398376464844, "rewards/rejected": -2085.55615234375, "rewards/weighted_accuracies": 0.653124988079071, "rewards/weighted_chosen": -2.6626219749450684, "rewards/weighted_margins": 0.41541749238967896, "rewards/weighted_rejected": -3.079028367996216, "step": 770 }, { "epoch": 0.41795043536503684, "grad_norm": 60.82102966308594, "learning_rate": 7.236565741578162e-07, "logits/chosen": -3.94140625, "logits/rejected": -3.970410108566284, "logps/chosen": -1391.1500244140625, "logps/rejected": -1722.4625244140625, "logps/weighted_chosen": -2.736621141433716, "logps/weighted_rejected": -3.1324706077575684, "loss": 0.6103, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1210.75, "rewards/margins": 304.0874938964844, "rewards/rejected": -1515.206298828125, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -2.1943359375, "rewards/weighted_margins": 0.36918336153030396, "rewards/weighted_rejected": -2.5628418922424316, "step": 780 }, { "epoch": 0.4233087742799732, "grad_norm": 57.50530242919922, "learning_rate": 7.152555484041475e-07, "logits/chosen": -3.9559569358825684, "logits/rejected": -3.97998046875, "logps/chosen": -2113.05615234375, "logps/rejected": -2432.09375, "logps/weighted_chosen": -3.4646973609924316, "logps/weighted_rejected": -3.819775342941284, "loss": 0.665, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -1889.3656005859375, "rewards/margins": 320.93048095703125, "rewards/rejected": -2209.949951171875, "rewards/weighted_accuracies": 0.606249988079071, "rewards/weighted_chosen": -2.8851561546325684, "rewards/weighted_margins": 0.3280883729457855, "rewards/weighted_rejected": -3.2130370140075684, "step": 790 }, { "epoch": 0.4286671131949096, "grad_norm": 66.44673919677734, "learning_rate": 7.067792524832603e-07, "logits/chosen": -3.87353515625, "logits/rejected": -3.876953125, "logps/chosen": -1906.387451171875, "logps/rejected": -2406.27490234375, "logps/weighted_chosen": -3.2735352516174316, "logps/weighted_rejected": -3.800585985183716, "loss": 0.5965, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -1719.875, "rewards/margins": 478.44219970703125, "rewards/rejected": -2197.93115234375, "rewards/weighted_accuracies": 0.668749988079071, "rewards/weighted_chosen": -2.7166504859924316, "rewards/weighted_margins": 0.4966064393520355, "rewards/weighted_rejected": -3.2127442359924316, "step": 800 }, { "epoch": 0.43402545210984594, "grad_norm": 46.25076675415039, "learning_rate": 6.982306503708387e-07, "logits/chosen": -3.9263672828674316, "logits/rejected": -3.9388670921325684, "logps/chosen": -2048.074951171875, "logps/rejected": -2567.831298828125, "logps/weighted_chosen": -3.1552734375, "logps/weighted_rejected": -3.6259765625, "loss": 0.6162, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1836.296875, "rewards/margins": 497.3296813964844, "rewards/rejected": -2333.86865234375, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -2.6156249046325684, "rewards/weighted_margins": 0.43950194120407104, "rewards/weighted_rejected": -3.054370164871216, "step": 810 }, { "epoch": 0.4393837910247823, "grad_norm": 49.19358825683594, "learning_rate": 6.896127313264642e-07, "logits/chosen": -3.8050780296325684, "logits/rejected": -3.810253858566284, "logps/chosen": -1704.9000244140625, "logps/rejected": -2074.984375, "logps/weighted_chosen": -2.6828856468200684, "logps/weighted_rejected": -3.102343797683716, "loss": 0.6091, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1484.278076171875, "rewards/margins": 349.5960998535156, "rewards/rejected": -1833.125, "rewards/weighted_accuracies": 0.668749988079071, "rewards/weighted_chosen": -2.126025438308716, "rewards/weighted_margins": 0.3870300352573395, "rewards/weighted_rejected": -2.5131592750549316, "step": 820 }, { "epoch": 0.4447421299397187, "grad_norm": 60.69411849975586, "learning_rate": 6.809285088483361e-07, "logits/chosen": -3.877636671066284, "logits/rejected": -3.895800828933716, "logps/chosen": -2122.018798828125, "logps/rejected": -2766.80615234375, "logps/weighted_chosen": -3.4732422828674316, "logps/weighted_rejected": -4.013476371765137, "loss": 0.589, "rewards/accuracies": 0.625, "rewards/chosen": -1923.1968994140625, "rewards/margins": 628.5843505859375, "rewards/rejected": -2552.231201171875, "rewards/weighted_accuracies": 0.6812499761581421, "rewards/weighted_chosen": -2.9307618141174316, "rewards/weighted_margins": 0.514050304889679, "rewards/weighted_rejected": -3.4447021484375, "step": 830 }, { "epoch": 0.45010046885465504, "grad_norm": 41.00968551635742, "learning_rate": 6.721810196195174e-07, "logits/chosen": -3.9095702171325684, "logits/rejected": -3.932910203933716, "logps/chosen": -2189.53125, "logps/rejected": -2920.6875, "logps/weighted_chosen": -3.7161622047424316, "logps/weighted_rejected": -4.303027153015137, "loss": 0.5847, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -1995.074951171875, "rewards/margins": 712.40625, "rewards/rejected": -2706.63134765625, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -3.1697754859924316, "rewards/weighted_margins": 0.5831848382949829, "rewards/weighted_rejected": -3.75390625, "step": 840 }, { "epoch": 0.4554588077695914, "grad_norm": 35.70737838745117, "learning_rate": 6.633733224460737e-07, "logits/chosen": -3.749706983566284, "logits/rejected": -3.76806640625, "logps/chosen": -1573.5374755859375, "logps/rejected": -2201.737548828125, "logps/weighted_chosen": -2.6988282203674316, "logps/weighted_rejected": -3.203369140625, "loss": 0.5615, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1371.199951171875, "rewards/margins": 607.2484130859375, "rewards/rejected": -1978.456298828125, "rewards/weighted_accuracies": 0.706250011920929, "rewards/weighted_chosen": -2.152587890625, "rewards/weighted_margins": 0.48516845703125, "rewards/weighted_rejected": -2.6385498046875, "step": 850 }, { "epoch": 0.4608171466845278, "grad_norm": 73.64376831054688, "learning_rate": 6.545084971874736e-07, "logits/chosen": -3.9317383766174316, "logits/rejected": -3.955859422683716, "logps/chosen": -2716.58740234375, "logps/rejected": -3423.94384765625, "logps/weighted_chosen": -4.377148628234863, "logps/weighted_rejected": -5.021288871765137, "loss": 0.583, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -2526.012451171875, "rewards/margins": 687.0515747070312, "rewards/rejected": -3212.625, "rewards/weighted_accuracies": 0.699999988079071, "rewards/weighted_chosen": -3.8341307640075684, "rewards/weighted_margins": 0.6270385980606079, "rewards/weighted_rejected": -4.462500095367432, "step": 860 }, { "epoch": 0.46617548559946415, "grad_norm": 54.467132568359375, "learning_rate": 6.455896436796313e-07, "logits/chosen": -3.9200196266174316, "logits/rejected": -3.932910203933716, "logps/chosen": -3902.112548828125, "logps/rejected": -4818.7626953125, "logps/weighted_chosen": -5.9072265625, "logps/weighted_rejected": -6.536718845367432, "loss": 0.6265, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -3707.512451171875, "rewards/margins": 898.1077880859375, "rewards/rejected": -4605.3125, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -5.350195407867432, "rewards/weighted_margins": 0.618518054485321, "rewards/weighted_rejected": -5.966601371765137, "step": 870 }, { "epoch": 0.4715338245144005, "grad_norm": 58.258487701416016, "learning_rate": 6.3661988065096e-07, "logits/chosen": -3.58642578125, "logits/rejected": -3.57080078125, "logps/chosen": -2729.300048828125, "logps/rejected": -3327.96875, "logps/weighted_chosen": -4.433691501617432, "logps/weighted_rejected": -4.90625, "loss": 0.5904, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2531.981201171875, "rewards/margins": 591.4375, "rewards/rejected": -3122.64990234375, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -3.89013671875, "rewards/weighted_margins": 0.4582275450229645, "rewards/weighted_rejected": -4.347851753234863, "step": 880 }, { "epoch": 0.4768921634293369, "grad_norm": 47.88425827026367, "learning_rate": 6.276023446318213e-07, "logits/chosen": -3.6739258766174316, "logits/rejected": -3.68017578125, "logps/chosen": -2387.86865234375, "logps/rejected": -2776.918701171875, "logps/weighted_chosen": -3.9383788108825684, "logps/weighted_rejected": -4.536913871765137, "loss": 0.5673, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -2187.94677734375, "rewards/margins": 379.22186279296875, "rewards/rejected": -2567.375, "rewards/weighted_accuracies": 0.746874988079071, "rewards/weighted_chosen": -3.3819336891174316, "rewards/weighted_margins": 0.5686889886856079, "rewards/weighted_rejected": -3.9501953125, "step": 890 }, { "epoch": 0.4822505023442733, "grad_norm": 66.42100524902344, "learning_rate": 6.185401888577487e-07, "logits/chosen": -3.93115234375, "logits/rejected": -3.9623045921325684, "logps/chosen": -2366.125, "logps/rejected": -3888.5625, "logps/weighted_chosen": -3.873828172683716, "logps/weighted_rejected": -4.703711032867432, "loss": 0.5527, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -2176.15625, "rewards/margins": 1495.7890625, "rewards/rejected": -3671.89990234375, "rewards/weighted_accuracies": 0.6812499761581421, "rewards/weighted_chosen": -3.337158203125, "rewards/weighted_margins": 0.809399425983429, "rewards/weighted_rejected": -4.147070407867432, "step": 900 }, { "epoch": 0.48760884125920967, "grad_norm": 55.539608001708984, "learning_rate": 6.094365821668307e-07, "logits/chosen": -3.839550733566284, "logits/rejected": -3.8837890625, "logps/chosen": -1797.953125, "logps/rejected": -2595.637451171875, "logps/weighted_chosen": -2.9969725608825684, "logps/weighted_rejected": -3.619189500808716, "loss": 0.5987, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1617.2125244140625, "rewards/margins": 776.7586059570312, "rewards/rejected": -2393.4765625, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -2.473498582839966, "rewards/weighted_margins": 0.5845702886581421, "rewards/weighted_rejected": -3.0577635765075684, "step": 910 }, { "epoch": 0.49296718017414604, "grad_norm": 45.468833923339844, "learning_rate": 6.002947078916364e-07, "logits/chosen": -3.766308546066284, "logits/rejected": -3.795214891433716, "logps/chosen": -1605.893798828125, "logps/rejected": -2098.050048828125, "logps/weighted_chosen": -2.6654295921325684, "logps/weighted_rejected": -3.1903319358825684, "loss": 0.5926, "rewards/accuracies": 0.640625, "rewards/chosen": -1392.9937744140625, "rewards/margins": 466.5140686035156, "rewards/rejected": -1860.2249755859375, "rewards/weighted_accuracies": 0.6812499761581421, "rewards/weighted_chosen": -2.1322021484375, "rewards/weighted_margins": 0.5062255859375, "rewards/weighted_rejected": -2.6379637718200684, "step": 920 }, { "epoch": 0.4983255190890824, "grad_norm": 40.75309753417969, "learning_rate": 5.911177627460738e-07, "logits/chosen": -3.7451171875, "logits/rejected": -3.78662109375, "logps/chosen": -1467.300048828125, "logps/rejected": -2011.8218994140625, "logps/weighted_chosen": -2.669970750808716, "logps/weighted_rejected": -3.193066358566284, "loss": 0.6164, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -1248.856201171875, "rewards/margins": 535.0531005859375, "rewards/rejected": -1784.440673828125, "rewards/weighted_accuracies": 0.6656249761581421, "rewards/weighted_chosen": -2.093676805496216, "rewards/weighted_margins": 0.520129382610321, "rewards/weighted_rejected": -2.613964796066284, "step": 930 }, { "epoch": 0.5036838580040187, "grad_norm": 56.69281768798828, "learning_rate": 5.819089557075688e-07, "logits/chosen": -3.661328077316284, "logits/rejected": -3.725781202316284, "logps/chosen": -1287.831298828125, "logps/rejected": -1729.199951171875, "logps/weighted_chosen": -2.5040040016174316, "logps/weighted_rejected": -3.1222167015075684, "loss": 0.5475, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -1086.8968505859375, "rewards/margins": 426.1015625, "rewards/rejected": -1512.9375, "rewards/weighted_accuracies": 0.737500011920929, "rewards/weighted_chosen": -1.9895751476287842, "rewards/weighted_margins": 0.597973644733429, "rewards/weighted_rejected": -2.5875487327575684, "step": 940 }, { "epoch": 0.5090421969189551, "grad_norm": 198.49781799316406, "learning_rate": 5.726715068949564e-07, "logits/chosen": -3.756054639816284, "logits/rejected": -3.787304639816284, "logps/chosen": -2703.00634765625, "logps/rejected": -2929.199951171875, "logps/weighted_chosen": -4.483007907867432, "logps/weighted_rejected": -5.202538967132568, "loss": 0.6233, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -2486.63134765625, "rewards/margins": 220.6328125, "rewards/rejected": -2707.15625, "rewards/weighted_accuracies": 0.675000011920929, "rewards/weighted_chosen": -3.906005859375, "rewards/weighted_margins": 0.69610595703125, "rewards/weighted_rejected": -4.601171970367432, "step": 950 }, { "epoch": 0.5144005358338914, "grad_norm": 94.89183044433594, "learning_rate": 5.634086464424742e-07, "logits/chosen": -3.745898485183716, "logits/rejected": -3.765429735183716, "logps/chosen": -2798.706298828125, "logps/rejected": -3669.106201171875, "logps/weighted_chosen": -5.272851467132568, "logps/weighted_rejected": -6.047753810882568, "loss": 0.5864, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -2610.640625, "rewards/margins": 852.515625, "rewards/rejected": -3462.06884765625, "rewards/weighted_accuracies": 0.715624988079071, "rewards/weighted_chosen": -4.717529296875, "rewards/weighted_margins": 0.750353991985321, "rewards/weighted_rejected": -5.467114448547363, "step": 960 }, { "epoch": 0.5197588747488279, "grad_norm": 46.3341178894043, "learning_rate": 5.54123613370256e-07, "logits/chosen": -3.57080078125, "logits/rejected": -3.592968702316284, "logps/chosen": -2180.831298828125, "logps/rejected": -2691.08740234375, "logps/weighted_chosen": -4.2587890625, "logps/weighted_rejected": -4.9375, "loss": 0.562, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1985.668701171875, "rewards/margins": 503.3203125, "rewards/rejected": -2488.737548828125, "rewards/weighted_accuracies": 0.690625011920929, "rewards/weighted_chosen": -3.743603467941284, "rewards/weighted_margins": 0.6626037359237671, "rewards/weighted_rejected": -4.405859470367432, "step": 970 }, { "epoch": 0.5251172136637642, "grad_norm": 44.62388610839844, "learning_rate": 5.448196544517167e-07, "logits/chosen": -3.4229493141174316, "logits/rejected": -3.43896484375, "logps/chosen": -1911.8687744140625, "logps/rejected": -2449.68115234375, "logps/weighted_chosen": -3.70849609375, "logps/weighted_rejected": -4.321630954742432, "loss": 0.5762, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1712.768798828125, "rewards/margins": 526.8828125, "rewards/rejected": -2239.606201171875, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -3.1871094703674316, "rewards/weighted_margins": 0.5860961675643921, "rewards/weighted_rejected": -3.773242235183716, "step": 980 }, { "epoch": 0.5304755525787006, "grad_norm": 49.71703338623047, "learning_rate": 5.355000230782267e-07, "logits/chosen": -3.386523485183716, "logits/rejected": -3.404003858566284, "logps/chosen": -1865.487548828125, "logps/rejected": -2300.90625, "logps/weighted_chosen": -3.5951170921325684, "logps/weighted_rejected": -4.100878715515137, "loss": 0.602, "rewards/accuracies": 0.625, "rewards/chosen": -1660.425048828125, "rewards/margins": 418.15234375, "rewards/rejected": -2078.49072265625, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -3.0364990234375, "rewards/weighted_margins": 0.4848266541957855, "rewards/weighted_rejected": -3.5208983421325684, "step": 990 }, { "epoch": 0.535833891493637, "grad_norm": 54.951995849609375, "learning_rate": 5.26167978121472e-07, "logits/chosen": -3.4642577171325684, "logits/rejected": -3.4644532203674316, "logps/chosen": -2232.06884765625, "logps/rejected": -2963.16259765625, "logps/weighted_chosen": -4.036523342132568, "logps/weighted_rejected": -4.75390625, "loss": 0.5535, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2023.6187744140625, "rewards/margins": 708.0984497070312, "rewards/rejected": -2731.19384765625, "rewards/weighted_accuracies": 0.7250000238418579, "rewards/weighted_chosen": -3.497631788253784, "rewards/weighted_margins": 0.680041491985321, "rewards/weighted_rejected": -4.178027153015137, "step": 1000 }, { "epoch": 0.535833891493637, "eval_logits/chosen": -3.549294948577881, "eval_logits/rejected": -3.559162139892578, "eval_logps/chosen": -2485.90380859375, "eval_logps/rejected": -3043.268310546875, "eval_logps/weighted_chosen": -4.452103137969971, "eval_logps/weighted_rejected": -5.134407043457031, "eval_loss": 0.5839847326278687, "eval_rewards/accuracies": 0.6418558359146118, "eval_rewards/chosen": -2282.807861328125, "eval_rewards/margins": 543.8858642578125, "eval_rewards/rejected": -2826.91064453125, "eval_rewards/weighted_accuracies": 0.6829105615615845, "eval_rewards/weighted_chosen": -3.896622657775879, "eval_rewards/weighted_margins": 0.6617620587348938, "eval_rewards/weighted_rejected": -4.558384418487549, "eval_runtime": 2155.1416, "eval_samples_per_second": 1.389, "eval_steps_per_second": 0.348, "step": 1000 }, { "epoch": 0.5411922304085733, "grad_norm": 79.4778060913086, "learning_rate": 5.16826782793897e-07, "logits/chosen": -3.510449171066284, "logits/rejected": -3.534863233566284, "logps/chosen": -2491.237548828125, "logps/rejected": -3156.91259765625, "logps/weighted_chosen": -4.569384574890137, "logps/weighted_rejected": -5.425585746765137, "loss": 0.5358, "rewards/accuracies": 0.65625, "rewards/chosen": -2292.84375, "rewards/margins": 644.8546752929688, "rewards/rejected": -2937.61865234375, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -4.032910346984863, "rewards/weighted_margins": 0.826770007610321, "rewards/weighted_rejected": -4.859228610992432, "step": 1010 }, { "epoch": 0.5465505693235098, "grad_norm": 61.079864501953125, "learning_rate": 5.074797035076318e-07, "logits/chosen": -3.5478515625, "logits/rejected": -3.567187547683716, "logps/chosen": -2653.21240234375, "logps/rejected": -3437.8125, "logps/weighted_chosen": -4.941064357757568, "logps/weighted_rejected": -5.822851657867432, "loss": 0.5441, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -2446.54052734375, "rewards/margins": 765.1218872070312, "rewards/rejected": -3211.3125, "rewards/weighted_accuracies": 0.746874988079071, "rewards/weighted_chosen": -4.367761135101318, "rewards/weighted_margins": 0.8629516363143921, "rewards/weighted_rejected": -5.231005668640137, "step": 1020 }, { "epoch": 0.5519089082384461, "grad_norm": 47.398704528808594, "learning_rate": 4.981300087322984e-07, "logits/chosen": -3.4212889671325684, "logits/rejected": -3.440722703933716, "logps/chosen": -2479.63134765625, "logps/rejected": -3158.550048828125, "logps/weighted_chosen": -4.55029296875, "logps/weighted_rejected": -5.351855278015137, "loss": 0.5583, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2293.41259765625, "rewards/margins": 655.875, "rewards/rejected": -2951.34375, "rewards/weighted_accuracies": 0.715624988079071, "rewards/weighted_chosen": -3.9914307594299316, "rewards/weighted_margins": 0.7806396484375, "rewards/weighted_rejected": -4.77197265625, "step": 1030 }, { "epoch": 0.5572672471533825, "grad_norm": 63.054168701171875, "learning_rate": 4.887809678520975e-07, "logits/chosen": -3.3487305641174316, "logits/rejected": -3.3558592796325684, "logps/chosen": -2364.0625, "logps/rejected": -2929.425048828125, "logps/weighted_chosen": -4.341748237609863, "logps/weighted_rejected": -5.078515529632568, "loss": 0.5706, "rewards/accuracies": 0.640625, "rewards/chosen": -2166.137451171875, "rewards/margins": 556.125, "rewards/rejected": -2720.5625, "rewards/weighted_accuracies": 0.7281249761581421, "rewards/weighted_chosen": -3.79736328125, "rewards/weighted_margins": 0.707104504108429, "rewards/weighted_rejected": -4.505615234375, "step": 1040 }, { "epoch": 0.5626255860683188, "grad_norm": 66.39554595947266, "learning_rate": 4.794358500225781e-07, "logits/chosen": -3.335742235183716, "logits/rejected": -3.342968702316284, "logps/chosen": -2968.112548828125, "logps/rejected": -3496.21240234375, "logps/weighted_chosen": -4.970947265625, "logps/weighted_rejected": -5.766699314117432, "loss": 0.5347, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2753.925048828125, "rewards/margins": 525.3312377929688, "rewards/rejected": -3278.762451171875, "rewards/weighted_accuracies": 0.737500011920929, "rewards/weighted_chosen": -4.405615329742432, "rewards/weighted_margins": 0.7887328863143921, "rewards/weighted_rejected": -5.193163871765137, "step": 1050 }, { "epoch": 0.5679839249832552, "grad_norm": 57.971885681152344, "learning_rate": 4.700979230274829e-07, "logits/chosen": -3.3427734375, "logits/rejected": -3.34375, "logps/chosen": -3232.53759765625, "logps/rejected": -3996.81884765625, "logps/weighted_chosen": -5.7021484375, "logps/weighted_rejected": -6.397656440734863, "loss": 0.5907, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -3012.375, "rewards/margins": 746.2734375, "rewards/rejected": -3758.175048828125, "rewards/weighted_accuracies": 0.6968749761581421, "rewards/weighted_chosen": -5.131152153015137, "rewards/weighted_margins": 0.684863269329071, "rewards/weighted_rejected": -5.815527439117432, "step": 1060 }, { "epoch": 0.5733422638981915, "grad_norm": 48.712318420410156, "learning_rate": 4.6077045213607755e-07, "logits/chosen": -3.2083983421325684, "logits/rejected": -3.209277391433716, "logps/chosen": -2662.112548828125, "logps/rejected": -3147.550048828125, "logps/weighted_chosen": -4.856835842132568, "logps/weighted_rejected": -5.518847465515137, "loss": 0.5756, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -2463.800048828125, "rewards/margins": 479.9437561035156, "rewards/rejected": -2944.550048828125, "rewards/weighted_accuracies": 0.699999988079071, "rewards/weighted_chosen": -4.350390434265137, "rewards/weighted_margins": 0.623095691204071, "rewards/weighted_rejected": -4.972216606140137, "step": 1070 }, { "epoch": 0.578700602813128, "grad_norm": 48.57645797729492, "learning_rate": 4.514566989613559e-07, "logits/chosen": -3.207324266433716, "logits/rejected": -3.2095704078674316, "logps/chosen": -2496.25634765625, "logps/rejected": -3165.72509765625, "logps/weighted_chosen": -4.495214939117432, "logps/weighted_rejected": -5.168164253234863, "loss": 0.5612, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -2264.86865234375, "rewards/margins": 653.3562622070312, "rewards/rejected": -2917.50634765625, "rewards/weighted_accuracies": 0.706250011920929, "rewards/weighted_chosen": -3.945605516433716, "rewards/weighted_margins": 0.647961437702179, "rewards/weighted_rejected": -4.594580173492432, "step": 1080 }, { "epoch": 0.5840589417280643, "grad_norm": 65.56918334960938, "learning_rate": 4.4215992031952614e-07, "logits/chosen": -3.2230467796325684, "logits/rejected": -3.2120118141174316, "logps/chosen": -2563.293701171875, "logps/rejected": -3421.875, "logps/weighted_chosen": -5.21826171875, "logps/weighted_rejected": -5.978906154632568, "loss": 0.5596, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2379.03759765625, "rewards/margins": 820.65625, "rewards/rejected": -3198.41259765625, "rewards/weighted_accuracies": 0.7093750238418579, "rewards/weighted_chosen": -4.678418159484863, "rewards/weighted_margins": 0.7149902582168579, "rewards/weighted_rejected": -5.3935546875, "step": 1090 }, { "epoch": 0.5894172806430007, "grad_norm": 68.06615447998047, "learning_rate": 4.328833670911724e-07, "logits/chosen": -3.170605421066284, "logits/rejected": -3.166699171066284, "logps/chosen": -3257.112548828125, "logps/rejected": -4018.00634765625, "logps/weighted_chosen": -5.70556640625, "logps/weighted_rejected": -6.41455078125, "loss": 0.5918, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -3049.296875, "rewards/margins": 746.9202880859375, "rewards/rejected": -3795.425048828125, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -5.149609565734863, "rewards/weighted_margins": 0.6976073980331421, "rewards/weighted_rejected": -5.848535060882568, "step": 1100 }, { "epoch": 0.594775619557937, "grad_norm": 47.62029266357422, "learning_rate": 4.236302830844931e-07, "logits/chosen": -3.06396484375, "logits/rejected": -3.0479493141174316, "logps/chosen": -2672.137451171875, "logps/rejected": -3189.27490234375, "logps/weighted_chosen": -4.959668159484863, "logps/weighted_rejected": -5.608105659484863, "loss": 0.6, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -2458.83740234375, "rewards/margins": 508.86407470703125, "rewards/rejected": -2967.31884765625, "rewards/weighted_accuracies": 0.706250011920929, "rewards/weighted_chosen": -4.393310546875, "rewards/weighted_margins": 0.6424316167831421, "rewards/weighted_rejected": -5.03564453125, "step": 1110 }, { "epoch": 0.6001339584728734, "grad_norm": 59.24471664428711, "learning_rate": 4.144039039010124e-07, "logits/chosen": -2.9786133766174316, "logits/rejected": -2.9564452171325684, "logps/chosen": -2158.97509765625, "logps/rejected": -2407.59375, "logps/weighted_chosen": -4.17529296875, "logps/weighted_rejected": -4.825976371765137, "loss": 0.546, "rewards/accuracies": 0.640625, "rewards/chosen": -1953.484375, "rewards/margins": 245.921875, "rewards/rejected": -2199.78759765625, "rewards/weighted_accuracies": 0.753125011920929, "rewards/weighted_chosen": -3.6058592796325684, "rewards/weighted_margins": 0.63897705078125, "rewards/weighted_rejected": -4.246069431304932, "step": 1120 }, { "epoch": 0.6054922973878097, "grad_norm": 63.221038818359375, "learning_rate": 4.052074558041608e-07, "logits/chosen": -3.0283203125, "logits/rejected": -3.017285108566284, "logps/chosen": -2134.97509765625, "logps/rejected": -2573.737548828125, "logps/weighted_chosen": -4.003759860992432, "logps/weighted_rejected": -4.656933784484863, "loss": 0.5513, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -1908.206298828125, "rewards/margins": 434.62811279296875, "rewards/rejected": -2343.28125, "rewards/weighted_accuracies": 0.753125011920929, "rewards/weighted_chosen": -3.4220213890075684, "rewards/weighted_margins": 0.6371825933456421, "rewards/weighted_rejected": -4.058789253234863, "step": 1130 }, { "epoch": 0.6108506363027462, "grad_norm": 52.066368103027344, "learning_rate": 3.960441545911204e-07, "logits/chosen": -3.1263670921325684, "logits/rejected": -3.1119141578674316, "logps/chosen": -1908.762451171875, "logps/rejected": -2521.012451171875, "logps/weighted_chosen": -4.003027439117432, "logps/weighted_rejected": -4.765527248382568, "loss": 0.5214, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1710.4625244140625, "rewards/margins": 589.6312255859375, "rewards/rejected": -2299.95947265625, "rewards/weighted_accuracies": 0.7406250238418579, "rewards/weighted_chosen": -3.4476075172424316, "rewards/weighted_margins": 0.731738269329071, "rewards/weighted_rejected": -4.180151462554932, "step": 1140 }, { "epoch": 0.6162089752176825, "grad_norm": 70.68743896484375, "learning_rate": 3.8691720446833187e-07, "logits/chosen": -3.215527296066284, "logits/rejected": -3.2021484375, "logps/chosen": -2513.35009765625, "logps/rejected": -3176.1875, "logps/weighted_chosen": -5.13623046875, "logps/weighted_rejected": -5.89794921875, "loss": 0.5521, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2315.543701171875, "rewards/margins": 643.4359130859375, "rewards/rejected": -2958.637451171875, "rewards/weighted_accuracies": 0.731249988079071, "rewards/weighted_chosen": -4.585058689117432, "rewards/weighted_margins": 0.741162121295929, "rewards/weighted_rejected": -5.324120998382568, "step": 1150 }, { "epoch": 0.6215673141326189, "grad_norm": 58.93326950073242, "learning_rate": 3.778297969310529e-07, "logits/chosen": -3.2992186546325684, "logits/rejected": -3.297168016433716, "logps/chosen": -2992.449951171875, "logps/rejected": -3617.85009765625, "logps/weighted_chosen": -5.724609375, "logps/weighted_rejected": -6.642480373382568, "loss": 0.5462, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -2794.61865234375, "rewards/margins": 608.9249877929688, "rewards/rejected": -3405.16259765625, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -5.183398246765137, "rewards/weighted_margins": 0.877758800983429, "rewards/weighted_rejected": -6.062597751617432, "step": 1160 }, { "epoch": 0.6269256530475552, "grad_norm": 84.15338134765625, "learning_rate": 3.687851096473624e-07, "logits/chosen": -3.316210985183716, "logits/rejected": -3.3095703125, "logps/chosen": -2626.550048828125, "logps/rejected": -3306.75, "logps/weighted_chosen": -5.133008003234863, "logps/weighted_rejected": -5.990136623382568, "loss": 0.5551, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -2415.487548828125, "rewards/margins": 686.6953125, "rewards/rejected": -3102.28759765625, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -4.54345703125, "rewards/weighted_margins": 0.861572265625, "rewards/weighted_rejected": -5.404882907867432, "step": 1170 }, { "epoch": 0.6322839919624916, "grad_norm": 71.16552734375, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -3.3482422828674316, "logits/rejected": -3.3451170921325684, "logps/chosen": -2592.78759765625, "logps/rejected": -3643.425048828125, "logps/weighted_chosen": -5.055273532867432, "logps/weighted_rejected": -5.8125, "loss": 0.5426, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -2389.77490234375, "rewards/margins": 1034.065673828125, "rewards/rejected": -3422.65625, "rewards/weighted_accuracies": 0.6875, "rewards/weighted_chosen": -4.496972560882568, "rewards/weighted_margins": 0.748461902141571, "rewards/weighted_rejected": -5.245409965515137, "step": 1180 }, { "epoch": 0.6376423308774279, "grad_norm": 67.54759979248047, "learning_rate": 3.5083653071542197e-07, "logits/chosen": -3.2964844703674316, "logits/rejected": -3.294238328933716, "logps/chosen": -2341.16259765625, "logps/rejected": -2609.731201171875, "logps/weighted_chosen": -4.109082221984863, "logps/weighted_rejected": -4.714648246765137, "loss": 0.6061, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -2109.64697265625, "rewards/margins": 264.6039123535156, "rewards/rejected": -2374.91259765625, "rewards/weighted_accuracies": 0.637499988079071, "rewards/weighted_chosen": -3.56103515625, "rewards/weighted_margins": 0.5668579339981079, "rewards/weighted_rejected": -4.129150390625, "step": 1190 }, { "epoch": 0.6430006697923644, "grad_norm": 78.90850067138672, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -3.225878953933716, "logits/rejected": -3.225878953933716, "logps/chosen": -2252.49365234375, "logps/rejected": -3008.043701171875, "logps/weighted_chosen": -4.048047065734863, "logps/weighted_rejected": -4.911328315734863, "loss": 0.4958, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -2030.903076171875, "rewards/margins": 770.2398681640625, "rewards/rejected": -2799.190673828125, "rewards/weighted_accuracies": 0.762499988079071, "rewards/weighted_chosen": -3.5079102516174316, "rewards/weighted_margins": 0.8606201410293579, "rewards/weighted_rejected": -4.367968559265137, "step": 1200 }, { "epoch": 0.6483590087073008, "grad_norm": 84.27739715576172, "learning_rate": 3.330965703831146e-07, "logits/chosen": -3.2769532203674316, "logits/rejected": -3.26123046875, "logps/chosen": -2761.58740234375, "logps/rejected": -3984.36865234375, "logps/weighted_chosen": -5.098242282867432, "logps/weighted_rejected": -6.088086128234863, "loss": 0.5283, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2558.112548828125, "rewards/margins": 1182.4781494140625, "rewards/rejected": -3739.737548828125, "rewards/weighted_accuracies": 0.7406250238418579, "rewards/weighted_chosen": -4.5693359375, "rewards/weighted_margins": 0.9654296636581421, "rewards/weighted_rejected": -5.533789157867432, "step": 1210 }, { "epoch": 0.6537173476222371, "grad_norm": 90.22477722167969, "learning_rate": 3.243125879593286e-07, "logits/chosen": -3.3125977516174316, "logits/rejected": -3.286816358566284, "logps/chosen": -2985.77490234375, "logps/rejected": -3794.83740234375, "logps/weighted_chosen": -5.321044921875, "logps/weighted_rejected": -6.415722846984863, "loss": 0.5187, "rewards/accuracies": 0.640625, "rewards/chosen": -2769.6875, "rewards/margins": 801.0875244140625, "rewards/rejected": -3571.24365234375, "rewards/weighted_accuracies": 0.737500011920929, "rewards/weighted_chosen": -4.761767387390137, "rewards/weighted_margins": 1.061132788658142, "rewards/weighted_rejected": -5.822265625, "step": 1220 }, { "epoch": 0.6590756865371735, "grad_norm": 69.05809783935547, "learning_rate": 3.1559003958906903e-07, "logits/chosen": -3.3125977516174316, "logits/rejected": -3.315234422683716, "logps/chosen": -2401.887451171875, "logps/rejected": -3324.375, "logps/weighted_chosen": -5.186327934265137, "logps/weighted_rejected": -6.095703125, "loss": 0.5257, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -2212.887451171875, "rewards/margins": 904.328125, "rewards/rejected": -3118.925048828125, "rewards/weighted_accuracies": 0.75, "rewards/weighted_chosen": -4.607031345367432, "rewards/weighted_margins": 0.905322253704071, "rewards/weighted_rejected": -5.51513671875, "step": 1230 }, { "epoch": 0.6644340254521098, "grad_norm": 70.57830047607422, "learning_rate": 3.069319753571269e-07, "logits/chosen": -3.2847657203674316, "logits/rejected": -3.280468702316284, "logps/chosen": -2278.425048828125, "logps/rejected": -3086.425048828125, "logps/weighted_chosen": -4.578027248382568, "logps/weighted_rejected": -5.426171779632568, "loss": 0.5812, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -2084.93115234375, "rewards/margins": 796.6468505859375, "rewards/rejected": -2879.862548828125, "rewards/weighted_accuracies": 0.6968749761581421, "rewards/weighted_chosen": -4.033203125, "rewards/weighted_margins": 0.810620129108429, "rewards/weighted_rejected": -4.844140529632568, "step": 1240 }, { "epoch": 0.6697923643670463, "grad_norm": 60.08575439453125, "learning_rate": 2.983414227995975e-07, "logits/chosen": -3.1739258766174316, "logits/rejected": -3.1656250953674316, "logps/chosen": -2231.875, "logps/rejected": -2701.074951171875, "logps/weighted_chosen": -4.2490234375, "logps/weighted_rejected": -4.935449123382568, "loss": 0.5667, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -2014.362548828125, "rewards/margins": 461.45001220703125, "rewards/rejected": -2474.199951171875, "rewards/weighted_accuracies": 0.703125, "rewards/weighted_chosen": -3.686474561691284, "rewards/weighted_margins": 0.6810302734375, "rewards/weighted_rejected": -4.368066310882568, "step": 1250 }, { "epoch": 0.6751507032819826, "grad_norm": 76.56852722167969, "learning_rate": 2.898213858452173e-07, "logits/chosen": -3.127246141433716, "logits/rejected": -3.1114258766174316, "logps/chosen": -2059.981201171875, "logps/rejected": -2350.03759765625, "logps/weighted_chosen": -4.0869140625, "logps/weighted_rejected": -4.69140625, "loss": 0.5527, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1861.9312744140625, "rewards/margins": 285.4671936035156, "rewards/rejected": -2147.18115234375, "rewards/weighted_accuracies": 0.734375, "rewards/weighted_chosen": -3.5528807640075684, "rewards/weighted_margins": 0.589099109172821, "rewards/weighted_rejected": -4.142333984375, "step": 1260 }, { "epoch": 0.680509042196919, "grad_norm": 74.49163055419922, "learning_rate": 2.8137484376495506e-07, "logits/chosen": -3.12646484375, "logits/rejected": -3.0918946266174316, "logps/chosen": -2617.00634765625, "logps/rejected": -3119.440673828125, "logps/weighted_chosen": -4.663671970367432, "logps/weighted_rejected": -5.35205078125, "loss": 0.5693, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2397.121826171875, "rewards/margins": 509.5687561035156, "rewards/rejected": -2906.49365234375, "rewards/weighted_accuracies": 0.7250000238418579, "rewards/weighted_chosen": -4.127978324890137, "rewards/weighted_margins": 0.684863269329071, "rewards/weighted_rejected": -4.812304496765137, "step": 1270 }, { "epoch": 0.6858673811118553, "grad_norm": 93.80892181396484, "learning_rate": 2.730047501302266e-07, "logits/chosen": -3.093554735183716, "logits/rejected": -3.0672850608825684, "logps/chosen": -2769.96240234375, "logps/rejected": -3371.99365234375, "logps/weighted_chosen": -5.237597465515137, "logps/weighted_rejected": -6.099804878234863, "loss": 0.5304, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2569.33740234375, "rewards/margins": 579.9187622070312, "rewards/rejected": -3150.10009765625, "rewards/weighted_accuracies": 0.737500011920929, "rewards/weighted_chosen": -4.700976371765137, "rewards/weighted_margins": 0.8484131097793579, "rewards/weighted_rejected": -5.551074028015137, "step": 1280 }, { "epoch": 0.6912257200267917, "grad_norm": 67.59648895263672, "learning_rate": 2.647140317800944e-07, "logits/chosen": -3.073046922683716, "logits/rejected": -3.0606446266174316, "logps/chosen": -2785.61865234375, "logps/rejected": -3411.47509765625, "logps/weighted_chosen": -5.348340034484863, "logps/weighted_rejected": -6.2333984375, "loss": 0.5383, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -2587.71875, "rewards/margins": 618.9249877929688, "rewards/rejected": -3206.293701171875, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -4.78515625, "rewards/weighted_margins": 0.86767578125, "rewards/weighted_rejected": -5.651659965515137, "step": 1290 }, { "epoch": 0.696584058941728, "grad_norm": 101.80194854736328, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -3.100390672683716, "logits/rejected": -3.0810546875, "logps/chosen": -2814.846923828125, "logps/rejected": -3689.050048828125, "logps/weighted_chosen": -5.2373046875, "logps/weighted_rejected": -6.142382621765137, "loss": 0.534, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2618.82177734375, "rewards/margins": 850.7999877929688, "rewards/rejected": -3470.30615234375, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -4.687329292297363, "rewards/weighted_margins": 0.886914074420929, "rewards/weighted_rejected": -5.5751953125, "step": 1300 }, { "epoch": 0.7019423978566645, "grad_norm": 54.778236389160156, "learning_rate": 2.4838228849709997e-07, "logits/chosen": -3.131542921066284, "logits/rejected": -3.1089844703674316, "logps/chosen": -2901.106201171875, "logps/rejected": -3760.03759765625, "logps/weighted_chosen": -5.477831840515137, "logps/weighted_rejected": -6.385839939117432, "loss": 0.5305, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2706.14990234375, "rewards/margins": 837.2843627929688, "rewards/rejected": -3543.5625, "rewards/weighted_accuracies": 0.715624988079071, "rewards/weighted_chosen": -4.89501953125, "rewards/weighted_margins": 0.8981689214706421, "rewards/weighted_rejected": -5.79296875, "step": 1310 }, { "epoch": 0.7073007367716008, "grad_norm": 71.93656921386719, "learning_rate": 2.403469744184154e-07, "logits/chosen": -3.1357421875, "logits/rejected": -3.116015672683716, "logps/chosen": -2888.69384765625, "logps/rejected": -3361.96875, "logps/weighted_chosen": -5.206933498382568, "logps/weighted_rejected": -6.030077934265137, "loss": 0.5677, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -2675.22802734375, "rewards/margins": 466.41326904296875, "rewards/rejected": -3142.762451171875, "rewards/weighted_accuracies": 0.706250011920929, "rewards/weighted_chosen": -4.642333984375, "rewards/weighted_margins": 0.812817394733429, "rewards/weighted_rejected": -5.456250190734863, "step": 1320 }, { "epoch": 0.7126590756865372, "grad_norm": 59.53675842285156, "learning_rate": 2.3240245533572167e-07, "logits/chosen": -3.1578125953674316, "logits/rejected": -3.1273436546325684, "logps/chosen": -2736.11865234375, "logps/rejected": -3383.52490234375, "logps/weighted_chosen": -5.336230278015137, "logps/weighted_rejected": -6.180273532867432, "loss": 0.5341, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2544.64990234375, "rewards/margins": 625.8937377929688, "rewards/rejected": -3169.925048828125, "rewards/weighted_accuracies": 0.762499988079071, "rewards/weighted_chosen": -4.8037109375, "rewards/weighted_margins": 0.7956176996231079, "rewards/weighted_rejected": -5.6015625, "step": 1330 }, { "epoch": 0.7180174146014735, "grad_norm": 63.56354522705078, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -3.2085938453674316, "logits/rejected": -3.174121141433716, "logps/chosen": -2826.33740234375, "logps/rejected": -3696.35009765625, "logps/weighted_chosen": -5.438574314117432, "logps/weighted_rejected": -6.519629001617432, "loss": 0.4721, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2625.762451171875, "rewards/margins": 855.0718994140625, "rewards/rejected": -3480.175048828125, "rewards/weighted_accuracies": 0.778124988079071, "rewards/weighted_chosen": -4.86962890625, "rewards/weighted_margins": 1.0476806163787842, "rewards/weighted_rejected": -5.919140815734863, "step": 1340 }, { "epoch": 0.7233757535164099, "grad_norm": 65.85667419433594, "learning_rate": 2.167968815375837e-07, "logits/chosen": -3.3223633766174316, "logits/rejected": -3.31201171875, "logps/chosen": -2891.875, "logps/rejected": -3532.35009765625, "logps/weighted_chosen": -5.588574409484863, "logps/weighted_rejected": -6.560546875, "loss": 0.5111, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -2693.418701171875, "rewards/margins": 642.3468627929688, "rewards/rejected": -3337.02490234375, "rewards/weighted_accuracies": 0.7406250238418579, "rewards/weighted_chosen": -5.040429592132568, "rewards/weighted_margins": 0.9643799066543579, "rewards/weighted_rejected": -6.002245903015137, "step": 1350 }, { "epoch": 0.7287340924313462, "grad_norm": 60.59130096435547, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -3.3700194358825684, "logits/rejected": -3.3509764671325684, "logps/chosen": -2665.44384765625, "logps/rejected": -3530.987548828125, "logps/weighted_chosen": -5.399316310882568, "logps/weighted_rejected": -6.340136528015137, "loss": 0.5659, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2463.637451171875, "rewards/margins": 849.4015502929688, "rewards/rejected": -3314.018798828125, "rewards/weighted_accuracies": 0.7281249761581421, "rewards/weighted_chosen": -4.841796875, "rewards/weighted_margins": 0.9273926019668579, "rewards/weighted_rejected": -5.770410060882568, "step": 1360 }, { "epoch": 0.7340924313462827, "grad_norm": 104.74171447753906, "learning_rate": 2.015873929087482e-07, "logits/chosen": -3.359179735183716, "logits/rejected": -3.3487305641174316, "logps/chosen": -2628.96240234375, "logps/rejected": -3298.78125, "logps/weighted_chosen": -4.932714939117432, "logps/weighted_rejected": -5.869042873382568, "loss": 0.5271, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2423.85009765625, "rewards/margins": 660.5452880859375, "rewards/rejected": -3083.10009765625, "rewards/weighted_accuracies": 0.734375, "rewards/weighted_chosen": -4.387646675109863, "rewards/weighted_margins": 0.9255615472793579, "rewards/weighted_rejected": -5.314843654632568, "step": 1370 }, { "epoch": 0.739450770261219, "grad_norm": 88.1015625, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -3.3573241233825684, "logits/rejected": -3.3525390625, "logps/chosen": -2197.106201171875, "logps/rejected": -2856.64990234375, "logps/weighted_chosen": -4.386181831359863, "logps/weighted_rejected": -5.436425685882568, "loss": 0.486, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1992.543701171875, "rewards/margins": 648.078125, "rewards/rejected": -2640.41259765625, "rewards/weighted_accuracies": 0.762499988079071, "rewards/weighted_chosen": -3.8707518577575684, "rewards/weighted_margins": 1.014892578125, "rewards/weighted_rejected": -4.8857421875, "step": 1380 }, { "epoch": 0.7448091091761554, "grad_norm": 62.56149673461914, "learning_rate": 1.8679526129435358e-07, "logits/chosen": -3.3702149391174316, "logits/rejected": -3.3526368141174316, "logps/chosen": -2864.96875, "logps/rejected": -3427.824951171875, "logps/weighted_chosen": -5.074316501617432, "logps/weighted_rejected": -6.08056640625, "loss": 0.5087, "rewards/accuracies": 0.640625, "rewards/chosen": -2643.831298828125, "rewards/margins": 562.1812744140625, "rewards/rejected": -3208.293701171875, "rewards/weighted_accuracies": 0.762499988079071, "rewards/weighted_chosen": -4.50439453125, "rewards/weighted_margins": 0.972485363483429, "rewards/weighted_rejected": -5.479687690734863, "step": 1390 }, { "epoch": 0.7501674480910918, "grad_norm": 89.40715026855469, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -3.3399415016174316, "logits/rejected": -3.330273389816284, "logps/chosen": -3021.71240234375, "logps/rejected": -3707.03759765625, "logps/weighted_chosen": -5.5947265625, "logps/weighted_rejected": -6.605370998382568, "loss": 0.5289, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -2813.96875, "rewards/margins": 679.5452880859375, "rewards/rejected": -3492.41259765625, "rewards/weighted_accuracies": 0.731249988079071, "rewards/weighted_chosen": -5.048632621765137, "rewards/weighted_margins": 0.9895263910293579, "rewards/weighted_rejected": -6.037695407867432, "step": 1400 }, { "epoch": 0.7555257870060281, "grad_norm": 80.76519012451172, "learning_rate": 1.7244117482801457e-07, "logits/chosen": -3.2992186546325684, "logits/rejected": -3.2899413108825684, "logps/chosen": -3175.44384765625, "logps/rejected": -4062.262451171875, "logps/weighted_chosen": -5.7138671875, "logps/weighted_rejected": -6.551953315734863, "loss": 0.5516, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2974.456298828125, "rewards/margins": 867.3734130859375, "rewards/rejected": -3842.03759765625, "rewards/weighted_accuracies": 0.7093750238418579, "rewards/weighted_chosen": -5.172167778015137, "rewards/weighted_margins": 0.8321533203125, "rewards/weighted_rejected": -6.002831935882568, "step": 1410 }, { "epoch": 0.7608841259209645, "grad_norm": 72.215087890625, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -3.291308641433716, "logits/rejected": -3.2666015625, "logps/chosen": -2498.987548828125, "logps/rejected": -3458.824951171875, "logps/weighted_chosen": -4.884472846984863, "logps/weighted_rejected": -5.822167873382568, "loss": 0.5045, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -2304.59375, "rewards/margins": 943.5999755859375, "rewards/rejected": -3247.59375, "rewards/weighted_accuracies": 0.737500011920929, "rewards/weighted_chosen": -4.341552734375, "rewards/weighted_margins": 0.907763659954071, "rewards/weighted_rejected": -5.25, "step": 1420 }, { "epoch": 0.7662424648359009, "grad_norm": 51.354888916015625, "learning_rate": 1.5854520899759656e-07, "logits/chosen": -3.2826170921325684, "logits/rejected": -3.265820264816284, "logps/chosen": -2747.356201171875, "logps/rejected": -3650.862548828125, "logps/weighted_chosen": -5.053515434265137, "logps/weighted_rejected": -6.081445217132568, "loss": 0.4876, "rewards/accuracies": 0.6875, "rewards/chosen": -2536.012451171875, "rewards/margins": 885.2625122070312, "rewards/rejected": -3421.18115234375, "rewards/weighted_accuracies": 0.768750011920929, "rewards/weighted_chosen": -4.487597465515137, "rewards/weighted_margins": 1.0113036632537842, "rewards/weighted_rejected": -5.4990234375, "step": 1430 }, { "epoch": 0.7716008037508373, "grad_norm": 129.9513397216797, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -3.2759766578674316, "logits/rejected": -3.2642579078674316, "logps/chosen": -2872.762451171875, "logps/rejected": -3759.550048828125, "logps/weighted_chosen": -5.417382717132568, "logps/weighted_rejected": -6.450293064117432, "loss": 0.5024, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -2680.324951171875, "rewards/margins": 868.4000244140625, "rewards/rejected": -3547.94384765625, "rewards/weighted_accuracies": 0.753125011920929, "rewards/weighted_chosen": -4.894238471984863, "rewards/weighted_margins": 1.013818383216858, "rewards/weighted_rejected": -5.909081935882568, "step": 1440 }, { "epoch": 0.7769591426657736, "grad_norm": 65.20556640625, "learning_rate": 1.4512679856783123e-07, "logits/chosen": -3.28271484375, "logits/rejected": -3.2655272483825684, "logps/chosen": -3131.39990234375, "logps/rejected": -3905.887451171875, "logps/weighted_chosen": -5.764355659484863, "logps/weighted_rejected": -6.713574409484863, "loss": 0.5299, "rewards/accuracies": 0.671875, "rewards/chosen": -2915.362548828125, "rewards/margins": 759.78125, "rewards/rejected": -3675.112548828125, "rewards/weighted_accuracies": 0.734375, "rewards/weighted_chosen": -5.2080078125, "rewards/weighted_margins": 0.9625244140625, "rewards/weighted_rejected": -6.171484470367432, "step": 1450 }, { "epoch": 0.78231748158071, "grad_norm": 63.20075988769531, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -3.2359375953674316, "logits/rejected": -3.2249999046325684, "logps/chosen": -2864.375, "logps/rejected": -3565.1875, "logps/weighted_chosen": -5.420361518859863, "logps/weighted_rejected": -6.3603515625, "loss": 0.5327, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2649.88134765625, "rewards/margins": 697.9484252929688, "rewards/rejected": -3349.800048828125, "rewards/weighted_accuracies": 0.7437499761581421, "rewards/weighted_chosen": -4.828271389007568, "rewards/weighted_margins": 0.9333130121231079, "rewards/weighted_rejected": -5.76318359375, "step": 1460 }, { "epoch": 0.7876758204956463, "grad_norm": 66.50836181640625, "learning_rate": 1.3220471039904047e-07, "logits/chosen": -3.240234375, "logits/rejected": -3.222851514816284, "logps/chosen": -2597.887451171875, "logps/rejected": -3301.46240234375, "logps/weighted_chosen": -5.174902439117432, "logps/weighted_rejected": -6.165917873382568, "loss": 0.504, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -2409.58740234375, "rewards/margins": 691.7374877929688, "rewards/rejected": -3101.94384765625, "rewards/weighted_accuracies": 0.746874988079071, "rewards/weighted_chosen": -4.65380859375, "rewards/weighted_margins": 0.966931164264679, "rewards/weighted_rejected": -5.6201171875, "step": 1470 }, { "epoch": 0.7930341594105828, "grad_norm": 73.51492309570312, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -3.24755859375, "logits/rejected": -3.22802734375, "logps/chosen": -2874.012451171875, "logps/rejected": -3520.91259765625, "logps/weighted_chosen": -5.251416206359863, "logps/weighted_rejected": -6.177832126617432, "loss": 0.5262, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2664.25634765625, "rewards/margins": 644.0562744140625, "rewards/rejected": -3306.612548828125, "rewards/weighted_accuracies": 0.731249988079071, "rewards/weighted_chosen": -4.698779106140137, "rewards/weighted_margins": 0.9187988042831421, "rewards/weighted_rejected": -5.615820407867432, "step": 1480 }, { "epoch": 0.7983924983255191, "grad_norm": 44.982112884521484, "learning_rate": 1.1979701719998454e-07, "logits/chosen": -3.227734327316284, "logits/rejected": -3.204296827316284, "logps/chosen": -2508.987548828125, "logps/rejected": -3531.949951171875, "logps/weighted_chosen": -5.145898342132568, "logps/weighted_rejected": -6.172753810882568, "loss": 0.4814, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2321.043701171875, "rewards/margins": 998.4359130859375, "rewards/rejected": -3319.78759765625, "rewards/weighted_accuracies": 0.7593749761581421, "rewards/weighted_chosen": -4.614648342132568, "rewards/weighted_margins": 0.997851550579071, "rewards/weighted_rejected": -5.612011909484863, "step": 1490 }, { "epoch": 0.8037508372404555, "grad_norm": 92.11717987060547, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -3.220019578933716, "logits/rejected": -3.2080078125, "logps/chosen": -2920.581298828125, "logps/rejected": -3633.59375, "logps/weighted_chosen": -5.250292778015137, "logps/weighted_rejected": -6.166601657867432, "loss": 0.5298, "rewards/accuracies": 0.671875, "rewards/chosen": -2697.268798828125, "rewards/margins": 713.0906372070312, "rewards/rejected": -3409.893798828125, "rewards/weighted_accuracies": 0.6968749761581421, "rewards/weighted_chosen": -4.699120998382568, "rewards/weighted_margins": 0.915234386920929, "rewards/weighted_rejected": -5.615527153015137, "step": 1500 }, { "epoch": 0.8037508372404555, "eval_logits/chosen": -3.2061080932617188, "eval_logits/rejected": -3.1859562397003174, "eval_logps/chosen": -2936.50732421875, "eval_logps/rejected": -3657.35107421875, "eval_logps/weighted_chosen": -5.540689468383789, "eval_logps/weighted_rejected": -6.442659854888916, "eval_loss": 0.5436545610427856, "eval_rewards/accuracies": 0.6598798632621765, "eval_rewards/chosen": -2733.882568359375, "eval_rewards/margins": 707.40771484375, "eval_rewards/rejected": -3441.0439453125, "eval_rewards/weighted_accuracies": 0.7239652872085571, "eval_rewards/weighted_chosen": -4.985209941864014, "eval_rewards/weighted_margins": 0.8814277052879333, "eval_rewards/weighted_rejected": -5.866636753082275, "eval_runtime": 1716.6228, "eval_samples_per_second": 1.744, "eval_steps_per_second": 0.436, "step": 1500 } ], "logging_steps": 10, "max_steps": 1867, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }