{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2679169457468185, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005358338914936369, "grad_norm": 31.966846466064453, "learning_rate": 0.0, "logits/chosen": -2.52783203125, "logits/rejected": -2.498046875, "logps/chosen": -277.71875, "logps/rejected": -223.125, "logps/weighted_chosen": -0.533447265625, "logps/weighted_rejected": -0.5252685546875, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "rewards/weighted_accuracies": 0.0, "rewards/weighted_chosen": 0.0, "rewards/weighted_margins": 0.0, "rewards/weighted_rejected": 0.0, "step": 1 }, { "epoch": 0.0053583389149363695, "grad_norm": 31.4305419921875, "learning_rate": 4.81283422459893e-08, "logits/chosen": -2.42626953125, "logits/rejected": -2.3970539569854736, "logps/chosen": -195.6822967529297, "logps/rejected": -195.890625, "logps/weighted_chosen": -0.5433688759803772, "logps/weighted_rejected": -0.5479600429534912, "loss": 0.6923, "rewards/accuracies": 0.2951388955116272, "rewards/chosen": -0.0735677108168602, "rewards/margins": 0.00434027798473835, "rewards/rejected": -0.0779079869389534, "rewards/weighted_accuracies": 0.34375, "rewards/weighted_chosen": 0.00025431314134038985, "rewards/weighted_margins": 0.0005137125845067203, "rewards/weighted_rejected": -0.0002593994140625, "step": 10 }, { "epoch": 0.010716677829872739, "grad_norm": 17.30971908569336, "learning_rate": 1.0160427807486631e-07, "logits/chosen": -2.400830030441284, "logits/rejected": -2.399658203125, "logps/chosen": -199.91366577148438, "logps/rejected": -202.21328735351562, "logps/weighted_chosen": -0.537811279296875, "logps/weighted_rejected": -0.548992931842804, "loss": 0.6926, "rewards/accuracies": 0.359375, "rewards/chosen": -0.04570312425494194, "rewards/margins": 0.02001953125, "rewards/rejected": -0.06572265923023224, "rewards/weighted_accuracies": 0.3656249940395355, "rewards/weighted_chosen": -0.00011291504051769152, "rewards/weighted_margins": 0.00011787414405262098, "rewards/weighted_rejected": -0.00023076534853316844, "step": 20 }, { "epoch": 0.016075016744809108, "grad_norm": 30.744884490966797, "learning_rate": 1.5508021390374333e-07, "logits/chosen": -2.3249268531799316, "logits/rejected": -2.299609422683716, "logps/chosen": -198.44375610351562, "logps/rejected": -212.16641235351562, "logps/weighted_chosen": -0.555804431438446, "logps/weighted_rejected": -0.547528088092804, "loss": 0.6927, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0771484375, "rewards/margins": 0.18007811903953552, "rewards/rejected": -0.10292968899011612, "rewards/weighted_accuracies": 0.3656249940395355, "rewards/weighted_chosen": 0.0004203796270303428, "rewards/weighted_margins": -0.0001464843808207661, "rewards/weighted_rejected": 0.0005667686345987022, "step": 30 }, { "epoch": 0.021433355659745478, "grad_norm": 51.52167510986328, "learning_rate": 2.085561497326203e-07, "logits/chosen": -2.3595948219299316, "logits/rejected": -2.375, "logps/chosen": -192.6171875, "logps/rejected": -207.1945343017578, "logps/weighted_chosen": -0.544903576374054, "logps/weighted_rejected": -0.588610827922821, "loss": 0.6926, "rewards/accuracies": 0.31562501192092896, "rewards/chosen": -0.08066406100988388, "rewards/margins": -0.11210937798023224, "rewards/rejected": 0.03144531324505806, "rewards/weighted_accuracies": 0.40937501192092896, "rewards/weighted_chosen": 0.0021041869185864925, "rewards/weighted_margins": 0.0001922607480082661, "rewards/weighted_rejected": 0.0019119263160973787, "step": 40 }, { "epoch": 0.02679169457468185, "grad_norm": 29.50680160522461, "learning_rate": 2.620320855614973e-07, "logits/chosen": -2.3987059593200684, "logits/rejected": -2.3811278343200684, "logps/chosen": -213.0945281982422, "logps/rejected": -228.8328094482422, "logps/weighted_chosen": -0.5421508550643921, "logps/weighted_rejected": -0.557452380657196, "loss": 0.6925, "rewards/accuracies": 0.390625, "rewards/chosen": 0.06367187201976776, "rewards/margins": 0.06484375149011612, "rewards/rejected": -0.0011718750465661287, "rewards/weighted_accuracies": 0.45625001192092896, "rewards/weighted_chosen": 0.0056938170455396175, "rewards/weighted_margins": 0.000339508056640625, "rewards/weighted_rejected": 0.005351257510483265, "step": 50 }, { "epoch": 0.032150033489618215, "grad_norm": 30.643386840820312, "learning_rate": 3.155080213903743e-07, "logits/chosen": -2.347216844558716, "logits/rejected": -2.3227295875549316, "logps/chosen": -211.7421875, "logps/rejected": -218.0234375, "logps/weighted_chosen": -0.536822497844696, "logps/weighted_rejected": -0.568676769733429, "loss": 0.6923, "rewards/accuracies": 0.421875, "rewards/chosen": 0.09257812798023224, "rewards/margins": 0.10820312798023224, "rewards/rejected": -0.015625, "rewards/weighted_accuracies": 0.45625001192092896, "rewards/weighted_chosen": 0.009661102667450905, "rewards/weighted_margins": 0.000804901123046875, "rewards/weighted_rejected": 0.00885620154440403, "step": 60 }, { "epoch": 0.03750837240455459, "grad_norm": 27.744924545288086, "learning_rate": 3.689839572192513e-07, "logits/chosen": -2.442822217941284, "logits/rejected": -2.4500975608825684, "logps/chosen": -194.47891235351562, "logps/rejected": -200.30624389648438, "logps/weighted_chosen": -0.5030456781387329, "logps/weighted_rejected": -0.5135132074356079, "loss": 0.6906, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 0.03203124925494194, "rewards/margins": 0.20937499403953552, "rewards/rejected": -0.17734375596046448, "rewards/weighted_accuracies": 0.5218750238418579, "rewards/weighted_chosen": 0.01783904992043972, "rewards/weighted_margins": 0.0045639039017260075, "rewards/weighted_rejected": 0.013275146484375, "step": 70 }, { "epoch": 0.042866711319490956, "grad_norm": 38.061683654785156, "learning_rate": 4.2245989304812833e-07, "logits/chosen": -2.403271436691284, "logits/rejected": -2.430908203125, "logps/chosen": -219.08438110351562, "logps/rejected": -239.91561889648438, "logps/weighted_chosen": -0.529132068157196, "logps/weighted_rejected": -0.5560547113418579, "loss": 0.6903, "rewards/accuracies": 0.43437498807907104, "rewards/chosen": 0.29023438692092896, "rewards/margins": 0.27910155057907104, "rewards/rejected": 0.01113281212747097, "rewards/weighted_accuracies": 0.518750011920929, "rewards/weighted_chosen": 0.022306060418486595, "rewards/weighted_margins": 0.005640792660415173, "rewards/weighted_rejected": 0.016665268689393997, "step": 80 }, { "epoch": 0.04822505023442733, "grad_norm": 28.250593185424805, "learning_rate": 4.7593582887700533e-07, "logits/chosen": -2.392260789871216, "logits/rejected": -2.4065184593200684, "logps/chosen": -195.1953125, "logps/rejected": -207.7804718017578, "logps/weighted_chosen": -0.51409912109375, "logps/weighted_rejected": -0.526226818561554, "loss": 0.6906, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.10097656399011612, "rewards/margins": 0.724414050579071, "rewards/rejected": -0.6234375238418579, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": 0.025403594598174095, "rewards/weighted_margins": 0.006229400634765625, "rewards/weighted_rejected": 0.01916809007525444, "step": 90 }, { "epoch": 0.0535833891493637, "grad_norm": 18.92244529724121, "learning_rate": 5.294117647058823e-07, "logits/chosen": -2.3767333030700684, "logits/rejected": -2.3611817359924316, "logps/chosen": -224.2859344482422, "logps/rejected": -229.6570281982422, "logps/weighted_chosen": -0.534497082233429, "logps/weighted_rejected": -0.5674392580986023, "loss": 0.6903, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.8511718511581421, "rewards/margins": 1.123437523841858, "rewards/rejected": -1.974609375, "rewards/weighted_accuracies": 0.53125, "rewards/weighted_chosen": 0.023969268426299095, "rewards/weighted_margins": 0.00682754535228014, "rewards/weighted_rejected": 0.01714172400534153, "step": 100 }, { "epoch": 0.058941728064300064, "grad_norm": 50.693965911865234, "learning_rate": 5.828877005347593e-07, "logits/chosen": -2.3811888694763184, "logits/rejected": -2.382458448410034, "logps/chosen": -217.30703735351562, "logps/rejected": -225.4343719482422, "logps/weighted_chosen": -0.54705810546875, "logps/weighted_rejected": -0.5594238042831421, "loss": 0.6951, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": -2.3193359375, "rewards/margins": 0.624218761920929, "rewards/rejected": -2.942578077316284, "rewards/weighted_accuracies": 0.4781250059604645, "rewards/weighted_chosen": 0.022613525390625, "rewards/weighted_margins": -0.0009094238048419356, "rewards/weighted_rejected": 0.02351989783346653, "step": 110 }, { "epoch": 0.06430006697923643, "grad_norm": 27.526296615600586, "learning_rate": 6.363636363636363e-07, "logits/chosen": -2.459423780441284, "logits/rejected": -2.4720702171325684, "logps/chosen": -187.80313110351562, "logps/rejected": -226.0906219482422, "logps/weighted_chosen": -0.533001720905304, "logps/weighted_rejected": -0.605639636516571, "loss": 0.6903, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -3.5208983421325684, "rewards/margins": 1.0193359851837158, "rewards/rejected": -4.540820121765137, "rewards/weighted_accuracies": 0.5062500238418579, "rewards/weighted_chosen": 0.005961227230727673, "rewards/weighted_margins": 0.009808349423110485, "rewards/weighted_rejected": -0.0038513182662427425, "step": 120 }, { "epoch": 0.06965840589417281, "grad_norm": 26.822662353515625, "learning_rate": 6.898395721925134e-07, "logits/chosen": -2.565673828125, "logits/rejected": -2.564648389816284, "logps/chosen": -231.10000610351562, "logps/rejected": -240.86093139648438, "logps/weighted_chosen": -0.537158191204071, "logps/weighted_rejected": -0.5712524652481079, "loss": 0.6953, "rewards/accuracies": 0.53125, "rewards/chosen": -2.906054735183716, "rewards/margins": 1.6062500476837158, "rewards/rejected": -4.512109279632568, "rewards/weighted_accuracies": 0.49687498807907104, "rewards/weighted_chosen": 0.00572967529296875, "rewards/weighted_margins": -0.00041961669921875, "rewards/weighted_rejected": 0.0061210631392896175, "step": 130 }, { "epoch": 0.07501674480910918, "grad_norm": 19.93072509765625, "learning_rate": 7.433155080213903e-07, "logits/chosen": -2.53515625, "logits/rejected": -2.5311522483825684, "logps/chosen": -198.80313110351562, "logps/rejected": -211.29061889648438, "logps/weighted_chosen": -0.5693206787109375, "logps/weighted_rejected": -0.592010498046875, "loss": 0.694, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.0048828125, "rewards/margins": 2.3238282203674316, "rewards/rejected": -6.327929496765137, "rewards/weighted_accuracies": 0.518750011920929, "rewards/weighted_chosen": -0.017972564324736595, "rewards/weighted_margins": 0.005100250244140625, "rewards/weighted_rejected": -0.02306365966796875, "step": 140 }, { "epoch": 0.08037508372404555, "grad_norm": 26.39690399169922, "learning_rate": 7.967914438502673e-07, "logits/chosen": -2.509326219558716, "logits/rejected": -2.508056640625, "logps/chosen": -214.25, "logps/rejected": -237.84530639648438, "logps/weighted_chosen": -0.5889526605606079, "logps/weighted_rejected": -0.585980236530304, "loss": 0.7007, "rewards/accuracies": 0.546875, "rewards/chosen": -3.867382764816284, "rewards/margins": 2.546581983566284, "rewards/rejected": -6.4140625, "rewards/weighted_accuracies": 0.550000011920929, "rewards/weighted_chosen": -0.041257478296756744, "rewards/weighted_margins": -0.005215453915297985, "rewards/weighted_rejected": -0.03600044175982475, "step": 150 }, { "epoch": 0.08573342263898191, "grad_norm": 25.74460792541504, "learning_rate": 8.502673796791443e-07, "logits/chosen": -2.433154344558716, "logits/rejected": -2.414794921875, "logps/chosen": -200.64999389648438, "logps/rejected": -229.2624969482422, "logps/weighted_chosen": -0.589752197265625, "logps/weighted_rejected": -0.612139880657196, "loss": 0.6933, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": -6.800390720367432, "rewards/margins": 0.5577148199081421, "rewards/rejected": -7.36279296875, "rewards/weighted_accuracies": 0.5218750238418579, "rewards/weighted_chosen": -0.05205688625574112, "rewards/weighted_margins": 0.006626891903579235, "rewards/weighted_rejected": -0.05869255214929581, "step": 160 }, { "epoch": 0.09109176155391828, "grad_norm": 24.530479431152344, "learning_rate": 9.037433155080213e-07, "logits/chosen": -2.5284180641174316, "logits/rejected": -2.5411133766174316, "logps/chosen": -204.92031860351562, "logps/rejected": -217.046875, "logps/weighted_chosen": -0.600720226764679, "logps/weighted_rejected": -0.6278625726699829, "loss": 0.6787, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -7.408789157867432, "rewards/margins": 4.765625, "rewards/rejected": -12.178515434265137, "rewards/weighted_accuracies": 0.612500011920929, "rewards/weighted_chosen": -0.05335540696978569, "rewards/weighted_margins": 0.03420104831457138, "rewards/weighted_rejected": -0.08757324516773224, "step": 170 }, { "epoch": 0.09645010046885466, "grad_norm": 17.329570770263672, "learning_rate": 9.572192513368984e-07, "logits/chosen": -2.638134717941284, "logits/rejected": -2.6302247047424316, "logps/chosen": -219.87655639648438, "logps/rejected": -240.1531219482422, "logps/weighted_chosen": -0.637438952922821, "logps/weighted_rejected": -0.700207531452179, "loss": 0.682, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -10.272656440734863, "rewards/margins": 4.742578029632568, "rewards/rejected": -15.017578125, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -0.09931640326976776, "rewards/weighted_margins": 0.0342864990234375, "rewards/weighted_rejected": -0.13359375298023224, "step": 180 }, { "epoch": 0.10180843938379103, "grad_norm": 40.06588363647461, "learning_rate": 9.999965031204306e-07, "logits/chosen": -2.677978515625, "logits/rejected": -2.667724609375, "logps/chosen": -217.265625, "logps/rejected": -217.9968719482422, "logps/weighted_chosen": -0.713732898235321, "logps/weighted_rejected": -0.745983898639679, "loss": 0.6972, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -16.422557830810547, "rewards/margins": 3.5362305641174316, "rewards/rejected": -19.955078125, "rewards/weighted_accuracies": 0.578125, "rewards/weighted_chosen": -0.1743011474609375, "rewards/weighted_margins": 0.01756439171731472, "rewards/weighted_rejected": -0.19183501601219177, "step": 190 }, { "epoch": 0.1071667782987274, "grad_norm": 26.449254989624023, "learning_rate": 9.998741174712533e-07, "logits/chosen": -2.758007764816284, "logits/rejected": -2.752197265625, "logps/chosen": -251.4640655517578, "logps/rejected": -266.7734375, "logps/weighted_chosen": -0.721697986125946, "logps/weighted_rejected": -0.7427002191543579, "loss": 0.6956, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -23.8759765625, "rewards/margins": 3.2962889671325684, "rewards/rejected": -27.181835174560547, "rewards/weighted_accuracies": 0.5, "rewards/weighted_chosen": -0.14485931396484375, "rewards/weighted_margins": 0.00913848914206028, "rewards/weighted_rejected": -0.153900146484375, "step": 200 }, { "epoch": 0.11252511721366376, "grad_norm": 25.405826568603516, "learning_rate": 9.995769367531952e-07, "logits/chosen": -2.8843750953674316, "logits/rejected": -2.8621582984924316, "logps/chosen": -249.39999389648438, "logps/rejected": -249.5625, "logps/weighted_chosen": -0.6254638433456421, "logps/weighted_rejected": -0.677978515625, "loss": 0.6845, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -31.167186737060547, "rewards/margins": 2.1792969703674316, "rewards/rejected": -33.351173400878906, "rewards/weighted_accuracies": 0.528124988079071, "rewards/weighted_chosen": -0.11136627197265625, "rewards/weighted_margins": 0.02726135216653347, "rewards/weighted_rejected": -0.1386566162109375, "step": 210 }, { "epoch": 0.11788345612860013, "grad_norm": 22.92214012145996, "learning_rate": 9.991050648838675e-07, "logits/chosen": -3.027539014816284, "logits/rejected": -3.0234375, "logps/chosen": -249.0265655517578, "logps/rejected": -264.1703186035156, "logps/weighted_chosen": -0.72125244140625, "logps/weighted_rejected": -0.7577880620956421, "loss": 0.6887, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -44.36796951293945, "rewards/margins": 2.092968702316284, "rewards/rejected": -46.47265625, "rewards/weighted_accuracies": 0.5218750238418579, "rewards/weighted_chosen": -0.15605469048023224, "rewards/weighted_margins": 0.022979736328125, "rewards/weighted_rejected": -0.17911987006664276, "step": 220 }, { "epoch": 0.12324179504353651, "grad_norm": 18.913394927978516, "learning_rate": 9.98458666866564e-07, "logits/chosen": -2.95361328125, "logits/rejected": -2.9599609375, "logps/chosen": -252.1765594482422, "logps/rejected": -273.97186279296875, "logps/weighted_chosen": -0.7530273199081421, "logps/weighted_rejected": -0.7951415777206421, "loss": 0.6805, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -55.478126525878906, "rewards/margins": 9.685155868530273, "rewards/rejected": -65.15702819824219, "rewards/weighted_accuracies": 0.546875, "rewards/weighted_chosen": -0.202239990234375, "rewards/weighted_margins": 0.04668273776769638, "rewards/weighted_rejected": -0.248951718211174, "step": 230 }, { "epoch": 0.12860013395847286, "grad_norm": 18.727752685546875, "learning_rate": 9.97637968732563e-07, "logits/chosen": -2.9483885765075684, "logits/rejected": -2.934033155441284, "logps/chosen": -264.1937561035156, "logps/rejected": -281.26873779296875, "logps/weighted_chosen": -0.7586425542831421, "logps/weighted_rejected": -0.777172863483429, "loss": 0.7011, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -60.15625, "rewards/margins": 7.514062404632568, "rewards/rejected": -67.6640625, "rewards/weighted_accuracies": 0.528124988079071, "rewards/weighted_chosen": -0.22589111328125, "rewards/weighted_margins": 0.0014251709217205644, "rewards/weighted_rejected": -0.22734984755516052, "step": 240 }, { "epoch": 0.13395847287340926, "grad_norm": 17.17424774169922, "learning_rate": 9.966432574620906e-07, "logits/chosen": -2.994140625, "logits/rejected": -2.995654344558716, "logps/chosen": -250.4296875, "logps/rejected": -282.0859375, "logps/weighted_chosen": -0.707995593547821, "logps/weighted_rejected": -0.76885986328125, "loss": 0.6897, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -55.573829650878906, "rewards/margins": 7.142187595367432, "rewards/rejected": -62.73515701293945, "rewards/weighted_accuracies": 0.528124988079071, "rewards/weighted_chosen": -0.18481139838695526, "rewards/weighted_margins": 0.01998596265912056, "rewards/weighted_rejected": -0.20475539565086365, "step": 250 }, { "epoch": 0.13931681178834562, "grad_norm": 18.692733764648438, "learning_rate": 9.954748808839674e-07, "logits/chosen": -2.887255907058716, "logits/rejected": -2.898193359375, "logps/chosen": -280.46875, "logps/rejected": -280.6640625, "logps/weighted_chosen": -0.7401489019393921, "logps/weighted_rejected": -0.776611328125, "loss": 0.6869, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -53.8828125, "rewards/margins": 5.946484565734863, "rewards/rejected": -59.81757736206055, "rewards/weighted_accuracies": 0.53125, "rewards/weighted_chosen": -0.18103942275047302, "rewards/weighted_margins": 0.02960510179400444, "rewards/weighted_rejected": -0.21054688096046448, "step": 260 }, { "epoch": 0.144675150703282, "grad_norm": 16.58523941040039, "learning_rate": 9.941332475539824e-07, "logits/chosen": -2.8628907203674316, "logits/rejected": -2.87939453125, "logps/chosen": -267.26251220703125, "logps/rejected": -306.49688720703125, "logps/weighted_chosen": -0.7835937738418579, "logps/weighted_rejected": -0.8957885503768921, "loss": 0.6712, "rewards/accuracies": 0.625, "rewards/chosen": -57.08281326293945, "rewards/margins": 12.824609756469727, "rewards/rejected": -69.8984375, "rewards/weighted_accuracies": 0.596875011920929, "rewards/weighted_chosen": -0.2109939604997635, "rewards/weighted_margins": 0.07429657131433487, "rewards/weighted_rejected": -0.2853530943393707, "step": 270 }, { "epoch": 0.15003348961821836, "grad_norm": 23.606122970581055, "learning_rate": 9.926188266120295e-07, "logits/chosen": -2.9261717796325684, "logits/rejected": -2.9302000999450684, "logps/chosen": -252.35781860351562, "logps/rejected": -270.3921813964844, "logps/weighted_chosen": -0.757946789264679, "logps/weighted_rejected": -0.8466796875, "loss": 0.6669, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -56.718360900878906, "rewards/margins": 10.275781631469727, "rewards/rejected": -66.990234375, "rewards/weighted_accuracies": 0.606249988079071, "rewards/weighted_chosen": -0.2161209136247635, "rewards/weighted_margins": 0.08074493706226349, "rewards/weighted_rejected": -0.2967941164970398, "step": 280 }, { "epoch": 0.15539182853315472, "grad_norm": 25.32463836669922, "learning_rate": 9.909321476180591e-07, "logits/chosen": -2.9480957984924316, "logits/rejected": -2.9312500953674316, "logps/chosen": -281.33123779296875, "logps/rejected": -317.3374938964844, "logps/weighted_chosen": -0.870861828327179, "logps/weighted_rejected": -0.9639892578125, "loss": 0.6868, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -78.2476577758789, "rewards/margins": 12.921093940734863, "rewards/rejected": -91.15312194824219, "rewards/weighted_accuracies": 0.534375011920929, "rewards/weighted_chosen": -0.32458800077438354, "rewards/weighted_margins": 0.04551086574792862, "rewards/weighted_rejected": -0.37019044160842896, "step": 290 }, { "epoch": 0.1607501674480911, "grad_norm": 21.94011116027832, "learning_rate": 9.890738003669027e-07, "logits/chosen": -3.082812547683716, "logits/rejected": -3.089111328125, "logps/chosen": -288.62811279296875, "logps/rejected": -311.38751220703125, "logps/weighted_chosen": -0.955078125, "logps/weighted_rejected": -1.0396239757537842, "loss": 0.687, "rewards/accuracies": 0.5625, "rewards/chosen": -93.6109390258789, "rewards/margins": 9.185546875, "rewards/rejected": -102.78202819824219, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.40625, "rewards/weighted_margins": 0.07210388034582138, "rewards/weighted_rejected": -0.478240966796875, "step": 300 }, { "epoch": 0.16610850636302746, "grad_norm": 25.38623046875, "learning_rate": 9.870444346820348e-07, "logits/chosen": -3.173828125, "logits/rejected": -3.147998094558716, "logps/chosen": -297.15936279296875, "logps/rejected": -320.50311279296875, "logps/weighted_chosen": -0.89190673828125, "logps/weighted_rejected": -0.923266589641571, "loss": 0.6987, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -83.2046890258789, "rewards/margins": 12.489062309265137, "rewards/rejected": -95.6937484741211, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -0.33665162324905396, "rewards/weighted_margins": 0.00975952111184597, "rewards/weighted_rejected": -0.34637755155563354, "step": 310 }, { "epoch": 0.17146684527796383, "grad_norm": 17.789676666259766, "learning_rate": 9.848447601883433e-07, "logits/chosen": -3.236083984375, "logits/rejected": -3.2168946266174316, "logps/chosen": -267.09844970703125, "logps/rejected": -310.3031311035156, "logps/weighted_chosen": -0.8338623046875, "logps/weighted_rejected": -0.918957531452179, "loss": 0.6805, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -70.2816390991211, "rewards/margins": 19.578907012939453, "rewards/rejected": -89.81953430175781, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": -0.2871032655239105, "rewards/weighted_margins": 0.06427612155675888, "rewards/weighted_rejected": -0.3513946533203125, "step": 320 }, { "epoch": 0.1768251841929002, "grad_norm": 24.225061416625977, "learning_rate": 9.824755460639899e-07, "logits/chosen": -3.1796875, "logits/rejected": -3.179150342941284, "logps/chosen": -265.12030029296875, "logps/rejected": -308.19842529296875, "logps/weighted_chosen": -0.833850085735321, "logps/weighted_rejected": -0.8948730230331421, "loss": 0.6836, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -73.27030944824219, "rewards/margins": 16.651952743530273, "rewards/rejected": -89.87968444824219, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.31196290254592896, "rewards/weighted_margins": 0.04750366136431694, "rewards/weighted_rejected": -0.35954588651657104, "step": 330 }, { "epoch": 0.18218352310783656, "grad_norm": 19.119462966918945, "learning_rate": 9.799376207714444e-07, "logits/chosen": -3.1602540016174316, "logits/rejected": -3.154492139816284, "logps/chosen": -277.9624938964844, "logps/rejected": -296.7093811035156, "logps/weighted_chosen": -0.902087390422821, "logps/weighted_rejected": -0.974658191204071, "loss": 0.6727, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": -77.0367202758789, "rewards/margins": 9.926562309265137, "rewards/rejected": -86.9476547241211, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.334219366312027, "rewards/weighted_margins": 0.06939239799976349, "rewards/weighted_rejected": -0.4036361575126648, "step": 340 }, { "epoch": 0.18754186202277295, "grad_norm": 22.155990600585938, "learning_rate": 9.772318717677903e-07, "logits/chosen": -3.130908250808716, "logits/rejected": -3.132128953933716, "logps/chosen": -288.41094970703125, "logps/rejected": -313.0843811035156, "logps/weighted_chosen": -0.9267333745956421, "logps/weighted_rejected": -1.040771484375, "loss": 0.6683, "rewards/accuracies": 0.625, "rewards/chosen": -88.24609375, "rewards/margins": 18.800390243530273, "rewards/rejected": -107.046875, "rewards/weighted_accuracies": 0.6156250238418579, "rewards/weighted_chosen": -0.39622801542282104, "rewards/weighted_margins": 0.08176574856042862, "rewards/weighted_rejected": -0.47796326875686646, "step": 350 }, { "epoch": 0.19290020093770932, "grad_norm": 26.421598434448242, "learning_rate": 9.743592451943998e-07, "logits/chosen": -3.229687452316284, "logits/rejected": -3.228222608566284, "logps/chosen": -327.7328186035156, "logps/rejected": -361.0015563964844, "logps/weighted_chosen": -0.990966796875, "logps/weighted_rejected": -1.057519555091858, "loss": 0.6726, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -116.12812805175781, "rewards/margins": 18.506641387939453, "rewards/rejected": -134.654296875, "rewards/weighted_accuracies": 0.543749988079071, "rewards/weighted_chosen": -0.4190429747104645, "rewards/weighted_margins": 0.08066101372241974, "rewards/weighted_rejected": -0.499612420797348, "step": 360 }, { "epoch": 0.1982585398526457, "grad_norm": 19.50035858154297, "learning_rate": 9.713207455460892e-07, "logits/chosen": -3.184375047683716, "logits/rejected": -3.209912061691284, "logps/chosen": -326.4937438964844, "logps/rejected": -343.4593811035156, "logps/weighted_chosen": -1.00506591796875, "logps/weighted_rejected": -1.092797875404358, "loss": 0.6762, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -120.6937484741211, "rewards/margins": 6.901953220367432, "rewards/rejected": -127.60469055175781, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.4098266661167145, "rewards/weighted_margins": 0.08511047065258026, "rewards/weighted_rejected": -0.495004266500473, "step": 370 }, { "epoch": 0.20361687876758205, "grad_norm": 17.131114959716797, "learning_rate": 9.681174353198686e-07, "logits/chosen": -3.2510743141174316, "logits/rejected": -3.23828125, "logps/chosen": -330.140625, "logps/rejected": -368.10467529296875, "logps/weighted_chosen": -0.9594482183456421, "logps/weighted_rejected": -1.056616187095642, "loss": 0.6719, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -119.54219055175781, "rewards/margins": 20.350391387939453, "rewards/rejected": -139.82461547851562, "rewards/weighted_accuracies": 0.574999988079071, "rewards/weighted_chosen": -0.415090948343277, "rewards/weighted_margins": 0.086700439453125, "rewards/weighted_rejected": -0.501788318157196, "step": 380 }, { "epoch": 0.20897521768251842, "grad_norm": 38.318756103515625, "learning_rate": 9.647504346434103e-07, "logits/chosen": -3.445019483566284, "logits/rejected": -3.4564452171325684, "logps/chosen": -371.9156188964844, "logps/rejected": -389.43438720703125, "logps/weighted_chosen": -1.062597632408142, "logps/weighted_rejected": -1.165747046470642, "loss": 0.6656, "rewards/accuracies": 0.578125, "rewards/chosen": -150.7859344482422, "rewards/margins": 19.3828125, "rewards/rejected": -170.19686889648438, "rewards/weighted_accuracies": 0.578125, "rewards/weighted_chosen": -0.49369508028030396, "rewards/weighted_margins": 0.09916381537914276, "rewards/weighted_rejected": -0.5928589105606079, "step": 390 }, { "epoch": 0.2143335565974548, "grad_norm": 37.75257873535156, "learning_rate": 9.612209208833646e-07, "logits/chosen": -3.4891600608825684, "logits/rejected": -3.4913086891174316, "logps/chosen": -373.2046813964844, "logps/rejected": -387.3968811035156, "logps/weighted_chosen": -1.159387230873108, "logps/weighted_rejected": -1.2549560070037842, "loss": 0.6802, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -173.24063110351562, "rewards/margins": 12.66796875, "rewards/rejected": -185.9562530517578, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.619030773639679, "rewards/weighted_margins": 0.07501526176929474, "rewards/weighted_rejected": -0.694287121295929, "step": 400 }, { "epoch": 0.21969189551239116, "grad_norm": 62.04045486450195, "learning_rate": 9.5753012823366e-07, "logits/chosen": -3.5625, "logits/rejected": -3.555224657058716, "logps/chosen": -392.5843811035156, "logps/rejected": -429.76873779296875, "logps/weighted_chosen": -1.109643578529358, "logps/weighted_rejected": -1.2004883289337158, "loss": 0.6903, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -180.3562469482422, "rewards/margins": 26.413280487060547, "rewards/rejected": -206.75936889648438, "rewards/weighted_accuracies": 0.5375000238418579, "rewards/weighted_chosen": -0.5794036984443665, "rewards/weighted_margins": 0.08088378608226776, "rewards/weighted_rejected": -0.660137951374054, "step": 410 }, { "epoch": 0.22505023442732752, "grad_norm": 29.432497024536133, "learning_rate": 9.536793472839324e-07, "logits/chosen": -3.7582030296325684, "logits/rejected": -3.77783203125, "logps/chosen": -413.109375, "logps/rejected": -459.015625, "logps/weighted_chosen": -1.0903198719024658, "logps/weighted_rejected": -1.1765625476837158, "loss": 0.6833, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -182.9140625, "rewards/margins": 33.59453201293945, "rewards/rejected": -216.5343780517578, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": -0.528045654296875, "rewards/weighted_margins": 0.08157958835363388, "rewards/weighted_rejected": -0.609661877155304, "step": 420 }, { "epoch": 0.2304085733422639, "grad_norm": 34.61969757080078, "learning_rate": 9.496699245682351e-07, "logits/chosen": -3.876757860183716, "logits/rejected": -3.873339891433716, "logps/chosen": -411.23748779296875, "logps/rejected": -474.3687438964844, "logps/weighted_chosen": -1.104589819908142, "logps/weighted_rejected": -1.18817138671875, "loss": 0.6878, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -190.3468780517578, "rewards/margins": 43.072265625, "rewards/rejected": -233.43124389648438, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.543225109577179, "rewards/weighted_margins": 0.05898132175207138, "rewards/weighted_rejected": -0.6022491455078125, "step": 430 }, { "epoch": 0.23576691225720026, "grad_norm": 25.689876556396484, "learning_rate": 9.455032620941839e-07, "logits/chosen": -3.893261671066284, "logits/rejected": -3.866406202316284, "logps/chosen": -390.3531188964844, "logps/rejected": -474.80780029296875, "logps/weighted_chosen": -1.0908081531524658, "logps/weighted_rejected": -1.19012451171875, "loss": 0.6673, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -196.52188110351562, "rewards/margins": 52.310546875, "rewards/rejected": -248.7218780517578, "rewards/weighted_accuracies": 0.5718749761581421, "rewards/weighted_chosen": -0.555419921875, "rewards/weighted_margins": 0.09650878608226776, "rewards/weighted_rejected": -0.651898205280304, "step": 440 }, { "epoch": 0.24112525117213665, "grad_norm": 54.604705810546875, "learning_rate": 9.411808168527066e-07, "logits/chosen": -3.88916015625, "logits/rejected": -3.8941407203674316, "logps/chosen": -411.3031311035156, "logps/rejected": -472.203125, "logps/weighted_chosen": -1.082067847251892, "logps/weighted_rejected": -1.1839110851287842, "loss": 0.6816, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -205.703125, "rewards/margins": 46.81132888793945, "rewards/rejected": -252.5343780517578, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": -0.554046630859375, "rewards/weighted_margins": 0.07362671196460724, "rewards/weighted_rejected": -0.6277831792831421, "step": 450 }, { "epoch": 0.24648359008707302, "grad_norm": 26.44339942932129, "learning_rate": 9.367041003085648e-07, "logits/chosen": -3.8785157203674316, "logits/rejected": -3.866894483566284, "logps/chosen": -523.2593994140625, "logps/rejected": -523.2750244140625, "logps/weighted_chosen": -1.2444579601287842, "logps/weighted_rejected": -1.3485107421875, "loss": 0.6775, "rewards/accuracies": 0.5625, "rewards/chosen": -304.52813720703125, "rewards/margins": 4.764062404632568, "rewards/rejected": -309.3812561035156, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": -0.6716858148574829, "rewards/weighted_margins": 0.08544921875, "rewards/weighted_rejected": -0.757153332233429, "step": 460 }, { "epoch": 0.25184192900200936, "grad_norm": 28.781457901000977, "learning_rate": 9.320746778718274e-07, "logits/chosen": -4.042578220367432, "logits/rejected": -4.053515434265137, "logps/chosen": -444.046875, "logps/rejected": -514.0531005859375, "logps/weighted_chosen": -1.281274437904358, "logps/weighted_rejected": -1.4283936023712158, "loss": 0.6526, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -256.4828186035156, "rewards/margins": 53.646095275878906, "rewards/rejected": -310.15313720703125, "rewards/weighted_accuracies": 0.596875011920929, "rewards/weighted_chosen": -0.714306652545929, "rewards/weighted_margins": 0.13612060248851776, "rewards/weighted_rejected": -0.8504638671875, "step": 470 }, { "epoch": 0.2572002679169457, "grad_norm": 58.31265640258789, "learning_rate": 9.272941683504808e-07, "logits/chosen": -4.060449123382568, "logits/rejected": -4.083691596984863, "logps/chosen": -564.7312622070312, "logps/rejected": -582.9124755859375, "logps/weighted_chosen": -1.305688500404358, "logps/weighted_rejected": -1.470678687095642, "loss": 0.6627, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -346.97344970703125, "rewards/margins": 15.486719131469727, "rewards/rejected": -362.28594970703125, "rewards/weighted_accuracies": 0.596875011920929, "rewards/weighted_chosen": -0.7694152593612671, "rewards/weighted_margins": 0.14488831162452698, "rewards/weighted_rejected": -0.9143127202987671, "step": 480 }, { "epoch": 0.2625586068318821, "grad_norm": 28.284282684326172, "learning_rate": 9.223642433843679e-07, "logits/chosen": -3.8697266578674316, "logits/rejected": -3.857128858566284, "logps/chosen": -500.0718688964844, "logps/rejected": -534.6281127929688, "logps/weighted_chosen": -1.1939697265625, "logps/weighted_rejected": -1.323388695716858, "loss": 0.6567, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -298.09063720703125, "rewards/margins": 29.775781631469727, "rewards/rejected": -328.0640563964844, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": -0.631103515625, "rewards/weighted_margins": 0.13239136338233948, "rewards/weighted_rejected": -0.763378918170929, "step": 490 }, { "epoch": 0.2679169457468185, "grad_norm": 32.619327545166016, "learning_rate": 9.172866268606513e-07, "logits/chosen": -3.828320264816284, "logits/rejected": -3.82373046875, "logps/chosen": -527.2687377929688, "logps/rejected": -588.8343505859375, "logps/weighted_chosen": -1.291723608970642, "logps/weighted_rejected": -1.4711182117462158, "loss": 0.6559, "rewards/accuracies": 0.625, "rewards/chosen": -312.36248779296875, "rewards/margins": 48.47734451293945, "rewards/rejected": -360.8890686035156, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": -0.727709949016571, "rewards/weighted_margins": 0.145904541015625, "rewards/weighted_rejected": -0.8731445074081421, "step": 500 }, { "epoch": 0.2679169457468185, "eval_logits/chosen": -3.994471788406372, "eval_logits/rejected": -4.005757808685303, "eval_logps/chosen": -530.9920043945312, "eval_logps/rejected": -592.9425659179688, "eval_logps/weighted_chosen": -1.3731945753097534, "eval_logps/weighted_rejected": -1.5277278423309326, "eval_loss": 0.6660141348838806, "eval_rewards/accuracies": 0.5874499082565308, "eval_rewards/chosen": -328.0513916015625, "eval_rewards/margins": 48.42765426635742, "eval_rewards/rejected": -376.4005432128906, "eval_rewards/weighted_accuracies": 0.5901201367378235, "eval_rewards/weighted_chosen": -0.8177139759063721, "eval_rewards/weighted_margins": 0.13399113714694977, "eval_rewards/weighted_rejected": -0.9517050385475159, "eval_runtime": 1889.7494, "eval_samples_per_second": 1.584, "eval_steps_per_second": 0.396, "step": 500 } ], "logging_steps": 10, "max_steps": 1867, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }