{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0029411764705882353, "grad_norm": 2.3687267303466797, "kl/avg_steps": 0.0, "kl/beta": 0.009999999776482582, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 0.0, "logits/chosen": -0.5232092142105103, "logits/rejected": -0.36964714527130127, "logps/chosen": -69.28079223632812, "logps/ref_chosen": -69.2831802368164, "logps/ref_rejected": -69.74366760253906, "logps/rejected": -69.7318344116211, "loss": 0.6932, "rewards/accuracies": 0.515625, "rewards/chosen": 9.683193638920784e-06, "rewards/margins": -0.0001216536620631814, "rewards/rejected": 0.00013133684115018696, "step": 1 }, { "epoch": 0.014705882352941176, "grad_norm": 2.401517868041992, "kl/avg_steps": 0.001953125, "kl/beta": 0.009997854940593243, "kl/n_epsilon_steps": 0.498046875, "kl/p_epsilon_steps": 0.5, "learning_rate": 5.88235294117647e-08, "logits/chosen": -0.5336302518844604, "logits/rejected": -0.41014784574508667, "logps/chosen": -75.71084594726562, "logps/ref_chosen": -75.70054626464844, "logps/ref_rejected": -81.47293090820312, "logps/rejected": -81.47822570800781, "loss": 0.6932, "rewards/accuracies": 0.505859375, "rewards/chosen": -0.00011636512499535456, "rewards/margins": -7.826486398698762e-05, "rewards/rejected": -3.8100268284324557e-05, "step": 5 }, { "epoch": 0.029411764705882353, "grad_norm": 2.312957525253296, "kl/avg_steps": -0.05312500149011612, "kl/beta": 0.010005339980125427, "kl/n_epsilon_steps": 0.5234375, "kl/p_epsilon_steps": 0.4703125059604645, "learning_rate": 1.3235294117647057e-07, "logits/chosen": -0.5401719808578491, "logits/rejected": -0.4321846067905426, "logps/chosen": -77.008544921875, "logps/ref_chosen": -77.0025405883789, "logps/ref_rejected": -82.64138793945312, "logps/rejected": -82.64922332763672, "loss": 0.6932, "rewards/accuracies": 0.4765625, "rewards/chosen": -7.36937508918345e-05, "rewards/margins": -9.958527698472608e-06, "rewards/rejected": -6.373519863700494e-05, "step": 10 }, { "epoch": 0.04411764705882353, "grad_norm": 2.890460968017578, "kl/avg_steps": 0.12968750298023224, "kl/beta": 0.010008977726101875, "kl/n_epsilon_steps": 0.43437498807907104, "kl/p_epsilon_steps": 0.5640624761581421, "learning_rate": 2.0588235294117645e-07, "logits/chosen": -0.5125764608383179, "logits/rejected": -0.4432317316532135, "logps/chosen": -70.82783508300781, "logps/ref_chosen": -70.83788299560547, "logps/ref_rejected": -87.43305206298828, "logps/rejected": -87.48735809326172, "loss": 0.6928, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": 8.707816596142948e-05, "rewards/margins": 0.0006134368595667183, "rewards/rejected": -0.0005263587227091193, "step": 15 }, { "epoch": 0.058823529411764705, "grad_norm": 2.124864101409912, "kl/avg_steps": 0.265625, "kl/beta": 0.009920386597514153, "kl/n_epsilon_steps": 0.3656249940395355, "kl/p_epsilon_steps": 0.6312500238418579, "learning_rate": 2.7941176470588235e-07, "logits/chosen": -0.5464522242546082, "logits/rejected": -0.4405369162559509, "logps/chosen": -70.1437759399414, "logps/ref_chosen": -70.1697006225586, "logps/ref_rejected": -82.27420806884766, "logps/rejected": -82.44139099121094, "loss": 0.6922, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00024056310940068215, "rewards/margins": 0.0018755672499537468, "rewards/rejected": -0.0016350041842088103, "step": 20 }, { "epoch": 0.07352941176470588, "grad_norm": 2.521970510482788, "kl/avg_steps": 0.543749988079071, "kl/beta": 0.009726567193865776, "kl/n_epsilon_steps": 0.22812500596046448, "kl/p_epsilon_steps": 0.7718750238418579, "learning_rate": 3.529411764705882e-07, "logits/chosen": -0.568504273891449, "logits/rejected": -0.4329379200935364, "logps/chosen": -74.4179458618164, "logps/ref_chosen": -74.5040283203125, "logps/ref_rejected": -89.5297622680664, "logps/rejected": -90.02223205566406, "loss": 0.6904, "rewards/accuracies": 0.7796875238418579, "rewards/chosen": 0.0008119211415760219, "rewards/margins": 0.005549114663153887, "rewards/rejected": -0.00473719323053956, "step": 25 }, { "epoch": 0.08823529411764706, "grad_norm": 2.3963418006896973, "kl/avg_steps": 0.567187488079071, "kl/beta": 0.00945484172552824, "kl/n_epsilon_steps": 0.21562500298023224, "kl/p_epsilon_steps": 0.7828124761581421, "learning_rate": 4.264705882352941e-07, "logits/chosen": -0.6653466820716858, "logits/rejected": -0.49282917380332947, "logps/chosen": -76.55107879638672, "logps/ref_chosen": -76.60227966308594, "logps/ref_rejected": -82.36322784423828, "logps/rejected": -83.71476745605469, "loss": 0.6867, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0004493276646826416, "rewards/margins": 0.01309473067522049, "rewards/rejected": -0.012645403854548931, "step": 30 }, { "epoch": 0.10294117647058823, "grad_norm": 2.311098337173462, "kl/avg_steps": 0.5546875, "kl/beta": 0.009198471903800964, "kl/n_epsilon_steps": 0.22187499701976776, "kl/p_epsilon_steps": 0.776562511920929, "learning_rate": 5e-07, "logits/chosen": -0.6610927581787109, "logits/rejected": -0.5268033146858215, "logps/chosen": -76.14710998535156, "logps/ref_chosen": -75.79379272460938, "logps/ref_rejected": -83.69039154052734, "logps/rejected": -86.21320343017578, "loss": 0.6835, "rewards/accuracies": 0.809374988079071, "rewards/chosen": -0.003281622426584363, "rewards/margins": 0.019690800458192825, "rewards/rejected": -0.02297242358326912, "step": 35 }, { "epoch": 0.11764705882352941, "grad_norm": 2.681466817855835, "kl/avg_steps": 0.4078125059604645, "kl/beta": 0.008961381390690804, "kl/n_epsilon_steps": 0.2953124940395355, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.996706849759452e-07, "logits/chosen": -0.8570469617843628, "logits/rejected": -0.7218376398086548, "logps/chosen": -77.57659149169922, "logps/ref_chosen": -75.21812438964844, "logps/ref_rejected": -86.6792984008789, "logps/rejected": -93.75047302246094, "loss": 0.6732, "rewards/accuracies": 0.78125, "rewards/chosen": -0.021187324076890945, "rewards/margins": 0.04168969392776489, "rewards/rejected": -0.06287702172994614, "step": 40 }, { "epoch": 0.1323529411764706, "grad_norm": 3.2158050537109375, "kl/avg_steps": 0.3187499940395355, "kl/beta": 0.008803511038422585, "kl/n_epsilon_steps": 0.34062498807907104, "kl/p_epsilon_steps": 0.659375011920929, "learning_rate": 4.986836074908615e-07, "logits/chosen": -0.9623914957046509, "logits/rejected": -0.8303581476211548, "logps/chosen": -82.83192443847656, "logps/ref_chosen": -77.2712173461914, "logps/ref_rejected": -91.67030334472656, "logps/rejected": -102.5731201171875, "loss": 0.6715, "rewards/accuracies": 0.7515624761581421, "rewards/chosen": -0.049075882881879807, "rewards/margins": 0.04632042720913887, "rewards/rejected": -0.09539631009101868, "step": 45 }, { "epoch": 0.14705882352941177, "grad_norm": 3.3687705993652344, "kl/avg_steps": 0.20624999701976776, "kl/beta": 0.008690183982253075, "kl/n_epsilon_steps": 0.3968749940395355, "kl/p_epsilon_steps": 0.6031249761581421, "learning_rate": 4.970413680203148e-07, "logits/chosen": -1.0613696575164795, "logits/rejected": -0.921216607093811, "logps/chosen": -82.58769226074219, "logps/ref_chosen": -73.91633605957031, "logps/ref_rejected": -79.92402648925781, "logps/rejected": -94.2342758178711, "loss": 0.6712, "rewards/accuracies": 0.698437511920929, "rewards/chosen": -0.07562652230262756, "rewards/margins": 0.048155996948480606, "rewards/rejected": -0.12378251552581787, "step": 50 }, { "epoch": 0.16176470588235295, "grad_norm": 4.448400020599365, "kl/avg_steps": 0.2109375, "kl/beta": 0.00860314816236496, "kl/n_epsilon_steps": 0.39375001192092896, "kl/p_epsilon_steps": 0.604687511920929, "learning_rate": 4.947482930773511e-07, "logits/chosen": -1.1757243871688843, "logits/rejected": -1.0121644735336304, "logps/chosen": -91.91180419921875, "logps/ref_chosen": -79.74378204345703, "logps/ref_rejected": -83.18132019042969, "logps/rejected": -103.12516021728516, "loss": 0.6639, "rewards/accuracies": 0.6859375238418579, "rewards/chosen": -0.10501708835363388, "rewards/margins": 0.06571148335933685, "rewards/rejected": -0.17072856426239014, "step": 55 }, { "epoch": 0.17647058823529413, "grad_norm": 3.918736219406128, "kl/avg_steps": 0.171875, "kl/beta": 0.008520014584064484, "kl/n_epsilon_steps": 0.4140625, "kl/p_epsilon_steps": 0.5859375, "learning_rate": 4.918104238142103e-07, "logits/chosen": -1.2249476909637451, "logits/rejected": -1.1036134958267212, "logps/chosen": -98.29524993896484, "logps/ref_chosen": -81.61141967773438, "logps/ref_rejected": -80.947998046875, "logps/rejected": -105.27479553222656, "loss": 0.6663, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.14256855845451355, "rewards/margins": 0.06371048837900162, "rewards/rejected": -0.20627903938293457, "step": 60 }, { "epoch": 0.19117647058823528, "grad_norm": 3.5865535736083984, "kl/avg_steps": 0.27656251192092896, "kl/beta": 0.00841777864843607, "kl/n_epsilon_steps": 0.3609375059604645, "kl/p_epsilon_steps": 0.637499988079071, "learning_rate": 4.882355001067891e-07, "logits/chosen": -1.2322965860366821, "logits/rejected": -1.151759147644043, "logps/chosen": -91.71420288085938, "logps/ref_chosen": -75.09439849853516, "logps/ref_rejected": -87.96830749511719, "logps/rejected": -117.06733703613281, "loss": 0.6479, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14002391695976257, "rewards/margins": 0.10341048240661621, "rewards/rejected": -0.24343439936637878, "step": 65 }, { "epoch": 0.20588235294117646, "grad_norm": 3.871297836303711, "kl/avg_steps": 0.28437501192092896, "kl/beta": 0.008305966854095459, "kl/n_epsilon_steps": 0.3578124940395355, "kl/p_epsilon_steps": 0.6421874761581421, "learning_rate": 4.840329401637809e-07, "logits/chosen": -1.2796670198440552, "logits/rejected": -1.1985622644424438, "logps/chosen": -89.69293975830078, "logps/ref_chosen": -70.07804870605469, "logps/ref_rejected": -88.98612976074219, "logps/rejected": -122.0387954711914, "loss": 0.6462, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.16316808760166168, "rewards/margins": 0.10986582934856415, "rewards/rejected": -0.27303391695022583, "step": 70 }, { "epoch": 0.22058823529411764, "grad_norm": 3.9685800075531006, "kl/avg_steps": 0.1875, "kl/beta": 0.008209030143916607, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.792138157142157e-07, "logits/chosen": -1.2629064321517944, "logits/rejected": -1.1684788465499878, "logps/chosen": -101.08387756347656, "logps/ref_chosen": -77.74958801269531, "logps/ref_rejected": -82.17206573486328, "logps/rejected": -117.42021179199219, "loss": 0.6538, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.191951185464859, "rewards/margins": 0.09596569836139679, "rewards/rejected": -0.2879168391227722, "step": 75 }, { "epoch": 0.23529411764705882, "grad_norm": 4.582348823547363, "kl/avg_steps": 0.24375000596046448, "kl/beta": 0.008118118159472942, "kl/n_epsilon_steps": 0.37812501192092896, "kl/p_epsilon_steps": 0.621874988079071, "learning_rate": 4.737908228387656e-07, "logits/chosen": -1.2720203399658203, "logits/rejected": -1.218477725982666, "logps/chosen": -107.53079986572266, "logps/ref_chosen": -81.88478088378906, "logps/ref_rejected": -90.519775390625, "logps/rejected": -131.16079711914062, "loss": 0.6438, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20838662981987, "rewards/margins": 0.11963216215372086, "rewards/rejected": -0.32801881432533264, "step": 80 }, { "epoch": 0.25, "grad_norm": 3.6524829864501953, "kl/avg_steps": 0.2515625059604645, "kl/beta": 0.0080325398594141, "kl/n_epsilon_steps": 0.37187498807907104, "kl/p_epsilon_steps": 0.6234375238418579, "learning_rate": 4.6777824852166437e-07, "logits/chosen": -1.2834303379058838, "logits/rejected": -1.198880672454834, "logps/chosen": -95.5977554321289, "logps/ref_chosen": -70.41683197021484, "logps/ref_rejected": -78.02936553955078, "logps/rejected": -118.98405456542969, "loss": 0.6418, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.20263484120368958, "rewards/margins": 0.12456460297107697, "rewards/rejected": -0.32719942927360535, "step": 85 }, { "epoch": 0.2647058823529412, "grad_norm": 4.22735071182251, "kl/avg_steps": 0.2874999940395355, "kl/beta": 0.007919726893305779, "kl/n_epsilon_steps": 0.35468751192092896, "kl/p_epsilon_steps": 0.6421874761581421, "learning_rate": 4.611919330113591e-07, "logits/chosen": -1.2632228136062622, "logits/rejected": -1.2163931131362915, "logps/chosen": -105.8456039428711, "logps/ref_chosen": -76.6160888671875, "logps/ref_rejected": -89.49937438964844, "logps/rejected": -136.4986572265625, "loss": 0.6361, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.23172405362129211, "rewards/margins": 0.13836422562599182, "rewards/rejected": -0.37008827924728394, "step": 90 }, { "epoch": 0.27941176470588236, "grad_norm": 4.236695766448975, "kl/avg_steps": 0.2750000059604645, "kl/beta": 0.0078009068965911865, "kl/n_epsilon_steps": 0.36250001192092896, "kl/p_epsilon_steps": 0.637499988079071, "learning_rate": 4.5404922808905543e-07, "logits/chosen": -1.2625572681427002, "logits/rejected": -1.2011988162994385, "logps/chosen": -104.29510498046875, "logps/ref_chosen": -73.50260162353516, "logps/ref_rejected": -76.48811340332031, "logps/rejected": -124.16410827636719, "loss": 0.6411, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": -0.24040882289409637, "rewards/margins": 0.1294616460800171, "rewards/rejected": -0.36987045407295227, "step": 95 }, { "epoch": 0.29411764705882354, "grad_norm": 4.193100452423096, "kl/avg_steps": 0.3656249940395355, "kl/beta": 0.0076876478269696236, "kl/n_epsilon_steps": 0.31718748807907104, "kl/p_epsilon_steps": 0.682812511920929, "learning_rate": 4.4636895135509966e-07, "logits/chosen": -1.2317556142807007, "logits/rejected": -1.1946831941604614, "logps/chosen": -103.88249206542969, "logps/ref_chosen": -72.6116714477539, "logps/ref_rejected": -81.16241455078125, "logps/rejected": -134.57403564453125, "loss": 0.6236, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.24038386344909668, "rewards/margins": 0.16778317093849182, "rewards/rejected": -0.4081670641899109, "step": 100 }, { "epoch": 0.29411764705882354, "eval_kl/n_epsilon_steps": 0.4236111044883728, "eval_kl/p_epsilon_steps": 0.5759548544883728, "eval_logits/chosen": -1.2261141538619995, "eval_logits/rejected": -1.1807267665863037, "eval_logps/chosen": -127.64717864990234, "eval_logps/ref_chosen": -87.82356262207031, "eval_logps/ref_rejected": -82.81887817382812, "eval_logps/rejected": -134.3017578125, "eval_loss": 0.6636335253715515, "eval_rewards/accuracies": 0.6124131679534912, "eval_rewards/chosen": -0.30329596996307373, "eval_rewards/margins": 0.08622786402702332, "eval_rewards/rejected": -0.38952386379241943, "eval_runtime": 22.4366, "eval_samples_per_second": 104.249, "eval_steps_per_second": 0.847, "step": 100 }, { "epoch": 0.3088235294117647, "grad_norm": 4.206778049468994, "kl/avg_steps": 0.28125, "kl/beta": 0.007563448045402765, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.381713366536311e-07, "logits/chosen": -1.2459900379180908, "logits/rejected": -1.1858142614364624, "logps/chosen": -112.22574615478516, "logps/ref_chosen": -76.5867919921875, "logps/ref_rejected": -84.33440399169922, "logps/rejected": -140.7528533935547, "loss": 0.6304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2697572112083435, "rewards/margins": 0.15455064177513123, "rewards/rejected": -0.42430782318115234, "step": 105 }, { "epoch": 0.3235294117647059, "grad_norm": 5.154345989227295, "kl/avg_steps": 0.28437501192092896, "kl/beta": 0.007447557989507914, "kl/n_epsilon_steps": 0.3578124940395355, "kl/p_epsilon_steps": 0.6421874761581421, "learning_rate": 4.2947798076611047e-07, "logits/chosen": -1.2248286008834839, "logits/rejected": -1.1694958209991455, "logps/chosen": -118.81462097167969, "logps/ref_chosen": -78.16385650634766, "logps/ref_rejected": -83.61200714111328, "logps/rejected": -146.4515838623047, "loss": 0.6294, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.3029385209083557, "rewards/margins": 0.16256316006183624, "rewards/rejected": -0.46550169587135315, "step": 110 }, { "epoch": 0.3382352941176471, "grad_norm": 5.148464679718018, "kl/avg_steps": 0.35468751192092896, "kl/beta": 0.007336863782256842, "kl/n_epsilon_steps": 0.3218750059604645, "kl/p_epsilon_steps": 0.676562488079071, "learning_rate": 4.203117865141635e-07, "logits/chosen": -1.2170436382293701, "logits/rejected": -1.1504008769989014, "logps/chosen": -118.66552734375, "logps/ref_chosen": -74.8998031616211, "logps/ref_rejected": -85.2784652709961, "logps/rejected": -156.08580017089844, "loss": 0.618, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.3210408687591553, "rewards/margins": 0.19527961313724518, "rewards/rejected": -0.516320526599884, "step": 115 }, { "epoch": 0.35294117647058826, "grad_norm": 5.226547718048096, "kl/avg_steps": 0.265625, "kl/beta": 0.007222268730401993, "kl/n_epsilon_steps": 0.3671875, "kl/p_epsilon_steps": 0.6328125, "learning_rate": 4.106969024216348e-07, "logits/chosen": -1.1989049911499023, "logits/rejected": -1.1565752029418945, "logps/chosen": -119.46983337402344, "logps/ref_chosen": -73.58607482910156, "logps/ref_rejected": -85.84365844726562, "logps/rejected": -158.82113647460938, "loss": 0.6197, "rewards/accuracies": 0.698437511920929, "rewards/chosen": -0.3315422534942627, "rewards/margins": 0.19263319671154022, "rewards/rejected": -0.5241755247116089, "step": 120 }, { "epoch": 0.36764705882352944, "grad_norm": 5.764974594116211, "kl/avg_steps": 0.30937498807907104, "kl/beta": 0.007117821369320154, "kl/n_epsilon_steps": 0.3453125059604645, "kl/p_epsilon_steps": 0.6546875238418579, "learning_rate": 4.006586590948141e-07, "logits/chosen": -1.1903568506240845, "logits/rejected": -1.130084753036499, "logps/chosen": -130.13233947753906, "logps/ref_chosen": -80.25770568847656, "logps/ref_rejected": -81.34100341796875, "logps/rejected": -161.29537963867188, "loss": 0.6139, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.3550013303756714, "rewards/margins": 0.21088404953479767, "rewards/rejected": -0.5658854246139526, "step": 125 }, { "epoch": 0.38235294117647056, "grad_norm": 5.23793363571167, "kl/avg_steps": 0.2718749940395355, "kl/beta": 0.007017888128757477, "kl/n_epsilon_steps": 0.36406248807907104, "kl/p_epsilon_steps": 0.635937511920929, "learning_rate": 3.9022350248844246e-07, "logits/chosen": -1.1651326417922974, "logits/rejected": -1.1263306140899658, "logps/chosen": -128.90423583984375, "logps/ref_chosen": -74.67902374267578, "logps/ref_rejected": -84.1854019165039, "logps/rejected": -167.0582733154297, "loss": 0.6209, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3804508149623871, "rewards/margins": 0.19761842489242554, "rewards/rejected": -0.5780693292617798, "step": 130 }, { "epoch": 0.39705882352941174, "grad_norm": 5.766234874725342, "kl/avg_steps": 0.32499998807907104, "kl/beta": 0.006909938994795084, "kl/n_epsilon_steps": 0.3375000059604645, "kl/p_epsilon_steps": 0.6625000238418579, "learning_rate": 3.794189242333106e-07, "logits/chosen": -1.1625608205795288, "logits/rejected": -1.0963513851165771, "logps/chosen": -138.46322631835938, "logps/ref_chosen": -81.2975845336914, "logps/ref_rejected": -87.74832916259766, "logps/rejected": -174.3780059814453, "loss": 0.6207, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.39489540457725525, "rewards/margins": 0.20013752579689026, "rewards/rejected": -0.5950329303741455, "step": 135 }, { "epoch": 0.4117647058823529, "grad_norm": 4.888461112976074, "kl/avg_steps": 0.3828125, "kl/beta": 0.006796327419579029, "kl/n_epsilon_steps": 0.30781251192092896, "kl/p_epsilon_steps": 0.690625011920929, "learning_rate": 3.6827338920900253e-07, "logits/chosen": -1.172863483428955, "logits/rejected": -1.106768012046814, "logps/chosen": -121.53498840332031, "logps/ref_chosen": -71.20382690429688, "logps/ref_rejected": -84.62137603759766, "logps/rejected": -170.3011474609375, "loss": 0.6009, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.3417351245880127, "rewards/margins": 0.2368461638689041, "rewards/rejected": -0.5785812139511108, "step": 140 }, { "epoch": 0.4264705882352941, "grad_norm": 5.264912128448486, "kl/avg_steps": 0.390625, "kl/beta": 0.0066697075963020325, "kl/n_epsilon_steps": 0.3046875, "kl/p_epsilon_steps": 0.6953125, "learning_rate": 3.568162605525952e-07, "logits/chosen": -1.1562573909759521, "logits/rejected": -1.0977518558502197, "logps/chosen": -132.38858032226562, "logps/ref_chosen": -78.03334045410156, "logps/ref_rejected": -86.95343017578125, "logps/rejected": -178.3859405517578, "loss": 0.5992, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.3619559407234192, "rewards/margins": 0.2438311129808426, "rewards/rejected": -0.6057869791984558, "step": 145 }, { "epoch": 0.4411764705882353, "grad_norm": 5.746659278869629, "kl/avg_steps": 0.328125, "kl/beta": 0.00654013454914093, "kl/n_epsilon_steps": 0.3359375, "kl/p_epsilon_steps": 0.6640625, "learning_rate": 3.4507772230088147e-07, "logits/chosen": -1.0893394947052002, "logits/rejected": -1.0510880947113037, "logps/chosen": -136.75088500976562, "logps/ref_chosen": -73.69932556152344, "logps/ref_rejected": -86.18521118164062, "logps/rejected": -184.1068878173828, "loss": 0.614, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": -0.4121219515800476, "rewards/margins": 0.22426274418830872, "rewards/rejected": -0.6363847255706787, "step": 150 }, { "epoch": 0.45588235294117646, "grad_norm": 5.235478401184082, "kl/avg_steps": 0.34375, "kl/beta": 0.006440295372158289, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.3308869986991487e-07, "logits/chosen": -1.0995477437973022, "logits/rejected": -1.031232237815857, "logps/chosen": -144.14666748046875, "logps/ref_chosen": -78.81468963623047, "logps/ref_rejected": -82.33976745605469, "logps/rejected": -183.3446502685547, "loss": 0.6118, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4207354485988617, "rewards/margins": 0.22600603103637695, "rewards/rejected": -0.646741509437561, "step": 155 }, { "epoch": 0.47058823529411764, "grad_norm": 5.473912239074707, "kl/avg_steps": 0.3812499940395355, "kl/beta": 0.0063315341249108315, "kl/n_epsilon_steps": 0.30937498807907104, "kl/p_epsilon_steps": 0.690625011920929, "learning_rate": 3.208807785813777e-07, "logits/chosen": -1.080108880996704, "logits/rejected": -1.0139106512069702, "logps/chosen": -132.09349060058594, "logps/ref_chosen": -71.280517578125, "logps/ref_rejected": -86.39788818359375, "logps/rejected": -188.9801788330078, "loss": 0.5951, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": -0.3846417963504791, "rewards/margins": 0.26059722900390625, "rewards/rejected": -0.645238995552063, "step": 160 }, { "epoch": 0.4852941176470588, "grad_norm": 5.492692947387695, "kl/avg_steps": 0.33125001192092896, "kl/beta": 0.006211251951754093, "kl/n_epsilon_steps": 0.3343749940395355, "kl/p_epsilon_steps": 0.6656249761581421, "learning_rate": 3.084861204504122e-07, "logits/chosen": -1.064668893814087, "logits/rejected": -0.9995222091674805, "logps/chosen": -148.7730255126953, "logps/ref_chosen": -79.35147094726562, "logps/ref_rejected": -83.44163513183594, "logps/rejected": -191.25628662109375, "loss": 0.608, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.430931031703949, "rewards/margins": 0.23459258675575256, "rewards/rejected": -0.6655236482620239, "step": 165 }, { "epoch": 0.5, "grad_norm": 5.870633602142334, "kl/avg_steps": 0.3499999940395355, "kl/beta": 0.006105704233050346, "kl/n_epsilon_steps": 0.32343751192092896, "kl/p_epsilon_steps": 0.6734374761581421, "learning_rate": 2.959373794541426e-07, "logits/chosen": -1.0475225448608398, "logits/rejected": -1.006306529045105, "logps/chosen": -147.1262664794922, "logps/ref_chosen": -75.01612854003906, "logps/ref_rejected": -86.07945251464844, "logps/rejected": -199.21173095703125, "loss": 0.6032, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4399870038032532, "rewards/margins": 0.2465648353099823, "rewards/rejected": -0.6865519285202026, "step": 170 }, { "epoch": 0.5147058823529411, "grad_norm": 5.422708988189697, "kl/avg_steps": 0.41093748807907104, "kl/beta": 0.0059935590252280235, "kl/n_epsilon_steps": 0.29374998807907104, "kl/p_epsilon_steps": 0.7046874761581421, "learning_rate": 2.8326761550411346e-07, "logits/chosen": -1.037719488143921, "logits/rejected": -0.9720247387886047, "logps/chosen": -149.66494750976562, "logps/ref_chosen": -75.85931396484375, "logps/ref_rejected": -88.4763412475586, "logps/rejected": -206.0808563232422, "loss": 0.5969, "rewards/accuracies": 0.729687511920929, "rewards/chosen": -0.4419892430305481, "rewards/margins": 0.2586033344268799, "rewards/rejected": -0.7005925178527832, "step": 175 }, { "epoch": 0.5294117647058824, "grad_norm": 5.140402793884277, "kl/avg_steps": 0.30937498807907104, "kl/beta": 0.005884683690965176, "kl/n_epsilon_steps": 0.3453125059604645, "kl/p_epsilon_steps": 0.6546875238418579, "learning_rate": 2.7051020734928443e-07, "logits/chosen": -1.0452353954315186, "logits/rejected": -0.968549370765686, "logps/chosen": -143.4625701904297, "logps/ref_chosen": -74.5296859741211, "logps/ref_rejected": -78.44059753417969, "logps/rejected": -188.22622680664062, "loss": 0.6093, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.4056355059146881, "rewards/margins": 0.23657508194446564, "rewards/rejected": -0.6422106027603149, "step": 180 }, { "epoch": 0.5441176470588235, "grad_norm": 5.03032112121582, "kl/avg_steps": 0.3984375, "kl/beta": 0.005778872407972813, "kl/n_epsilon_steps": 0.30000001192092896, "kl/p_epsilon_steps": 0.698437511920929, "learning_rate": 2.5769876463904263e-07, "logits/chosen": -1.0298566818237305, "logits/rejected": -0.9755008816719055, "logps/chosen": -137.92031860351562, "logps/ref_chosen": -70.28861999511719, "logps/ref_rejected": -85.20851135253906, "logps/rejected": -197.1123809814453, "loss": 0.5968, "rewards/accuracies": 0.7328125238418579, "rewards/chosen": -0.3904454708099365, "rewards/margins": 0.25234130024909973, "rewards/rejected": -0.6427868008613586, "step": 185 }, { "epoch": 0.5588235294117647, "grad_norm": 6.057910919189453, "kl/avg_steps": 0.359375, "kl/beta": 0.005678877234458923, "kl/n_epsilon_steps": 0.3203125, "kl/p_epsilon_steps": 0.6796875, "learning_rate": 2.4486703937790243e-07, "logits/chosen": -1.0044220685958862, "logits/rejected": -0.9527886509895325, "logps/chosen": -151.2794952392578, "logps/ref_chosen": -75.0217514038086, "logps/ref_rejected": -90.4836654663086, "logps/rejected": -214.67868041992188, "loss": 0.5951, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.43261224031448364, "rewards/margins": 0.2681336998939514, "rewards/rejected": -0.7007459402084351, "step": 190 }, { "epoch": 0.5735294117647058, "grad_norm": 5.573934555053711, "kl/avg_steps": 0.3499999940395355, "kl/beta": 0.005573070142418146, "kl/n_epsilon_steps": 0.32499998807907104, "kl/p_epsilon_steps": 0.675000011920929, "learning_rate": 2.320488370051681e-07, "logits/chosen": -0.989575207233429, "logits/rejected": -0.9092248678207397, "logps/chosen": -154.51953125, "logps/ref_chosen": -73.42979431152344, "logps/ref_rejected": -84.43408203125, "logps/rejected": -211.646240234375, "loss": 0.6019, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.4517548084259033, "rewards/margins": 0.25306665897369385, "rewards/rejected": -0.7048214673995972, "step": 195 }, { "epoch": 0.5882352941176471, "grad_norm": 5.598110198974609, "kl/avg_steps": 0.35468751192092896, "kl/beta": 0.005477838683873415, "kl/n_epsilon_steps": 0.3218750059604645, "kl/p_epsilon_steps": 0.676562488079071, "learning_rate": 2.192779273338215e-07, "logits/chosen": -0.9810283780097961, "logits/rejected": -0.8921745419502258, "logps/chosen": -159.2919464111328, "logps/ref_chosen": -77.8104019165039, "logps/ref_rejected": -86.66553497314453, "logps/rejected": -219.63626098632812, "loss": 0.5934, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.4459984302520752, "rewards/margins": 0.27772727608680725, "rewards/rejected": -0.7237256765365601, "step": 200 }, { "epoch": 0.5882352941176471, "eval_kl/n_epsilon_steps": 0.3958333432674408, "eval_kl/p_epsilon_steps": 0.6037326455116272, "eval_logits/chosen": -0.9817464351654053, "eval_logits/rejected": -0.8951107859611511, "eval_logps/chosen": -182.90843200683594, "eval_logps/ref_chosen": -87.82356262207031, "eval_logps/ref_rejected": -82.81887817382812, "eval_logps/rejected": -209.30340576171875, "eval_loss": 0.6429124474525452, "eval_rewards/accuracies": 0.6323784589767456, "eval_rewards/chosen": -0.5156466960906982, "eval_rewards/margins": 0.1663396805524826, "eval_rewards/rejected": -0.6819863319396973, "eval_runtime": 22.339, "eval_samples_per_second": 104.705, "eval_steps_per_second": 0.851, "step": 200 }, { "epoch": 0.6029411764705882, "grad_norm": 5.395305156707764, "kl/avg_steps": 0.31562501192092896, "kl/beta": 0.005382629111409187, "kl/n_epsilon_steps": 0.3421874940395355, "kl/p_epsilon_steps": 0.6578124761581421, "learning_rate": 2.065879555832674e-07, "logits/chosen": -0.9339988827705383, "logits/rejected": -0.8349924087524414, "logps/chosen": -150.00833129882812, "logps/ref_chosen": -71.83072662353516, "logps/ref_rejected": -78.26126861572266, "logps/rejected": -207.3239288330078, "loss": 0.5976, "rewards/accuracies": 0.7015625238418579, "rewards/chosen": -0.42064207792282104, "rewards/margins": 0.26989927887916565, "rewards/rejected": -0.6905413866043091, "step": 205 }, { "epoch": 0.6176470588235294, "grad_norm": 8.636336326599121, "kl/avg_steps": 0.3343749940395355, "kl/beta": 0.005294554866850376, "kl/n_epsilon_steps": 0.33281248807907104, "kl/p_epsilon_steps": 0.667187511920929, "learning_rate": 1.9401235374032425e-07, "logits/chosen": -0.940881073474884, "logits/rejected": -0.835827648639679, "logps/chosen": -169.9760284423828, "logps/ref_chosen": -81.13362121582031, "logps/ref_rejected": -83.91246032714844, "logps/rejected": -226.44479370117188, "loss": 0.5961, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4700423777103424, "rewards/margins": 0.28001874685287476, "rewards/rejected": -0.7500611543655396, "step": 210 }, { "epoch": 0.6323529411764706, "grad_norm": 5.697958946228027, "kl/avg_steps": 0.37968748807907104, "kl/beta": 0.005207170732319355, "kl/n_epsilon_steps": 0.30937498807907104, "kl/p_epsilon_steps": 0.6890624761581421, "learning_rate": 1.8158425248197928e-07, "logits/chosen": -0.9595499038696289, "logits/rejected": -0.8254610300064087, "logps/chosen": -168.97909545898438, "logps/ref_chosen": -79.5214614868164, "logps/ref_rejected": -83.58778381347656, "logps/rejected": -225.5334014892578, "loss": 0.5994, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4653104245662689, "rewards/margins": 0.2690308690071106, "rewards/rejected": -0.7343412637710571, "step": 215 }, { "epoch": 0.6470588235294118, "grad_norm": 5.304469108581543, "kl/avg_steps": 0.3343749940395355, "kl/beta": 0.005111886188387871, "kl/n_epsilon_steps": 0.33281248807907104, "kl/p_epsilon_steps": 0.667187511920929, "learning_rate": 1.6933639389195134e-07, "logits/chosen": -0.9539089202880859, "logits/rejected": -0.8665965795516968, "logps/chosen": -166.537353515625, "logps/ref_chosen": -81.25938415527344, "logps/ref_rejected": -83.04185485839844, "logps/rejected": -215.3668670654297, "loss": 0.6056, "rewards/accuracies": 0.723437488079071, "rewards/chosen": -0.43559327721595764, "rewards/margins": 0.2370072603225708, "rewards/rejected": -0.6726005673408508, "step": 220 }, { "epoch": 0.6617647058823529, "grad_norm": 5.622444152832031, "kl/avg_steps": 0.4468750059604645, "kl/beta": 0.005018714815378189, "kl/n_epsilon_steps": 0.27656251192092896, "kl/p_epsilon_steps": 0.723437488079071, "learning_rate": 1.573010452010098e-07, "logits/chosen": -0.9484726190567017, "logits/rejected": -0.8518384695053101, "logps/chosen": -162.01535034179688, "logps/ref_chosen": -77.427001953125, "logps/ref_rejected": -89.23592376708984, "logps/rejected": -233.6844024658203, "loss": 0.5839, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4237908720970154, "rewards/margins": 0.2962331175804138, "rewards/rejected": -0.7200239896774292, "step": 225 }, { "epoch": 0.6764705882352942, "grad_norm": 5.60673189163208, "kl/avg_steps": 0.48124998807907104, "kl/beta": 0.004900630097836256, "kl/n_epsilon_steps": 0.2593750059604645, "kl/p_epsilon_steps": 0.7406250238418579, "learning_rate": 1.4550991377830423e-07, "logits/chosen": -0.9383388757705688, "logits/rejected": -0.856258749961853, "logps/chosen": -156.29066467285156, "logps/ref_chosen": -70.1819839477539, "logps/ref_rejected": -87.79248046875, "logps/rejected": -232.82858276367188, "loss": 0.5866, "rewards/accuracies": 0.7671874761581421, "rewards/chosen": -0.42099839448928833, "rewards/margins": 0.2847110629081726, "rewards/rejected": -0.7057094573974609, "step": 230 }, { "epoch": 0.6911764705882353, "grad_norm": 5.7163004875183105, "kl/avg_steps": 0.4828124940395355, "kl/beta": 0.004785512108355761, "kl/n_epsilon_steps": 0.2578125, "kl/p_epsilon_steps": 0.7406250238418579, "learning_rate": 1.339940635976592e-07, "logits/chosen": -0.8863986134529114, "logits/rejected": -0.8059118390083313, "logps/chosen": -174.5547637939453, "logps/ref_chosen": -77.51251220703125, "logps/ref_rejected": -89.81958770751953, "logps/rejected": -251.2958526611328, "loss": 0.583, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.4634285569190979, "rewards/margins": 0.30392760038375854, "rewards/rejected": -0.7673560976982117, "step": 235 }, { "epoch": 0.7058823529411765, "grad_norm": 6.860780715942383, "kl/avg_steps": 0.36250001192092896, "kl/beta": 0.004683743230998516, "kl/n_epsilon_steps": 0.3187499940395355, "kl/p_epsilon_steps": 0.6812499761581421, "learning_rate": 1.227838333989088e-07, "logits/chosen": -0.8450605273246765, "logits/rejected": -0.7272099256515503, "logps/chosen": -177.47744750976562, "logps/ref_chosen": -74.5803451538086, "logps/ref_rejected": -81.81297302246094, "logps/rejected": -243.76736450195312, "loss": 0.5968, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.4815599322319031, "rewards/margins": 0.27244722843170166, "rewards/rejected": -0.7540072202682495, "step": 240 }, { "epoch": 0.7205882352941176, "grad_norm": 5.382650852203369, "kl/avg_steps": 0.4000000059604645, "kl/beta": 0.004598929081112146, "kl/n_epsilon_steps": 0.30000001192092896, "kl/p_epsilon_steps": 0.699999988079071, "learning_rate": 1.1190875675987355e-07, "logits/chosen": -0.8378638029098511, "logits/rejected": -0.7307332158088684, "logps/chosen": -178.53826904296875, "logps/ref_chosen": -76.56635284423828, "logps/ref_rejected": -86.859130859375, "logps/rejected": -255.5751495361328, "loss": 0.5827, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.46838441491127014, "rewards/margins": 0.30266332626342773, "rewards/rejected": -0.7710477113723755, "step": 245 }, { "epoch": 0.7352941176470589, "grad_norm": 5.63203763961792, "kl/avg_steps": 0.3499999940395355, "kl/beta": 0.00451111001893878, "kl/n_epsilon_steps": 0.32499998807907104, "kl/p_epsilon_steps": 0.675000011920929, "learning_rate": 1.0139748428955333e-07, "logits/chosen": -0.8333392143249512, "logits/rejected": -0.7355720400810242, "logps/chosen": -183.86294555664062, "logps/ref_chosen": -77.37183380126953, "logps/ref_rejected": -79.96475219726562, "logps/rejected": -237.01980590820312, "loss": 0.6155, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.4800783693790436, "rewards/margins": 0.22398455440998077, "rewards/rejected": -0.7040629982948303, "step": 250 }, { "epoch": 0.75, "grad_norm": 5.822533130645752, "kl/avg_steps": 0.35624998807907104, "kl/beta": 0.004430105909705162, "kl/n_epsilon_steps": 0.3218750059604645, "kl/p_epsilon_steps": 0.6781250238418579, "learning_rate": 9.127770814751932e-08, "logits/chosen": -0.8416454195976257, "logits/rejected": -0.7227948904037476, "logps/chosen": -184.06822204589844, "logps/ref_chosen": -79.62632751464844, "logps/ref_rejected": -83.8196792602539, "logps/rejected": -246.44998168945312, "loss": 0.6013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.46239757537841797, "rewards/margins": 0.25374993681907654, "rewards/rejected": -0.7161475419998169, "step": 255 }, { "epoch": 0.7647058823529411, "grad_norm": 5.885540008544922, "kl/avg_steps": 0.3687500059604645, "kl/beta": 0.004350547678768635, "kl/n_epsilon_steps": 0.31562501192092896, "kl/p_epsilon_steps": 0.684374988079071, "learning_rate": 8.15760890883607e-08, "logits/chosen": -0.8616160154342651, "logits/rejected": -0.7643041610717773, "logps/chosen": -184.8510284423828, "logps/ref_chosen": -80.03411865234375, "logps/ref_rejected": -85.39453125, "logps/rejected": -246.5211639404297, "loss": 0.6056, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4556017816066742, "rewards/margins": 0.2411097288131714, "rewards/rejected": -0.6967115998268127, "step": 260 }, { "epoch": 0.7794117647058824, "grad_norm": 5.461711883544922, "kl/avg_steps": 0.3531250059604645, "kl/beta": 0.0042734695598483086, "kl/n_epsilon_steps": 0.32343751192092896, "kl/p_epsilon_steps": 0.676562488079071, "learning_rate": 7.231818622338822e-08, "logits/chosen": -0.8387966156005859, "logits/rejected": -0.7239198088645935, "logps/chosen": -178.0113067626953, "logps/ref_chosen": -76.63539123535156, "logps/ref_rejected": -79.94613647460938, "logps/rejected": -238.1660614013672, "loss": 0.603, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.432711660861969, "rewards/margins": 0.2392859160900116, "rewards/rejected": -0.6719975471496582, "step": 265 }, { "epoch": 0.7941176470588235, "grad_norm": 5.6931915283203125, "kl/avg_steps": 0.3968749940395355, "kl/beta": 0.004198429174721241, "kl/n_epsilon_steps": 0.30156248807907104, "kl/p_epsilon_steps": 0.698437511920929, "learning_rate": 6.352838968463919e-08, "logits/chosen": -0.8596851229667664, "logits/rejected": -0.7193423509597778, "logps/chosen": -173.6400604248047, "logps/ref_chosen": -76.02762603759766, "logps/ref_rejected": -80.83404541015625, "logps/rejected": -236.97607421875, "loss": 0.6021, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.4092663824558258, "rewards/margins": 0.24205096065998077, "rewards/rejected": -0.6513173580169678, "step": 270 }, { "epoch": 0.8088235294117647, "grad_norm": 5.091865062713623, "kl/avg_steps": 0.40625, "kl/beta": 0.004111775197088718, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 5.5229856368582376e-08, "logits/chosen": -0.8502656817436218, "logits/rejected": -0.7681766748428345, "logps/chosen": -180.9755859375, "logps/ref_chosen": -77.58733367919922, "logps/ref_rejected": -88.50263214111328, "logps/rejected": -254.04690551757812, "loss": 0.5997, "rewards/accuracies": 0.739062488079071, "rewards/chosen": -0.4245019555091858, "rewards/margins": 0.25200843811035156, "rewards/rejected": -0.6765104532241821, "step": 275 }, { "epoch": 0.8235294117647058, "grad_norm": 5.737886905670166, "kl/avg_steps": 0.42500001192092896, "kl/beta": 0.004024769179522991, "kl/n_epsilon_steps": 0.2874999940395355, "kl/p_epsilon_steps": 0.7124999761581421, "learning_rate": 4.7444448928806615e-08, "logits/chosen": -0.876409649848938, "logits/rejected": -0.7702105641365051, "logps/chosen": -186.74009704589844, "logps/ref_chosen": -81.46415710449219, "logps/ref_rejected": -94.69911193847656, "logps/rejected": -265.301025390625, "loss": 0.5958, "rewards/accuracies": 0.729687511920929, "rewards/chosen": -0.4230107367038727, "rewards/margins": 0.2593509256839752, "rewards/rejected": -0.6823617219924927, "step": 280 }, { "epoch": 0.8382352941176471, "grad_norm": 5.05964469909668, "kl/avg_steps": 0.4000000059604645, "kl/beta": 0.003945710603147745, "kl/n_epsilon_steps": 0.30000001192092896, "kl/p_epsilon_steps": 0.699999988079071, "learning_rate": 4.019267817841834e-08, "logits/chosen": -0.8164280652999878, "logits/rejected": -0.7374383211135864, "logps/chosen": -183.66696166992188, "logps/ref_chosen": -77.9266128540039, "logps/ref_rejected": -85.77226257324219, "logps/rejected": -251.9569854736328, "loss": 0.6036, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.41665568947792053, "rewards/margins": 0.23489956557750702, "rewards/rejected": -0.6515552997589111, "step": 285 }, { "epoch": 0.8529411764705882, "grad_norm": 4.995400428771973, "kl/avg_steps": 0.37187498807907104, "kl/beta": 0.003868584055453539, "kl/n_epsilon_steps": 0.3140625059604645, "kl/p_epsilon_steps": 0.6859375238418579, "learning_rate": 3.349364905389032e-08, "logits/chosen": -0.788993775844574, "logits/rejected": -0.7104808688163757, "logps/chosen": -178.77645874023438, "logps/ref_chosen": -72.49942016601562, "logps/ref_rejected": -83.77849578857422, "logps/rejected": -253.64126586914062, "loss": 0.6008, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.41073670983314514, "rewards/margins": 0.24252267181873322, "rewards/rejected": -0.6532593965530396, "step": 290 }, { "epoch": 0.8676470588235294, "grad_norm": 5.24601411819458, "kl/avg_steps": 0.4000000059604645, "kl/beta": 0.0037986349780112505, "kl/n_epsilon_steps": 0.30000001192092896, "kl/p_epsilon_steps": 0.699999988079071, "learning_rate": 2.736501028272095e-08, "logits/chosen": -0.7796621918678284, "logits/rejected": -0.7315692901611328, "logps/chosen": -182.55836486816406, "logps/ref_chosen": -72.81735229492188, "logps/ref_rejected": -91.62478637695312, "logps/rejected": -265.3271789550781, "loss": 0.6044, "rewards/accuracies": 0.739062488079071, "rewards/chosen": -0.4162219166755676, "rewards/margins": 0.23918703198432922, "rewards/rejected": -0.6554089784622192, "step": 295 }, { "epoch": 0.8823529411764706, "grad_norm": 5.059004306793213, "kl/avg_steps": 0.39375001192092896, "kl/beta": 0.003725191578269005, "kl/n_epsilon_steps": 0.3031249940395355, "kl/p_epsilon_steps": 0.6968749761581421, "learning_rate": 2.1822907887504932e-08, "logits/chosen": -0.7761374711990356, "logits/rejected": -0.6450864672660828, "logps/chosen": -178.6937255859375, "logps/ref_chosen": -70.4697265625, "logps/ref_rejected": -77.26274108886719, "logps/rejected": -241.739013671875, "loss": 0.6156, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.40281882882118225, "rewards/margins": 0.20612934231758118, "rewards/rejected": -0.6089481115341187, "step": 300 }, { "epoch": 0.8823529411764706, "eval_kl/n_epsilon_steps": 0.3849826455116272, "eval_kl/p_epsilon_steps": 0.6150173544883728, "eval_logits/chosen": -0.8325175046920776, "eval_logits/rejected": -0.7259347438812256, "eval_logps/chosen": -208.17991638183594, "eval_logps/ref_chosen": -87.82356262207031, "eval_logps/ref_rejected": -82.81887817382812, "eval_logps/rejected": -243.57456970214844, "eval_loss": 0.6442785263061523, "eval_rewards/accuracies": 0.6401909589767456, "eval_rewards/chosen": -0.44304588437080383, "eval_rewards/margins": 0.14540132880210876, "eval_rewards/rejected": -0.5884472131729126, "eval_runtime": 22.3967, "eval_samples_per_second": 104.435, "eval_steps_per_second": 0.848, "step": 300 }, { "epoch": 0.8970588235294118, "grad_norm": 5.704087734222412, "kl/avg_steps": 0.375, "kl/beta": 0.003651682287454605, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.6881942648911074e-08, "logits/chosen": -0.7806903719902039, "logits/rejected": -0.7004286050796509, "logps/chosen": -181.45826721191406, "logps/ref_chosen": -75.5998764038086, "logps/ref_rejected": -86.76122283935547, "logps/rejected": -256.80096435546875, "loss": 0.6049, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3862135410308838, "rewards/margins": 0.23094138503074646, "rewards/rejected": -0.6171549558639526, "step": 305 }, { "epoch": 0.9117647058823529, "grad_norm": 5.218584060668945, "kl/avg_steps": 0.3968749940395355, "kl/beta": 0.0035780933685600758, "kl/n_epsilon_steps": 0.30156248807907104, "kl/p_epsilon_steps": 0.698437511920929, "learning_rate": 1.2555131639630567e-08, "logits/chosen": -0.7832438349723816, "logits/rejected": -0.6719276309013367, "logps/chosen": -191.44869995117188, "logps/ref_chosen": -78.4868392944336, "logps/ref_rejected": -83.08047485351562, "logps/rejected": -258.40545654296875, "loss": 0.6111, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.4038282036781311, "rewards/margins": 0.21982701122760773, "rewards/rejected": -0.6236552000045776, "step": 310 }, { "epoch": 0.9264705882352942, "grad_norm": 6.10360860824585, "kl/avg_steps": 0.3375000059604645, "kl/beta": 0.0035165518056601286, "kl/n_epsilon_steps": 0.33125001192092896, "kl/p_epsilon_steps": 0.668749988079071, "learning_rate": 8.85387393063622e-09, "logits/chosen": -0.8057095408439636, "logits/rejected": -0.7011617422103882, "logps/chosen": -194.56436157226562, "logps/ref_chosen": -79.54651641845703, "logps/ref_rejected": -87.11808776855469, "logps/rejected": -261.40032958984375, "loss": 0.6153, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4042418897151947, "rewards/margins": 0.20526555180549622, "rewards/rejected": -0.6095074415206909, "step": 315 }, { "epoch": 0.9411764705882353, "grad_norm": 5.0830488204956055, "kl/avg_steps": 0.28437501192092896, "kl/beta": 0.0034615718759596348, "kl/n_epsilon_steps": 0.3578124940395355, "kl/p_epsilon_steps": 0.6421874761581421, "learning_rate": 5.7879205600998296e-09, "logits/chosen": -0.8048986196517944, "logits/rejected": -0.6852750778198242, "logps/chosen": -193.45582580566406, "logps/ref_chosen": -78.56401062011719, "logps/ref_rejected": -83.85292053222656, "logps/rejected": -248.977783203125, "loss": 0.6302, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.39771518111228943, "rewards/margins": 0.17076563835144043, "rewards/rejected": -0.5684808492660522, "step": 320 }, { "epoch": 0.9558823529411765, "grad_norm": 5.110870361328125, "kl/avg_steps": 0.3499999940395355, "kl/beta": 0.0034066252410411835, "kl/n_epsilon_steps": 0.32499998807907104, "kl/p_epsilon_steps": 0.675000011920929, "learning_rate": 3.3653488440851253e-09, "logits/chosen": -0.7829563021659851, "logits/rejected": -0.7219451665878296, "logps/chosen": -183.75088500976562, "logps/ref_chosen": -74.60850524902344, "logps/ref_rejected": -86.81698608398438, "logps/rejected": -254.2379150390625, "loss": 0.6219, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.3717408776283264, "rewards/margins": 0.195209339261055, "rewards/rejected": -0.5669502019882202, "step": 325 }, { "epoch": 0.9705882352941176, "grad_norm": 4.562494277954102, "kl/avg_steps": 0.47343748807907104, "kl/beta": 0.003342044074088335, "kl/n_epsilon_steps": 0.26249998807907104, "kl/p_epsilon_steps": 0.7359374761581421, "learning_rate": 1.592541096695571e-09, "logits/chosen": -0.7936745882034302, "logits/rejected": -0.739700436592102, "logps/chosen": -178.63034057617188, "logps/ref_chosen": -74.63096618652344, "logps/ref_rejected": -92.50404357910156, "logps/rejected": -266.2847595214844, "loss": 0.601, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.34669384360313416, "rewards/margins": 0.23011669516563416, "rewards/rejected": -0.5768105387687683, "step": 330 }, { "epoch": 0.9852941176470589, "grad_norm": 4.651317596435547, "kl/avg_steps": 0.35624998807907104, "kl/beta": 0.003271129447966814, "kl/n_epsilon_steps": 0.3218750059604645, "kl/p_epsilon_steps": 0.6781250238418579, "learning_rate": 4.741678157389739e-10, "logits/chosen": -0.8402039408683777, "logits/rejected": -0.7369452118873596, "logps/chosen": -193.51834106445312, "logps/ref_chosen": -81.25680541992188, "logps/ref_rejected": -88.71739196777344, "logps/rejected": -261.07110595703125, "loss": 0.6167, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": -0.3669508695602417, "rewards/margins": 0.19351065158843994, "rewards/rejected": -0.5604615211486816, "step": 335 }, { "epoch": 1.0, "grad_norm": 4.5893425941467285, "kl/avg_steps": 0.37812501192092896, "kl/beta": 0.003211395815014839, "kl/n_epsilon_steps": 0.3109374940395355, "kl/p_epsilon_steps": 0.6890624761581421, "learning_rate": 1.31753782067201e-11, "logits/chosen": -0.7557514905929565, "logits/rejected": -0.6398700475692749, "logps/chosen": -185.0140838623047, "logps/ref_chosen": -72.54796600341797, "logps/ref_rejected": -78.83277893066406, "logps/rejected": -256.5284423828125, "loss": 0.612, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.36068642139434814, "rewards/margins": 0.20638315379619598, "rewards/rejected": -0.5670695900917053, "step": 340 }, { "epoch": 1.0, "step": 340, "total_flos": 0.0, "train_loss": 0.6232832217917723, "train_runtime": 1489.7896, "train_samples_per_second": 29.265, "train_steps_per_second": 0.228 } ], "logging_steps": 5, "max_steps": 340, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }