{ "best_global_step": 4400, "best_metric": 0.6191316843032837, "best_model_checkpoint": "/mnt/scratch/users/sglli24/fine-tuning-project/fine_tuned_model/llama2-tatoeba-en-fr-20251120-101824/checkpoint-4400", "epoch": 0.9777777777777777, "eval_steps": 200, "global_step": 4400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.8978288495540618, "epoch": 0.011111111111111112, "grad_norm": 4.291239261627197, "learning_rate": 7.25925925925926e-05, "loss": 1.6226, "mean_token_accuracy": 0.6836586940288544, "num_tokens": 18001.0, "step": 50 }, { "entropy": 0.6665060448646546, "epoch": 0.022222222222222223, "grad_norm": 1.3680720329284668, "learning_rate": 0.00014666666666666666, "loss": 0.6686, "mean_token_accuracy": 0.8410311663150787, "num_tokens": 35844.0, "step": 100 }, { "entropy": 0.6690424817800522, "epoch": 0.03333333333333333, "grad_norm": 0.8499931693077087, "learning_rate": 0.00019999492362553862, "loss": 0.6736, "mean_token_accuracy": 0.8405047404766083, "num_tokens": 53968.0, "step": 150 }, { "entropy": 0.6419891297817231, "epoch": 0.044444444444444446, "grad_norm": 0.5812472105026245, "learning_rate": 0.0001998939319921494, "loss": 0.6571, "mean_token_accuracy": 0.8440787088871002, "num_tokens": 71937.0, "step": 200 }, { "epoch": 0.044444444444444446, "eval_entropy": 0.6093449130654335, "eval_loss": 0.7027862071990967, "eval_mean_token_accuracy": 0.8369034069776535, "eval_num_tokens": 71937.0, "eval_runtime": 88.0125, "eval_samples_per_second": 45.448, "eval_steps_per_second": 5.681, "step": 200 }, { "entropy": 0.6681106066703797, "epoch": 0.05555555555555555, "grad_norm": 0.41130882501602173, "learning_rate": 0.00019966359123301493, "loss": 0.6637, "mean_token_accuracy": 0.8417772734165192, "num_tokens": 90058.0, "step": 250 }, { "entropy": 0.6466343528032303, "epoch": 0.06666666666666667, "grad_norm": 0.5034320950508118, "learning_rate": 0.00019930419960825186, "loss": 0.6507, "mean_token_accuracy": 0.8456220865249634, "num_tokens": 107871.0, "step": 300 }, { "entropy": 0.659225270152092, "epoch": 0.07777777777777778, "grad_norm": 0.6105099320411682, "learning_rate": 0.0001988162224813867, "loss": 0.6507, "mean_token_accuracy": 0.8445734548568725, "num_tokens": 126050.0, "step": 350 }, { "entropy": 0.6419739973545074, "epoch": 0.08888888888888889, "grad_norm": 0.4679639935493469, "learning_rate": 0.00019820029171677286, "loss": 0.658, "mean_token_accuracy": 0.8432002317905426, "num_tokens": 143680.0, "step": 400 }, { "epoch": 0.08888888888888889, "eval_entropy": 0.5937302582263947, "eval_loss": 0.68473219871521, "eval_mean_token_accuracy": 0.8390118762254715, "eval_num_tokens": 143680.0, "eval_runtime": 88.0097, "eval_samples_per_second": 45.45, "eval_steps_per_second": 5.681, "step": 400 }, { "entropy": 0.6531517493724823, "epoch": 0.1, "grad_norm": 0.7278866171836853, "learning_rate": 0.00019745720486141172, "loss": 0.6568, "mean_token_accuracy": 0.8461188304424286, "num_tokens": 161432.0, "step": 450 }, { "entropy": 0.6509544163942337, "epoch": 0.1111111111111111, "grad_norm": 0.5412130951881409, "learning_rate": 0.00019658792411223736, "loss": 0.6524, "mean_token_accuracy": 0.8447290122509002, "num_tokens": 179392.0, "step": 500 }, { "entropy": 0.6577291601896286, "epoch": 0.12222222222222222, "grad_norm": 0.48672693967819214, "learning_rate": 0.00019559357507020162, "loss": 0.6567, "mean_token_accuracy": 0.8449541187286377, "num_tokens": 197260.0, "step": 550 }, { "entropy": 0.6546426635980606, "epoch": 0.13333333333333333, "grad_norm": 0.355656236410141, "learning_rate": 0.00019447544528277316, "loss": 0.6505, "mean_token_accuracy": 0.8445561909675598, "num_tokens": 215226.0, "step": 600 }, { "epoch": 0.13333333333333333, "eval_entropy": 0.5969589519500732, "eval_loss": 0.6697619557380676, "eval_mean_token_accuracy": 0.8419688076972961, "eval_num_tokens": 215226.0, "eval_runtime": 87.9751, "eval_samples_per_second": 45.467, "eval_steps_per_second": 5.683, "step": 600 }, { "entropy": 0.6280296057462692, "epoch": 0.14444444444444443, "grad_norm": 0.38800254464149475, "learning_rate": 0.00019323498257673775, "loss": 0.6329, "mean_token_accuracy": 0.851637612581253, "num_tokens": 232693.0, "step": 650 }, { "entropy": 0.6526319080591202, "epoch": 0.15555555555555556, "grad_norm": 0.3850780427455902, "learning_rate": 0.00019187379318345846, "loss": 0.6558, "mean_token_accuracy": 0.8436833143234252, "num_tokens": 250484.0, "step": 700 }, { "entropy": 0.6411016964912415, "epoch": 0.16666666666666666, "grad_norm": 0.4081222116947174, "learning_rate": 0.00019039363965902336, "loss": 0.6377, "mean_token_accuracy": 0.8467725789546967, "num_tokens": 268442.0, "step": 750 }, { "entropy": 0.6459184002876281, "epoch": 0.17777777777777778, "grad_norm": 0.40721970796585083, "learning_rate": 0.0001887964386019739, "loss": 0.6432, "mean_token_accuracy": 0.8461637806892395, "num_tokens": 286332.0, "step": 800 }, { "epoch": 0.17777777777777778, "eval_entropy": 0.6085493609309196, "eval_loss": 0.6541684865951538, "eval_mean_token_accuracy": 0.8444654858112335, "eval_num_tokens": 286332.0, "eval_runtime": 88.0078, "eval_samples_per_second": 45.451, "eval_steps_per_second": 5.681, "step": 800 }, { "entropy": 0.652884566783905, "epoch": 0.18888888888888888, "grad_norm": 0.3846185505390167, "learning_rate": 0.0001870842581715691, "loss": 0.6537, "mean_token_accuracy": 0.8446102023124695, "num_tokens": 304193.0, "step": 850 }, { "entropy": 0.6620096349716187, "epoch": 0.2, "grad_norm": 0.48012644052505493, "learning_rate": 0.0001852593154097991, "loss": 0.6477, "mean_token_accuracy": 0.8448492765426636, "num_tokens": 322178.0, "step": 900 }, { "entropy": 0.6532311421632767, "epoch": 0.2111111111111111, "grad_norm": 0.3579741418361664, "learning_rate": 0.00018332397337061585, "loss": 0.6536, "mean_token_accuracy": 0.8460512828826904, "num_tokens": 340138.0, "step": 950 }, { "entropy": 0.647466834783554, "epoch": 0.2222222222222222, "grad_norm": 0.3946477472782135, "learning_rate": 0.000181280738060098, "loss": 0.6452, "mean_token_accuracy": 0.8455521380901336, "num_tokens": 358160.0, "step": 1000 }, { "epoch": 0.2222222222222222, "eval_entropy": 0.6152187791466713, "eval_loss": 0.6446735858917236, "eval_mean_token_accuracy": 0.8457589077949524, "eval_num_tokens": 358160.0, "eval_runtime": 88.02, "eval_samples_per_second": 45.444, "eval_steps_per_second": 5.681, "step": 1000 }, { "entropy": 0.6481309163570405, "epoch": 0.23333333333333334, "grad_norm": 0.3936346769332886, "learning_rate": 0.00017913225519151194, "loss": 0.6366, "mean_token_accuracy": 0.8483670651912689, "num_tokens": 376065.0, "step": 1050 }, { "entropy": 0.6381743800640106, "epoch": 0.24444444444444444, "grad_norm": 0.41422468423843384, "learning_rate": 0.00017688130675947122, "loss": 0.6342, "mean_token_accuracy": 0.8456036126613617, "num_tokens": 394017.0, "step": 1100 }, { "entropy": 0.6401758706569671, "epoch": 0.25555555555555554, "grad_norm": 0.34131714701652527, "learning_rate": 0.00017453080743763, "loss": 0.633, "mean_token_accuracy": 0.8499649393558503, "num_tokens": 411946.0, "step": 1150 }, { "entropy": 0.6275939923524857, "epoch": 0.26666666666666666, "grad_norm": 0.35889938473701477, "learning_rate": 0.00017208380080457485, "loss": 0.6322, "mean_token_accuracy": 0.8477345359325409, "num_tokens": 429919.0, "step": 1200 }, { "epoch": 0.26666666666666666, "eval_entropy": 0.6125104904770852, "eval_loss": 0.6421033143997192, "eval_mean_token_accuracy": 0.8464078787565231, "eval_num_tokens": 429919.0, "eval_runtime": 87.9726, "eval_samples_per_second": 45.469, "eval_steps_per_second": 5.684, "step": 1200 }, { "entropy": 0.6274101620912552, "epoch": 0.2777777777777778, "grad_norm": 0.43362855911254883, "learning_rate": 0.0001695434554028025, "loss": 0.6316, "mean_token_accuracy": 0.8491797077655793, "num_tokens": 447547.0, "step": 1250 }, { "entropy": 0.6455954253673554, "epoch": 0.28888888888888886, "grad_norm": 0.4339112937450409, "learning_rate": 0.00016691306063588583, "loss": 0.6414, "mean_token_accuracy": 0.8443870341777802, "num_tokens": 465741.0, "step": 1300 }, { "entropy": 0.6329428994655609, "epoch": 0.3, "grad_norm": 0.42899656295776367, "learning_rate": 0.00016419602250914155, "loss": 0.6252, "mean_token_accuracy": 0.8489247059822083, "num_tokens": 483587.0, "step": 1350 }, { "entropy": 0.6409079706668854, "epoch": 0.3111111111111111, "grad_norm": 0.4631573259830475, "learning_rate": 0.00016139585921931394, "loss": 0.6553, "mean_token_accuracy": 0.8468127429485321, "num_tokens": 501709.0, "step": 1400 }, { "epoch": 0.3111111111111111, "eval_entropy": 0.621678286254406, "eval_loss": 0.6395798325538635, "eval_mean_token_accuracy": 0.8467997794151306, "eval_num_tokens": 501709.0, "eval_runtime": 87.9297, "eval_samples_per_second": 45.491, "eval_steps_per_second": 5.686, "step": 1400 }, { "entropy": 0.6271992588043213, "epoch": 0.32222222222222224, "grad_norm": 0.3840474486351013, "learning_rate": 0.00015851619659898623, "loss": 0.6295, "mean_token_accuracy": 0.8492995512485504, "num_tokens": 519472.0, "step": 1450 }, { "entropy": 0.6468123584985733, "epoch": 0.3333333333333333, "grad_norm": 0.38773396611213684, "learning_rate": 0.00015556076342161795, "loss": 0.6468, "mean_token_accuracy": 0.8433372890949249, "num_tokens": 537436.0, "step": 1500 }, { "entropy": 0.6341830235719681, "epoch": 0.34444444444444444, "grad_norm": 0.37739357352256775, "learning_rate": 0.00015253338657328784, "loss": 0.6282, "mean_token_accuracy": 0.8481453454494476, "num_tokens": 555101.0, "step": 1550 }, { "entropy": 0.6232440257072449, "epoch": 0.35555555555555557, "grad_norm": 0.4216736853122711, "learning_rate": 0.00014943798609739418, "loss": 0.6077, "mean_token_accuracy": 0.853328298330307, "num_tokens": 572708.0, "step": 1600 }, { "epoch": 0.35555555555555557, "eval_entropy": 0.5843898313045501, "eval_loss": 0.6388325095176697, "eval_mean_token_accuracy": 0.8475761367082596, "eval_num_tokens": 572708.0, "eval_runtime": 88.0014, "eval_samples_per_second": 45.454, "eval_steps_per_second": 5.682, "step": 1600 }, { "entropy": 0.629493715763092, "epoch": 0.36666666666666664, "grad_norm": 0.4576176106929779, "learning_rate": 0.00014627857011872893, "loss": 0.6369, "mean_token_accuracy": 0.8483493518829346, "num_tokens": 590473.0, "step": 1650 }, { "entropy": 0.6264066398143768, "epoch": 0.37777777777777777, "grad_norm": 0.3063693940639496, "learning_rate": 0.00014305922965349857, "loss": 0.6223, "mean_token_accuracy": 0.8495516037940979, "num_tokens": 608122.0, "step": 1700 }, { "entropy": 0.6332901197671891, "epoch": 0.3888888888888889, "grad_norm": 0.45330291986465454, "learning_rate": 0.00013978413331201158, "loss": 0.6393, "mean_token_accuracy": 0.8468031466007233, "num_tokens": 626153.0, "step": 1750 }, { "entropy": 0.6384645104408264, "epoch": 0.4, "grad_norm": 0.4152071475982666, "learning_rate": 0.00013645752190089206, "loss": 0.6325, "mean_token_accuracy": 0.8465726387500763, "num_tokens": 644466.0, "step": 1800 }, { "epoch": 0.4, "eval_entropy": 0.6132586502432823, "eval_loss": 0.6346827149391174, "eval_mean_token_accuracy": 0.8480832484960557, "eval_num_tokens": 644466.0, "eval_runtime": 87.9901, "eval_samples_per_second": 45.46, "eval_steps_per_second": 5.682, "step": 1800 }, { "entropy": 0.631557651758194, "epoch": 0.4111111111111111, "grad_norm": 0.2891533076763153, "learning_rate": 0.00013308370293180902, "loss": 0.6255, "mean_token_accuracy": 0.8511977612972259, "num_tokens": 662209.0, "step": 1850 }, { "entropy": 0.6207279634475708, "epoch": 0.4222222222222222, "grad_norm": 0.42430344223976135, "learning_rate": 0.00012966704504383168, "loss": 0.6205, "mean_token_accuracy": 0.8519205749034882, "num_tokens": 679790.0, "step": 1900 }, { "entropy": 0.6246707677841187, "epoch": 0.43333333333333335, "grad_norm": 0.29151901602745056, "learning_rate": 0.00012621197234663283, "loss": 0.6247, "mean_token_accuracy": 0.8536795401573181, "num_tokens": 697693.0, "step": 1950 }, { "entropy": 0.6282322096824646, "epoch": 0.4444444444444444, "grad_norm": 0.40175795555114746, "learning_rate": 0.0001227229586918655, "loss": 0.6315, "mean_token_accuracy": 0.8500750088691711, "num_tokens": 715480.0, "step": 2000 }, { "epoch": 0.4444444444444444, "eval_entropy": 0.614536872446537, "eval_loss": 0.6332096457481384, "eval_mean_token_accuracy": 0.8484884305000305, "eval_num_tokens": 715480.0, "eval_runtime": 87.9964, "eval_samples_per_second": 45.456, "eval_steps_per_second": 5.682, "step": 2000 }, { "entropy": 0.6323211324214936, "epoch": 0.45555555555555555, "grad_norm": 0.3964557945728302, "learning_rate": 0.00011920452188013029, "loss": 0.6327, "mean_token_accuracy": 0.8506696331501007, "num_tokens": 733381.0, "step": 2050 }, { "entropy": 0.6420121788978577, "epoch": 0.4666666666666667, "grad_norm": 0.270047664642334, "learning_rate": 0.0001156612178110351, "loss": 0.6356, "mean_token_accuracy": 0.8502523565292358, "num_tokens": 751399.0, "step": 2100 }, { "entropy": 0.6098494738340378, "epoch": 0.4777777777777778, "grad_norm": 0.30119839310646057, "learning_rate": 0.00011209763458392135, "loss": 0.6028, "mean_token_accuracy": 0.8526248228549957, "num_tokens": 769220.0, "step": 2150 }, { "entropy": 0.6257252705097198, "epoch": 0.4888888888888889, "grad_norm": 0.41217416524887085, "learning_rate": 0.00010851838655689625, "loss": 0.6278, "mean_token_accuracy": 0.8503323125839234, "num_tokens": 787060.0, "step": 2200 }, { "epoch": 0.4888888888888889, "eval_entropy": 0.6182988230586052, "eval_loss": 0.629729151725769, "eval_mean_token_accuracy": 0.8488966919183731, "eval_num_tokens": 787060.0, "eval_runtime": 88.02, "eval_samples_per_second": 45.444, "eval_steps_per_second": 5.681, "step": 2200 }, { "entropy": 0.629525854587555, "epoch": 0.5, "grad_norm": 0.33919742703437805, "learning_rate": 0.00010492810837186333, "loss": 0.6288, "mean_token_accuracy": 0.8490475380420685, "num_tokens": 804988.0, "step": 2250 }, { "entropy": 0.6336910331249237, "epoch": 0.5111111111111111, "grad_norm": 0.42546504735946655, "learning_rate": 0.00010133144895328832, "loss": 0.6302, "mean_token_accuracy": 0.848661150932312, "num_tokens": 823030.0, "step": 2300 }, { "entropy": 0.6298299109935761, "epoch": 0.5222222222222223, "grad_norm": 0.4121692180633545, "learning_rate": 9.7733065488471e-05, "loss": 0.6261, "mean_token_accuracy": 0.8524516999721528, "num_tokens": 840873.0, "step": 2350 }, { "entropy": 0.6273639261722564, "epoch": 0.5333333333333333, "grad_norm": 0.33091071248054504, "learning_rate": 9.413761739711771e-05, "loss": 0.6279, "mean_token_accuracy": 0.8502094805240631, "num_tokens": 858631.0, "step": 2400 }, { "epoch": 0.5333333333333333, "eval_entropy": 0.6137934300303459, "eval_loss": 0.6281214356422424, "eval_mean_token_accuracy": 0.849094120979309, "eval_num_tokens": 858631.0, "eval_runtime": 88.0567, "eval_samples_per_second": 45.425, "eval_steps_per_second": 5.678, "step": 2400 }, { "entropy": 0.6171291017532349, "epoch": 0.5444444444444444, "grad_norm": 0.41635480523109436, "learning_rate": 9.054976029802337e-05, "loss": 0.6157, "mean_token_accuracy": 0.8495936059951782, "num_tokens": 876423.0, "step": 2450 }, { "entropy": 0.6276765954494476, "epoch": 0.5555555555555556, "grad_norm": 0.43940269947052, "learning_rate": 8.6974139980675e-05, "loss": 0.6233, "mean_token_accuracy": 0.8525845003128052, "num_tokens": 894210.0, "step": 2500 }, { "entropy": 0.6247690558433533, "epoch": 0.5666666666666667, "grad_norm": 0.31268852949142456, "learning_rate": 8.341538638958291e-05, "loss": 0.6255, "mean_token_accuracy": 0.8500683605670929, "num_tokens": 912075.0, "step": 2550 }, { "entropy": 0.6128390139341354, "epoch": 0.5777777777777777, "grad_norm": 0.3916880190372467, "learning_rate": 7.987810762912924e-05, "loss": 0.615, "mean_token_accuracy": 0.8547429955005645, "num_tokens": 929888.0, "step": 2600 }, { "epoch": 0.5777777777777777, "eval_entropy": 0.612749766767025, "eval_loss": 0.6256938576698303, "eval_mean_token_accuracy": 0.8497577202320099, "eval_num_tokens": 929888.0, "eval_runtime": 87.9613, "eval_samples_per_second": 45.475, "eval_steps_per_second": 5.684, "step": 2600 }, { "entropy": 0.6389735889434814, "epoch": 0.5888888888888889, "grad_norm": 0.38676777482032776, "learning_rate": 7.636688399669589e-05, "loss": 0.6435, "mean_token_accuracy": 0.84725133061409, "num_tokens": 947902.0, "step": 2650 }, { "entropy": 0.6316736024618149, "epoch": 0.6, "grad_norm": 0.3665507733821869, "learning_rate": 7.288626205179951e-05, "loss": 0.6242, "mean_token_accuracy": 0.8496048271656036, "num_tokens": 965905.0, "step": 2700 }, { "entropy": 0.6109506344795227, "epoch": 0.6111111111111112, "grad_norm": 0.4900154769420624, "learning_rate": 6.944074872891199e-05, "loss": 0.6063, "mean_token_accuracy": 0.852264449596405, "num_tokens": 983884.0, "step": 2750 }, { "entropy": 0.6153418481349945, "epoch": 0.6222222222222222, "grad_norm": 0.47226202487945557, "learning_rate": 6.603480550158995e-05, "loss": 0.62, "mean_token_accuracy": 0.8504341340065003, "num_tokens": 1001717.0, "step": 2800 }, { "epoch": 0.6222222222222222, "eval_entropy": 0.619692858338356, "eval_loss": 0.6247742772102356, "eval_mean_token_accuracy": 0.8500253454446792, "eval_num_tokens": 1001717.0, "eval_runtime": 87.9899, "eval_samples_per_second": 45.46, "eval_steps_per_second": 5.682, "step": 2800 }, { "entropy": 0.6241399937868118, "epoch": 0.6333333333333333, "grad_norm": 0.4187294840812683, "learning_rate": 6.267284260547049e-05, "loss": 0.6156, "mean_token_accuracy": 0.8507660686969757, "num_tokens": 1019451.0, "step": 2850 }, { "entropy": 0.6226123148202896, "epoch": 0.6444444444444445, "grad_norm": 0.33195361495018005, "learning_rate": 5.9359213327612416e-05, "loss": 0.6129, "mean_token_accuracy": 0.8508730328083038, "num_tokens": 1037299.0, "step": 2900 }, { "entropy": 0.6067140877246857, "epoch": 0.6555555555555556, "grad_norm": 0.4044334292411804, "learning_rate": 5.609820836957871e-05, "loss": 0.5978, "mean_token_accuracy": 0.8547445130348206, "num_tokens": 1054997.0, "step": 2950 }, { "entropy": 0.6072817480564118, "epoch": 0.6666666666666666, "grad_norm": 0.3837581276893616, "learning_rate": 5.28940502915587e-05, "loss": 0.6195, "mean_token_accuracy": 0.8500410854816437, "num_tokens": 1072981.0, "step": 3000 }, { "epoch": 0.6666666666666666, "eval_entropy": 0.6186771767735482, "eval_loss": 0.6234644055366516, "eval_mean_token_accuracy": 0.8501761881113052, "eval_num_tokens": 1072981.0, "eval_runtime": 88.0305, "eval_samples_per_second": 45.439, "eval_steps_per_second": 5.68, "step": 3000 }, { "entropy": 0.6253303802013397, "epoch": 0.6777777777777778, "grad_norm": 0.4809369742870331, "learning_rate": 4.975088804472356e-05, "loss": 0.6199, "mean_token_accuracy": 0.850438643693924, "num_tokens": 1090627.0, "step": 3050 }, { "entropy": 0.6156042230129242, "epoch": 0.6888888888888889, "grad_norm": 0.35011181235313416, "learning_rate": 4.667279159889624e-05, "loss": 0.6211, "mean_token_accuracy": 0.8514653861522674, "num_tokens": 1108362.0, "step": 3100 }, { "entropy": 0.6167298531532288, "epoch": 0.7, "grad_norm": 0.41358232498168945, "learning_rate": 4.366374667249118e-05, "loss": 0.6069, "mean_token_accuracy": 0.8523639941215515, "num_tokens": 1126044.0, "step": 3150 }, { "entropy": 0.6212670838832856, "epoch": 0.7111111111111111, "grad_norm": 0.44910740852355957, "learning_rate": 4.0727649571548146e-05, "loss": 0.6272, "mean_token_accuracy": 0.8506516909599304, "num_tokens": 1143877.0, "step": 3200 }, { "epoch": 0.7111111111111111, "eval_entropy": 0.6189550485610962, "eval_loss": 0.622456431388855, "eval_mean_token_accuracy": 0.8506602959632874, "eval_num_tokens": 1143877.0, "eval_runtime": 87.9986, "eval_samples_per_second": 45.455, "eval_steps_per_second": 5.682, "step": 3200 }, { "entropy": 0.6167463368177414, "epoch": 0.7222222222222222, "grad_norm": 0.4499205946922302, "learning_rate": 3.786830214454315e-05, "loss": 0.6159, "mean_token_accuracy": 0.8481730723381042, "num_tokens": 1161796.0, "step": 3250 }, { "entropy": 0.6131170511245727, "epoch": 0.7333333333333333, "grad_norm": 0.38012415170669556, "learning_rate": 3.5089406859509166e-05, "loss": 0.6219, "mean_token_accuracy": 0.8537890160083771, "num_tokens": 1179566.0, "step": 3300 }, { "entropy": 0.6255488413572311, "epoch": 0.7444444444444445, "grad_norm": 0.3967890739440918, "learning_rate": 3.2394562009840835e-05, "loss": 0.6268, "mean_token_accuracy": 0.8487379801273346, "num_tokens": 1197824.0, "step": 3350 }, { "entropy": 0.6168732368946075, "epoch": 0.7555555555555555, "grad_norm": 0.38855499029159546, "learning_rate": 2.9787257054991592e-05, "loss": 0.6036, "mean_token_accuracy": 0.8553757643699647, "num_tokens": 1215503.0, "step": 3400 }, { "epoch": 0.7555555555555555, "eval_entropy": 0.6248494290113449, "eval_loss": 0.6211217045783997, "eval_mean_token_accuracy": 0.8511025402545929, "eval_num_tokens": 1215503.0, "eval_runtime": 87.9806, "eval_samples_per_second": 45.465, "eval_steps_per_second": 5.683, "step": 3400 }, { "entropy": 0.6244831168651581, "epoch": 0.7666666666666667, "grad_norm": 0.40160423517227173, "learning_rate": 2.727086810209559e-05, "loss": 0.6144, "mean_token_accuracy": 0.8539444077014923, "num_tokens": 1233233.0, "step": 3450 }, { "entropy": 0.6258886575698852, "epoch": 0.7777777777777778, "grad_norm": 0.4197278618812561, "learning_rate": 2.4848653534365886e-05, "loss": 0.6157, "mean_token_accuracy": 0.8522231721878052, "num_tokens": 1251135.0, "step": 3500 }, { "entropy": 0.6242718535661698, "epoch": 0.7888888888888889, "grad_norm": 0.31663864850997925, "learning_rate": 2.2523749791929127e-05, "loss": 0.6236, "mean_token_accuracy": 0.8481460773944854, "num_tokens": 1269437.0, "step": 3550 }, { "entropy": 0.6332012844085694, "epoch": 0.8, "grad_norm": 0.4183398485183716, "learning_rate": 2.029916731055981e-05, "loss": 0.6331, "mean_token_accuracy": 0.8506816875934601, "num_tokens": 1287466.0, "step": 3600 }, { "epoch": 0.8, "eval_entropy": 0.6211995969414711, "eval_loss": 0.6201685667037964, "eval_mean_token_accuracy": 0.8512796934843063, "eval_num_tokens": 1287466.0, "eval_runtime": 87.995, "eval_samples_per_second": 45.457, "eval_steps_per_second": 5.682, "step": 3600 }, { "entropy": 0.6207393455505371, "epoch": 0.8111111111111111, "grad_norm": 0.3644893169403076, "learning_rate": 1.8177786623573322e-05, "loss": 0.6051, "mean_token_accuracy": 0.8554283630847931, "num_tokens": 1305114.0, "step": 3650 }, { "entropy": 0.6093165588378906, "epoch": 0.8222222222222222, "grad_norm": 0.4035656154155731, "learning_rate": 1.6162354631925204e-05, "loss": 0.5942, "mean_token_accuracy": 0.8568060100078583, "num_tokens": 1322570.0, "step": 3700 }, { "entropy": 0.610592405796051, "epoch": 0.8333333333333334, "grad_norm": 0.4430359899997711, "learning_rate": 1.425548104734583e-05, "loss": 0.6228, "mean_token_accuracy": 0.853781110048294, "num_tokens": 1340182.0, "step": 3750 }, { "entropy": 0.624454995393753, "epoch": 0.8444444444444444, "grad_norm": 0.47560906410217285, "learning_rate": 1.2459635013117043e-05, "loss": 0.6285, "mean_token_accuracy": 0.8506780207157135, "num_tokens": 1357930.0, "step": 3800 }, { "epoch": 0.8444444444444444, "eval_entropy": 0.6203543889522553, "eval_loss": 0.6197088360786438, "eval_mean_token_accuracy": 0.851304793715477, "eval_num_tokens": 1357930.0, "eval_runtime": 88.0719, "eval_samples_per_second": 45.417, "eval_steps_per_second": 5.677, "step": 3800 }, { "entropy": 0.6153292739391327, "epoch": 0.8555555555555555, "grad_norm": 0.6369743347167969, "learning_rate": 1.0777141906865584e-05, "loss": 0.6206, "mean_token_accuracy": 0.8497644782066345, "num_tokens": 1375720.0, "step": 3850 }, { "entropy": 0.6167460489273071, "epoch": 0.8666666666666667, "grad_norm": 0.31762850284576416, "learning_rate": 9.210180329513674e-06, "loss": 0.6164, "mean_token_accuracy": 0.8522521209716797, "num_tokens": 1393343.0, "step": 3900 }, { "entropy": 0.6256991571187973, "epoch": 0.8777777777777778, "grad_norm": 0.44901618361473083, "learning_rate": 7.760779284285724e-06, "loss": 0.6304, "mean_token_accuracy": 0.8484922182559967, "num_tokens": 1411415.0, "step": 3950 }, { "entropy": 0.6351810383796692, "epoch": 0.8888888888888888, "grad_norm": 0.3676467537879944, "learning_rate": 6.430815549423541e-06, "loss": 0.6343, "mean_token_accuracy": 0.8446752560138703, "num_tokens": 1429355.0, "step": 4000 }, { "epoch": 0.8888888888888888, "eval_entropy": 0.6227085783481597, "eval_loss": 0.6193701028823853, "eval_mean_token_accuracy": 0.8513179312944412, "eval_num_tokens": 1429355.0, "eval_runtime": 87.8963, "eval_samples_per_second": 45.508, "eval_steps_per_second": 5.689, "step": 4000 }, { "entropy": 0.6218395137786865, "epoch": 0.9, "grad_norm": 0.352983683347702, "learning_rate": 5.222011248012537e-06, "loss": 0.6208, "mean_token_accuracy": 0.8524598634243011, "num_tokens": 1447378.0, "step": 4050 }, { "entropy": 0.6192989981174469, "epoch": 0.9111111111111111, "grad_norm": 0.5437832474708557, "learning_rate": 4.1359316180653806e-06, "loss": 0.6071, "mean_token_accuracy": 0.8508526837825775, "num_tokens": 1465326.0, "step": 4100 }, { "entropy": 0.6204692393541336, "epoch": 0.9222222222222223, "grad_norm": 0.29962158203125, "learning_rate": 3.1739829857504234e-06, "loss": 0.6173, "mean_token_accuracy": 0.8498383402824402, "num_tokens": 1483262.0, "step": 4150 }, { "entropy": 0.6081379109621048, "epoch": 0.9333333333333333, "grad_norm": 0.39637628197669983, "learning_rate": 2.3374109443897065e-06, "loss": 0.6151, "mean_token_accuracy": 0.8525230586528778, "num_tokens": 1500855.0, "step": 4200 }, { "epoch": 0.9333333333333333, "eval_entropy": 0.6221803342103958, "eval_loss": 0.6191594004631042, "eval_mean_token_accuracy": 0.8513582646846771, "eval_num_tokens": 1500855.0, "eval_runtime": 87.871, "eval_samples_per_second": 45.521, "eval_steps_per_second": 5.69, "step": 4200 }, { "entropy": 0.6116468846797943, "epoch": 0.9444444444444444, "grad_norm": 0.3078557252883911, "learning_rate": 1.6272987415841267e-06, "loss": 0.598, "mean_token_accuracy": 0.8564710664749146, "num_tokens": 1518914.0, "step": 4250 }, { "entropy": 0.6252698361873626, "epoch": 0.9555555555555556, "grad_norm": 0.38908475637435913, "learning_rate": 1.0445658765543153e-06, "loss": 0.6194, "mean_token_accuracy": 0.8515545094013214, "num_tokens": 1536933.0, "step": 4300 }, { "entropy": 0.6222954159975052, "epoch": 0.9666666666666667, "grad_norm": 0.4802163541316986, "learning_rate": 5.899669095136174e-07, "loss": 0.6291, "mean_token_accuracy": 0.8507583463191986, "num_tokens": 1555013.0, "step": 4350 }, { "entropy": 0.6046170508861541, "epoch": 0.9777777777777777, "grad_norm": 0.3905353546142578, "learning_rate": 2.640904846146652e-07, "loss": 0.6059, "mean_token_accuracy": 0.8536884272098542, "num_tokens": 1572842.0, "step": 4400 }, { "epoch": 0.9777777777777777, "eval_entropy": 0.6219840022921562, "eval_loss": 0.6191316843032837, "eval_mean_token_accuracy": 0.8512439979314804, "eval_num_tokens": 1572842.0, "eval_runtime": 87.8917, "eval_samples_per_second": 45.511, "eval_steps_per_second": 5.689, "step": 4400 } ], "logging_steps": 50, "max_steps": 4500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.357263504886989e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }