{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10000, "global_step": 9132, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010950803515207929, "grad_norm": 5.517458438873291, "learning_rate": 1.0831509846827136e-06, "loss": 2.5966, "step": 100 }, { "epoch": 0.021901607030415857, "grad_norm": 5.273920059204102, "learning_rate": 2.177242888402626e-06, "loss": 2.3125, "step": 200 }, { "epoch": 0.03285241054562379, "grad_norm": 5.7524261474609375, "learning_rate": 3.2713347921225385e-06, "loss": 2.1574, "step": 300 }, { "epoch": 0.043803214060831715, "grad_norm": 4.415042400360107, "learning_rate": 4.365426695842451e-06, "loss": 1.9882, "step": 400 }, { "epoch": 0.05475401757603964, "grad_norm": 3.4843761920928955, "learning_rate": 5.459518599562363e-06, "loss": 1.9221, "step": 500 }, { "epoch": 0.06570482109124758, "grad_norm": 3.8609654903411865, "learning_rate": 6.553610503282276e-06, "loss": 1.9077, "step": 600 }, { "epoch": 0.0766556246064555, "grad_norm": 3.310288429260254, "learning_rate": 7.64770240700219e-06, "loss": 1.8748, "step": 700 }, { "epoch": 0.08760642812166343, "grad_norm": 3.843994140625, "learning_rate": 8.741794310722102e-06, "loss": 1.88, "step": 800 }, { "epoch": 0.09855723163687136, "grad_norm": 3.1463918685913086, "learning_rate": 9.835886214442013e-06, "loss": 1.8617, "step": 900 }, { "epoch": 0.10950803515207928, "grad_norm": 3.763201951980591, "learning_rate": 9.997360588415263e-06, "loss": 1.8323, "step": 1000 }, { "epoch": 0.12045883866728721, "grad_norm": 2.8540737628936768, "learning_rate": 9.987501154068591e-06, "loss": 1.8238, "step": 1100 }, { "epoch": 0.13140964218249515, "grad_norm": 3.5842385292053223, "learning_rate": 9.970353900512644e-06, "loss": 1.8213, "step": 1200 }, { "epoch": 0.14236044569770306, "grad_norm": 4.1146135330200195, "learning_rate": 9.945943883598031e-06, "loss": 1.8147, "step": 1300 }, { "epoch": 0.153311249212911, "grad_norm": 3.3109846115112305, "learning_rate": 9.914306771645357e-06, "loss": 1.7824, "step": 1400 }, { "epoch": 0.16426205272811892, "grad_norm": 3.7267284393310547, "learning_rate": 9.875488793326074e-06, "loss": 1.7896, "step": 1500 }, { "epoch": 0.17521285624332686, "grad_norm": 4.14414644241333, "learning_rate": 9.82954667011238e-06, "loss": 1.7788, "step": 1600 }, { "epoch": 0.18616365975853477, "grad_norm": 3.413673162460327, "learning_rate": 9.776547533394874e-06, "loss": 1.7894, "step": 1700 }, { "epoch": 0.1971144632737427, "grad_norm": 3.6218016147613525, "learning_rate": 9.716568826389045e-06, "loss": 1.752, "step": 1800 }, { "epoch": 0.20806526678895063, "grad_norm": 3.2655038833618164, "learning_rate": 9.649698190973977e-06, "loss": 1.7587, "step": 1900 }, { "epoch": 0.21901607030415857, "grad_norm": 3.188708543777466, "learning_rate": 9.576033339628578e-06, "loss": 1.7648, "step": 2000 }, { "epoch": 0.2299668738193665, "grad_norm": 3.2714788913726807, "learning_rate": 9.495681912652486e-06, "loss": 1.7507, "step": 2100 }, { "epoch": 0.24091767733457442, "grad_norm": 3.6338818073272705, "learning_rate": 9.408761320880292e-06, "loss": 1.7628, "step": 2200 }, { "epoch": 0.25186848084978236, "grad_norm": 3.466273546218872, "learning_rate": 9.315398574118876e-06, "loss": 1.7299, "step": 2300 }, { "epoch": 0.2628192843649903, "grad_norm": 3.3957815170288086, "learning_rate": 9.215730095558582e-06, "loss": 1.7188, "step": 2400 }, { "epoch": 0.2737700878801982, "grad_norm": 3.8373427391052246, "learning_rate": 9.10990152242939e-06, "loss": 1.7139, "step": 2500 }, { "epoch": 0.28472089139540613, "grad_norm": 3.5663390159606934, "learning_rate": 8.998067493193395e-06, "loss": 1.711, "step": 2600 }, { "epoch": 0.29567169491061407, "grad_norm": 3.543086290359497, "learning_rate": 8.880391421584511e-06, "loss": 1.7143, "step": 2700 }, { "epoch": 0.306622498425822, "grad_norm": 3.4232897758483887, "learning_rate": 8.757045257825642e-06, "loss": 1.693, "step": 2800 }, { "epoch": 0.31757330194102995, "grad_norm": 3.153503179550171, "learning_rate": 8.628209237372148e-06, "loss": 1.6915, "step": 2900 }, { "epoch": 0.32852410545623784, "grad_norm": 3.6370930671691895, "learning_rate": 8.494071617548831e-06, "loss": 1.6932, "step": 3000 }, { "epoch": 0.3394749089714458, "grad_norm": 3.5099680423736572, "learning_rate": 8.354828402465215e-06, "loss": 1.6522, "step": 3100 }, { "epoch": 0.3504257124866537, "grad_norm": 4.088729381561279, "learning_rate": 8.210683056611086e-06, "loss": 1.6759, "step": 3200 }, { "epoch": 0.36137651600186166, "grad_norm": 4.009633541107178, "learning_rate": 8.06184620755083e-06, "loss": 1.6584, "step": 3300 }, { "epoch": 0.37232731951706954, "grad_norm": 3.3517208099365234, "learning_rate": 7.90853533815094e-06, "loss": 1.6598, "step": 3400 }, { "epoch": 0.3832781230322775, "grad_norm": 3.5369486808776855, "learning_rate": 7.750974468790462e-06, "loss": 1.6602, "step": 3500 }, { "epoch": 0.3942289265474854, "grad_norm": 3.1376800537109375, "learning_rate": 7.589393830018696e-06, "loss": 1.6466, "step": 3600 }, { "epoch": 0.40517973006269337, "grad_norm": 3.253649950027466, "learning_rate": 7.4240295261385205e-06, "loss": 1.6247, "step": 3700 }, { "epoch": 0.41613053357790125, "grad_norm": 3.432814359664917, "learning_rate": 7.2551231902068775e-06, "loss": 1.6204, "step": 3800 }, { "epoch": 0.4270813370931092, "grad_norm": 3.8177409172058105, "learning_rate": 7.082921630956545e-06, "loss": 1.618, "step": 3900 }, { "epoch": 0.43803214060831713, "grad_norm": 3.866558313369751, "learning_rate": 6.9076764721551385e-06, "loss": 1.6237, "step": 4000 }, { "epoch": 0.4489829441235251, "grad_norm": 3.8972673416137695, "learning_rate": 6.729643784928295e-06, "loss": 1.6185, "step": 4100 }, { "epoch": 0.459933747638733, "grad_norm": 3.088886260986328, "learning_rate": 6.549083713584314e-06, "loss": 1.6114, "step": 4200 }, { "epoch": 0.4708845511539409, "grad_norm": 3.4185397624969482, "learning_rate": 6.366260095486977e-06, "loss": 1.6123, "step": 4300 }, { "epoch": 0.48183535466914884, "grad_norm": 4.178684234619141, "learning_rate": 6.181440075532042e-06, "loss": 1.5628, "step": 4400 }, { "epoch": 0.4927861581843568, "grad_norm": 3.807727813720703, "learning_rate": 5.99489371579069e-06, "loss": 1.5712, "step": 4500 }, { "epoch": 0.5037369616995647, "grad_norm": 3.5558791160583496, "learning_rate": 5.806893600890361e-06, "loss": 1.5599, "step": 4600 }, { "epoch": 0.5146877652147727, "grad_norm": 3.706101894378662, "learning_rate": 5.617714439709588e-06, "loss": 1.5595, "step": 4700 }, { "epoch": 0.5256385687299806, "grad_norm": 3.324862003326416, "learning_rate": 5.42763266396884e-06, "loss": 1.5747, "step": 4800 }, { "epoch": 0.5365893722451885, "grad_norm": 3.485616445541382, "learning_rate": 5.236926024303909e-06, "loss": 1.5547, "step": 4900 }, { "epoch": 0.5475401757603964, "grad_norm": 3.410731077194214, "learning_rate": 5.045873184412099e-06, "loss": 1.5846, "step": 5000 }, { "epoch": 0.5584909792756043, "grad_norm": 3.5248425006866455, "learning_rate": 4.854753313864212e-06, "loss": 1.5473, "step": 5100 }, { "epoch": 0.5694417827908123, "grad_norm": 3.993953227996826, "learning_rate": 4.663845680177349e-06, "loss": 1.5513, "step": 5200 }, { "epoch": 0.5803925863060202, "grad_norm": 3.5981061458587646, "learning_rate": 4.473429240744606e-06, "loss": 1.5596, "step": 5300 }, { "epoch": 0.5913433898212281, "grad_norm": 4.107110977172852, "learning_rate": 4.283782235217901e-06, "loss": 1.5334, "step": 5400 }, { "epoch": 0.6022941933364361, "grad_norm": 3.5353140830993652, "learning_rate": 4.095181778939598e-06, "loss": 1.5183, "step": 5500 }, { "epoch": 0.613244996851644, "grad_norm": 3.8818717002868652, "learning_rate": 3.90790345801699e-06, "loss": 1.5402, "step": 5600 }, { "epoch": 0.624195800366852, "grad_norm": 3.5173981189727783, "learning_rate": 3.7222209266313026e-06, "loss": 1.5132, "step": 5700 }, { "epoch": 0.6351466038820599, "grad_norm": 4.288154602050781, "learning_rate": 3.538405507169692e-06, "loss": 1.5098, "step": 5800 }, { "epoch": 0.6460974073972677, "grad_norm": 3.918605089187622, "learning_rate": 3.356725793764477e-06, "loss": 1.5147, "step": 5900 }, { "epoch": 0.6570482109124757, "grad_norm": 4.286097049713135, "learning_rate": 3.1774472598189503e-06, "loss": 1.5384, "step": 6000 }, { "epoch": 0.6679990144276836, "grad_norm": 3.301164150238037, "learning_rate": 3.0008318700932426e-06, "loss": 1.5411, "step": 6100 }, { "epoch": 0.6789498179428916, "grad_norm": 3.758004903793335, "learning_rate": 2.827137697917096e-06, "loss": 1.5003, "step": 6200 }, { "epoch": 0.6899006214580995, "grad_norm": 3.1831018924713135, "learning_rate": 2.6566185480888276e-06, "loss": 1.5175, "step": 6300 }, { "epoch": 0.7008514249733074, "grad_norm": 3.6004598140716553, "learning_rate": 2.4895235860115652e-06, "loss": 1.4974, "step": 6400 }, { "epoch": 0.7118022284885154, "grad_norm": 3.951835870742798, "learning_rate": 2.326096973608648e-06, "loss": 1.497, "step": 6500 }, { "epoch": 0.7227530320037233, "grad_norm": 3.2882447242736816, "learning_rate": 2.166577512550162e-06, "loss": 1.502, "step": 6600 }, { "epoch": 0.7337038355189311, "grad_norm": 4.077866077423096, "learning_rate": 2.0111982953120073e-06, "loss": 1.5173, "step": 6700 }, { "epoch": 0.7446546390341391, "grad_norm": 4.122990608215332, "learning_rate": 1.8601863645773128e-06, "loss": 1.4877, "step": 6800 }, { "epoch": 0.755605442549347, "grad_norm": 3.5521764755249023, "learning_rate": 1.7137623814779036e-06, "loss": 1.4705, "step": 6900 }, { "epoch": 0.766556246064555, "grad_norm": 3.5683555603027344, "learning_rate": 1.5721403031606048e-06, "loss": 1.4747, "step": 7000 }, { "epoch": 0.7775070495797629, "grad_norm": 3.852078914642334, "learning_rate": 1.43552707014953e-06, "loss": 1.5005, "step": 7100 }, { "epoch": 0.7884578530949709, "grad_norm": 3.6315114498138428, "learning_rate": 1.3041223039611489e-06, "loss": 1.5038, "step": 7200 }, { "epoch": 0.7994086566101788, "grad_norm": 3.6797256469726562, "learning_rate": 1.1781180154140331e-06, "loss": 1.5086, "step": 7300 }, { "epoch": 0.8103594601253867, "grad_norm": 3.5173234939575195, "learning_rate": 1.057698324059469e-06, "loss": 1.4665, "step": 7400 }, { "epoch": 0.8213102636405947, "grad_norm": 3.8179659843444824, "learning_rate": 9.43039189142922e-07, "loss": 1.4686, "step": 7500 }, { "epoch": 0.8322610671558025, "grad_norm": 4.368917942047119, "learning_rate": 8.343081524894763e-07, "loss": 1.4933, "step": 7600 }, { "epoch": 0.8432118706710104, "grad_norm": 4.125387191772461, "learning_rate": 7.316640936889491e-07, "loss": 1.4976, "step": 7700 }, { "epoch": 0.8541626741862184, "grad_norm": 4.123210906982422, "learning_rate": 6.352569979384027e-07, "loss": 1.4663, "step": 7800 }, { "epoch": 0.8651134777014263, "grad_norm": 3.9425387382507324, "learning_rate": 5.452277368812936e-07, "loss": 1.4635, "step": 7900 }, { "epoch": 0.8760642812166343, "grad_norm": 3.996006727218628, "learning_rate": 4.617078627635019e-07, "loss": 1.4739, "step": 8000 }, { "epoch": 0.8870150847318422, "grad_norm": 3.8166754245758057, "learning_rate": 3.8481941620700127e-07, "loss": 1.4525, "step": 8100 }, { "epoch": 0.8979658882470501, "grad_norm": 4.163847923278809, "learning_rate": 3.146747478820938e-07, "loss": 1.4467, "step": 8200 }, { "epoch": 0.9089166917622581, "grad_norm": 3.3138253688812256, "learning_rate": 2.513763543387465e-07, "loss": 1.4893, "step": 8300 }, { "epoch": 0.919867495277466, "grad_norm": 3.702721118927002, "learning_rate": 1.9501672823693584e-07, "loss": 1.4246, "step": 8400 }, { "epoch": 0.9308182987926739, "grad_norm": 3.539092779159546, "learning_rate": 1.4567822319484614e-07, "loss": 1.456, "step": 8500 }, { "epoch": 0.9417691023078818, "grad_norm": 4.07131814956665, "learning_rate": 1.0343293345239702e-07, "loss": 1.4473, "step": 8600 }, { "epoch": 0.9527199058230897, "grad_norm": 3.898056745529175, "learning_rate": 6.834258852594866e-08, "loss": 1.4813, "step": 8700 }, { "epoch": 0.9636707093382977, "grad_norm": 3.525865316390991, "learning_rate": 4.045846300811229e-08, "loss": 1.4259, "step": 8800 }, { "epoch": 0.9746215128535056, "grad_norm": 3.3846275806427, "learning_rate": 1.9821301644462056e-08, "loss": 1.4595, "step": 8900 }, { "epoch": 0.9855723163687136, "grad_norm": 3.4979400634765625, "learning_rate": 6.461259796644026e-09, "loss": 1.4601, "step": 9000 }, { "epoch": 0.9965231198839215, "grad_norm": 3.377657413482666, "learning_rate": 3.978593788622753e-10, "loss": 1.4738, "step": 9100 }, { "epoch": 1.0, "step": 9132, "total_flos": 1.591591112898773e+18, "train_loss": 1.633884816748293, "train_runtime": 12935.5693, "train_samples_per_second": 45.179, "train_steps_per_second": 0.706 } ], "logging_steps": 100, "max_steps": 9132, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.591591112898773e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }