{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9980657640232108, "eval_steps": 100, "global_step": 258, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0038684719535783366, "grad_norm": 6.09610478125056e+18, "learning_rate": 0.0, "loss": 3.1242, "step": 1 }, { "epoch": 0.019342359767891684, "grad_norm": 2697.71337890625, "learning_rate": 3.0769230769230774e-06, "loss": 3.1772, "step": 5 }, { "epoch": 0.03868471953578337, "grad_norm": 25.10409927368164, "learning_rate": 6.923076923076923e-06, "loss": 2.8881, "step": 10 }, { "epoch": 0.058027079303675046, "grad_norm": 12.363713264465332, "learning_rate": 1.076923076923077e-05, "loss": 2.4609, "step": 15 }, { "epoch": 0.07736943907156674, "grad_norm": 22.165477752685547, "learning_rate": 1.4615384615384615e-05, "loss": 2.2508, "step": 20 }, { "epoch": 0.09671179883945841, "grad_norm": 14.86285400390625, "learning_rate": 1.8461538461538465e-05, "loss": 2.0898, "step": 25 }, { "epoch": 0.11605415860735009, "grad_norm": 7.5494914054870605, "learning_rate": 1.9991749570421146e-05, "loss": 1.9877, "step": 30 }, { "epoch": 0.13539651837524178, "grad_norm": 4.193073272705078, "learning_rate": 1.9941379571543597e-05, "loss": 1.8119, "step": 35 }, { "epoch": 0.15473887814313347, "grad_norm": 5.1745171546936035, "learning_rate": 1.984545368367337e-05, "loss": 1.7714, "step": 40 }, { "epoch": 0.17408123791102514, "grad_norm": 6.300442695617676, "learning_rate": 1.9704411482532116e-05, "loss": 1.7391, "step": 45 }, { "epoch": 0.19342359767891681, "grad_norm": 5.5538201332092285, "learning_rate": 1.9518899287155558e-05, "loss": 1.7088, "step": 50 }, { "epoch": 0.2127659574468085, "grad_norm": 4.729171276092529, "learning_rate": 1.9289767198167918e-05, "loss": 1.6553, "step": 55 }, { "epoch": 0.23210831721470018, "grad_norm": 2.736438512802124, "learning_rate": 1.9018065202237083e-05, "loss": 1.6234, "step": 60 }, { "epoch": 0.2514506769825919, "grad_norm": 2.781848192214966, "learning_rate": 1.8705038360561724e-05, "loss": 1.6005, "step": 65 }, { "epoch": 0.27079303675048355, "grad_norm": 30.041719436645508, "learning_rate": 1.8352121103438804e-05, "loss": 1.6161, "step": 70 }, { "epoch": 0.2901353965183752, "grad_norm": 2.318030595779419, "learning_rate": 1.796093065705644e-05, "loss": 1.5743, "step": 75 }, { "epoch": 0.30947775628626695, "grad_norm": 2.369173526763916, "learning_rate": 1.7533259632633443e-05, "loss": 1.537, "step": 80 }, { "epoch": 0.3288201160541586, "grad_norm": 8.767570495605469, "learning_rate": 1.7071067811865477e-05, "loss": 1.5716, "step": 85 }, { "epoch": 0.3481624758220503, "grad_norm": 2.4866902828216553, "learning_rate": 1.6576473166320644e-05, "loss": 1.5327, "step": 90 }, { "epoch": 0.36750483558994196, "grad_norm": 2.8688602447509766, "learning_rate": 1.6051742151937655e-05, "loss": 1.5223, "step": 95 }, { "epoch": 0.38684719535783363, "grad_norm": 9.360047340393066, "learning_rate": 1.549927932310155e-05, "loss": 1.5069, "step": 100 }, { "epoch": 0.38684719535783363, "eval_loss": 1.489111065864563, "eval_runtime": 4.7946, "eval_samples_per_second": 186.667, "eval_steps_per_second": 5.84, "step": 100 }, { "epoch": 0.40618955512572535, "grad_norm": 2.7148385047912598, "learning_rate": 1.4921616313890073e-05, "loss": 1.4631, "step": 105 }, { "epoch": 0.425531914893617, "grad_norm": 2.1432764530181885, "learning_rate": 1.4321400236983459e-05, "loss": 1.4502, "step": 110 }, { "epoch": 0.4448742746615087, "grad_norm": 2.373682975769043, "learning_rate": 1.3701381553399147e-05, "loss": 1.411, "step": 115 }, { "epoch": 0.46421663442940037, "grad_norm": 2.2317380905151367, "learning_rate": 1.3064401468637793e-05, "loss": 1.4031, "step": 120 }, { "epoch": 0.4835589941972921, "grad_norm": 1.9476709365844727, "learning_rate": 1.2413378912997058e-05, "loss": 1.3929, "step": 125 }, { "epoch": 0.5029013539651838, "grad_norm": 1.9585973024368286, "learning_rate": 1.175129716571531e-05, "loss": 1.3855, "step": 130 }, { "epoch": 0.5222437137330754, "grad_norm": 2.38059401512146, "learning_rate": 1.1081190184239418e-05, "loss": 1.3818, "step": 135 }, { "epoch": 0.5415860735009671, "grad_norm": 2.2399237155914307, "learning_rate": 1.0406128701262128e-05, "loss": 1.334, "step": 140 }, { "epoch": 0.5609284332688588, "grad_norm": 2.42098069190979, "learning_rate": 9.729206153238658e-06, "loss": 1.3365, "step": 145 }, { "epoch": 0.5802707930367504, "grad_norm": 2.278903007507324, "learning_rate": 9.053524504864391e-06, "loss": 1.3288, "step": 150 }, { "epoch": 0.5996131528046421, "grad_norm": 1.902079701423645, "learning_rate": 8.382180034472353e-06, "loss": 1.309, "step": 155 }, { "epoch": 0.6189555125725339, "grad_norm": 1.8283921480178833, "learning_rate": 7.718249145488143e-06, "loss": 1.3069, "step": 160 }, { "epoch": 0.6382978723404256, "grad_norm": 1.9884213209152222, "learning_rate": 7.064774268960654e-06, "loss": 1.2787, "step": 165 }, { "epoch": 0.6576402321083172, "grad_norm": 2.0930237770080566, "learning_rate": 6.4247499217695995e-06, "loss": 1.2678, "step": 170 }, { "epoch": 0.6769825918762089, "grad_norm": 2.522498369216919, "learning_rate": 5.801108984397355e-06, "loss": 1.2107, "step": 175 }, { "epoch": 0.6963249516441006, "grad_norm": 2.825383424758911, "learning_rate": 5.196709261146606e-06, "loss": 1.2417, "step": 180 }, { "epoch": 0.7156673114119922, "grad_norm": 2.353144645690918, "learning_rate": 4.614320384390959e-06, "loss": 1.2198, "step": 185 }, { "epoch": 0.7350096711798839, "grad_norm": 2.3441641330718994, "learning_rate": 4.056611122869106e-06, "loss": 1.2026, "step": 190 }, { "epoch": 0.7543520309477756, "grad_norm": 2.090709924697876, "learning_rate": 3.5261371521817247e-06, "loss": 1.1828, "step": 195 }, { "epoch": 0.7736943907156673, "grad_norm": 1.8887003660202026, "learning_rate": 3.0253293435321797e-06, "loss": 1.2171, "step": 200 }, { "epoch": 0.7736943907156673, "eval_loss": 1.193406581878662, "eval_runtime": 4.7625, "eval_samples_per_second": 187.926, "eval_steps_per_second": 5.879, "step": 200 }, { "epoch": 0.793036750483559, "grad_norm": 2.0174171924591064, "learning_rate": 2.5564826243772965e-06, "loss": 1.1818, "step": 205 }, { "epoch": 0.8123791102514507, "grad_norm": 2.1461687088012695, "learning_rate": 2.1217454620337842e-06, "loss": 1.1851, "step": 210 }, { "epoch": 0.8317214700193424, "grad_norm": 1.9792951345443726, "learning_rate": 1.7231100184310955e-06, "loss": 1.1799, "step": 215 }, { "epoch": 0.851063829787234, "grad_norm": 2.1498570442199707, "learning_rate": 1.3624030211261684e-06, "loss": 1.1748, "step": 220 }, { "epoch": 0.8704061895551257, "grad_norm": 2.3734147548675537, "learning_rate": 1.0412773924131202e-06, "loss": 1.1529, "step": 225 }, { "epoch": 0.8897485493230174, "grad_norm": 21.268768310546875, "learning_rate": 7.612046748871327e-07, "loss": 1.1486, "step": 230 }, { "epoch": 0.9090909090909091, "grad_norm": 2.0091605186462402, "learning_rate": 5.234682881719766e-07, "loss": 1.1451, "step": 235 }, { "epoch": 0.9284332688588007, "grad_norm": 1.8940573930740356, "learning_rate": 3.2915764771193294e-07, "loss": 1.1681, "step": 240 }, { "epoch": 0.9477756286266924, "grad_norm": 2.0212926864624023, "learning_rate": 1.791631725784404e-07, "loss": 1.1391, "step": 245 }, { "epoch": 0.9671179883945842, "grad_norm": 2.1692233085632324, "learning_rate": 7.4172205167945e-08, "loss": 1.1662, "step": 250 }, { "epoch": 0.9864603481624759, "grad_norm": 1.9714499711990356, "learning_rate": 1.4665861488761813e-08, "loss": 1.163, "step": 255 }, { "epoch": 0.9980657640232108, "step": 258, "total_flos": 9.517157411769549e+16, "train_loss": 1.5008004404777704, "train_runtime": 885.0234, "train_samples_per_second": 18.662, "train_steps_per_second": 0.292 } ], "logging_steps": 5, "max_steps": 258, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.517157411769549e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }