{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9862844814301125, "eval_steps": 500, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 5.028895375132561, "epoch": 0.015410695022345508, "grad_norm": 6.3125, "learning_rate": 4.897959183673469e-06, "loss": 7.5902880859375, "mean_token_accuracy": 0.09856362253893167, "num_tokens": 132392.0, "step": 25 }, { "entropy": 5.056147763133049, "epoch": 0.030821390044691015, "grad_norm": 3.28125, "learning_rate": 1e-05, "loss": 4.8427432250976565, "mean_token_accuracy": 0.33738345025107264, "num_tokens": 254164.0, "step": 50 }, { "entropy": 3.3333908554911615, "epoch": 0.04623208506703652, "grad_norm": 2.046875, "learning_rate": 9.841168996188057e-06, "loss": 3.2706063842773436, "mean_token_accuracy": 0.5363785127736628, "num_tokens": 383737.0, "step": 75 }, { "entropy": 2.5955499114096163, "epoch": 0.06164278008938203, "grad_norm": 1.6171875, "learning_rate": 9.682337992376113e-06, "loss": 2.6094720458984373, "mean_token_accuracy": 0.631682768985629, "num_tokens": 516457.0, "step": 100 }, { "entropy": 2.3073494301736357, "epoch": 0.07705347511172754, "grad_norm": 2.078125, "learning_rate": 9.523506988564168e-06, "loss": 2.3903713989257813, "mean_token_accuracy": 0.6567921816185117, "num_tokens": 636033.0, "step": 125 }, { "entropy": 2.3254229539632796, "epoch": 0.09246417013407304, "grad_norm": 1.7890625, "learning_rate": 9.364675984752224e-06, "loss": 2.4465567016601564, "mean_token_accuracy": 0.6496502718515694, "num_tokens": 777342.0, "step": 150 }, { "entropy": 2.2117018654197453, "epoch": 0.10787486515641856, "grad_norm": 1.484375, "learning_rate": 9.20584498094028e-06, "loss": 2.288543701171875, "mean_token_accuracy": 0.664428948648274, "num_tokens": 902453.0, "step": 175 }, { "entropy": 2.174446207880974, "epoch": 0.12328556017876406, "grad_norm": 1.5078125, "learning_rate": 9.047013977128337e-06, "loss": 2.327859649658203, "mean_token_accuracy": 0.6647384916990995, "num_tokens": 1039427.0, "step": 200 }, { "entropy": 2.0399726448208093, "epoch": 0.13869625520110956, "grad_norm": 1.640625, "learning_rate": 8.888182973316391e-06, "loss": 2.156089324951172, "mean_token_accuracy": 0.6871329558640719, "num_tokens": 1166840.0, "step": 225 }, { "entropy": 2.071619209870696, "epoch": 0.15410695022345508, "grad_norm": 2.203125, "learning_rate": 8.729351969504447e-06, "loss": 2.2405677795410157, "mean_token_accuracy": 0.680448934994638, "num_tokens": 1300789.0, "step": 250 }, { "entropy": 2.032237692028284, "epoch": 0.1695176452458006, "grad_norm": 1.78125, "learning_rate": 8.570520965692504e-06, "loss": 2.1679531860351564, "mean_token_accuracy": 0.6796054230630397, "num_tokens": 1432596.0, "step": 275 }, { "entropy": 1.9784896748512983, "epoch": 0.18492834026814609, "grad_norm": 1.109375, "learning_rate": 8.41168996188056e-06, "loss": 2.0713957214355467, "mean_token_accuracy": 0.6908648996800184, "num_tokens": 1559688.0, "step": 300 }, { "entropy": 1.9056549924612045, "epoch": 0.2003390352904916, "grad_norm": 1.171875, "learning_rate": 8.252858958068616e-06, "loss": 2.1153318786621096, "mean_token_accuracy": 0.7020338359847664, "num_tokens": 1688970.0, "step": 325 }, { "entropy": 1.9747070623934269, "epoch": 0.21574973031283712, "grad_norm": 1.8046875, "learning_rate": 8.09402795425667e-06, "loss": 2.1253143310546876, "mean_token_accuracy": 0.6871407954767347, "num_tokens": 1816182.0, "step": 350 }, { "entropy": 2.024077450931072, "epoch": 0.2311604253351826, "grad_norm": 1.09375, "learning_rate": 7.935196950444729e-06, "loss": 2.1842442321777344, "mean_token_accuracy": 0.677086523026228, "num_tokens": 1950644.0, "step": 375 }, { "entropy": 1.9184869919717311, "epoch": 0.24657112035752812, "grad_norm": 1.3203125, "learning_rate": 7.776365946632783e-06, "loss": 2.0085203552246096, "mean_token_accuracy": 0.694020996466279, "num_tokens": 2079928.0, "step": 400 }, { "entropy": 1.8541498044878244, "epoch": 0.26198181537987364, "grad_norm": 1.09375, "learning_rate": 7.617534942820839e-06, "loss": 2.0481745910644533, "mean_token_accuracy": 0.7019787009432912, "num_tokens": 2218227.0, "step": 425 }, { "entropy": 1.9318342459201814, "epoch": 0.27739251040221913, "grad_norm": 1.7734375, "learning_rate": 7.458703939008896e-06, "loss": 1.995413818359375, "mean_token_accuracy": 0.6925960695371032, "num_tokens": 2350679.0, "step": 450 }, { "entropy": 1.7835957117378711, "epoch": 0.29280320542456467, "grad_norm": 1.2734375, "learning_rate": 7.299872935196951e-06, "loss": 1.928093719482422, "mean_token_accuracy": 0.7172103912383317, "num_tokens": 2472981.0, "step": 475 }, { "entropy": 1.9411964005231857, "epoch": 0.30821390044691016, "grad_norm": 1.609375, "learning_rate": 7.141041931385007e-06, "loss": 2.106062774658203, "mean_token_accuracy": 0.6849398523569107, "num_tokens": 2614781.0, "step": 500 }, { "entropy": 1.7948928633891046, "epoch": 0.32362459546925565, "grad_norm": 1.859375, "learning_rate": 6.982210927573063e-06, "loss": 1.9581819152832032, "mean_token_accuracy": 0.70925975356251, "num_tokens": 2743217.0, "step": 525 }, { "entropy": 1.720666101127863, "epoch": 0.3390352904916012, "grad_norm": 1.5703125, "learning_rate": 6.823379923761118e-06, "loss": 1.8939352416992188, "mean_token_accuracy": 0.7219540763273835, "num_tokens": 2872564.0, "step": 550 }, { "entropy": 1.765952904894948, "epoch": 0.3544459855139467, "grad_norm": 1.1640625, "learning_rate": 6.6645489199491745e-06, "loss": 1.8741084289550782, "mean_token_accuracy": 0.7158531962707638, "num_tokens": 3003409.0, "step": 575 }, { "entropy": 1.7450720983743668, "epoch": 0.36985668053629217, "grad_norm": 1.53125, "learning_rate": 6.505717916137231e-06, "loss": 1.8735758972167968, "mean_token_accuracy": 0.7193968405947089, "num_tokens": 3134051.0, "step": 600 }, { "entropy": 1.7807146763801576, "epoch": 0.3852673755586377, "grad_norm": 1.5078125, "learning_rate": 6.346886912325286e-06, "loss": 1.89009521484375, "mean_token_accuracy": 0.7120489033311606, "num_tokens": 3272289.0, "step": 625 }, { "entropy": 1.688743471726775, "epoch": 0.4006780705809832, "grad_norm": 1.5390625, "learning_rate": 6.188055908513342e-06, "loss": 1.8537098693847656, "mean_token_accuracy": 0.7283642463758588, "num_tokens": 3395473.0, "step": 650 }, { "entropy": 1.681125262901187, "epoch": 0.4160887656033287, "grad_norm": 1.21875, "learning_rate": 6.029224904701399e-06, "loss": 1.7891111755371094, "mean_token_accuracy": 0.7292297334969043, "num_tokens": 3524237.0, "step": 675 }, { "entropy": 1.7346787237748504, "epoch": 0.43149946062567424, "grad_norm": 1.625, "learning_rate": 5.870393900889454e-06, "loss": 1.9180752563476562, "mean_token_accuracy": 0.716539504416287, "num_tokens": 3660040.0, "step": 700 }, { "entropy": 1.7547597530111672, "epoch": 0.4469101556480197, "grad_norm": 1.7109375, "learning_rate": 5.71156289707751e-06, "loss": 1.8909840393066406, "mean_token_accuracy": 0.7133365147560835, "num_tokens": 3791154.0, "step": 725 }, { "entropy": 1.792205568253994, "epoch": 0.4623208506703652, "grad_norm": 1.203125, "learning_rate": 5.552731893265566e-06, "loss": 1.93789306640625, "mean_token_accuracy": 0.7044468146562576, "num_tokens": 3930320.0, "step": 750 }, { "entropy": 1.801283170208335, "epoch": 0.47773154569271076, "grad_norm": 1.3125, "learning_rate": 5.393900889453621e-06, "loss": 1.971471405029297, "mean_token_accuracy": 0.702786465510726, "num_tokens": 4061353.0, "step": 775 }, { "entropy": 1.7702496079355479, "epoch": 0.49314224071505625, "grad_norm": 1.515625, "learning_rate": 5.235069885641678e-06, "loss": 1.8937255859375, "mean_token_accuracy": 0.7105545987561345, "num_tokens": 4188252.0, "step": 800 }, { "entropy": 1.746090711504221, "epoch": 0.5085529357374018, "grad_norm": 1.3671875, "learning_rate": 5.076238881829734e-06, "loss": 1.8904119873046874, "mean_token_accuracy": 0.710063861683011, "num_tokens": 4319900.0, "step": 825 }, { "entropy": 1.6968115794286132, "epoch": 0.5239636307597473, "grad_norm": 1.40625, "learning_rate": 4.91740787801779e-06, "loss": 1.8843421936035156, "mean_token_accuracy": 0.7204603585228324, "num_tokens": 4452384.0, "step": 850 }, { "entropy": 1.7424921029433609, "epoch": 0.5393743257820928, "grad_norm": 1.0625, "learning_rate": 4.758576874205845e-06, "loss": 1.8412220764160157, "mean_token_accuracy": 0.7172071708366274, "num_tokens": 4583222.0, "step": 875 }, { "entropy": 1.7446832180023193, "epoch": 0.5547850208044383, "grad_norm": 1.7109375, "learning_rate": 4.599745870393902e-06, "loss": 1.9152328491210937, "mean_token_accuracy": 0.7119575057178735, "num_tokens": 4715126.0, "step": 900 }, { "entropy": 1.7576309859752655, "epoch": 0.5701957158267837, "grad_norm": 1.96875, "learning_rate": 4.440914866581957e-06, "loss": 1.8862844848632812, "mean_token_accuracy": 0.7049576634168625, "num_tokens": 4850794.0, "step": 925 }, { "entropy": 1.7258573825657368, "epoch": 0.5856064108491293, "grad_norm": 1.1875, "learning_rate": 4.282083862770013e-06, "loss": 1.818639678955078, "mean_token_accuracy": 0.716189993545413, "num_tokens": 4981104.0, "step": 950 }, { "entropy": 1.6470270904898643, "epoch": 0.6010171058714748, "grad_norm": 1.2421875, "learning_rate": 4.123252858958069e-06, "loss": 1.7824436950683593, "mean_token_accuracy": 0.7320361129194498, "num_tokens": 5114019.0, "step": 975 }, { "entropy": 1.62755079947412, "epoch": 0.6164278008938203, "grad_norm": 1.015625, "learning_rate": 3.964421855146125e-06, "loss": 1.7079803466796875, "mean_token_accuracy": 0.7364445444941521, "num_tokens": 5243964.0, "step": 1000 }, { "entropy": 1.7295116788893938, "epoch": 0.6318384959161658, "grad_norm": 1.3125, "learning_rate": 3.8055908513341803e-06, "loss": 1.844159393310547, "mean_token_accuracy": 0.712419720813632, "num_tokens": 5377738.0, "step": 1025 }, { "entropy": 1.6804203514009715, "epoch": 0.6472491909385113, "grad_norm": 1.2734375, "learning_rate": 3.6467598475222366e-06, "loss": 1.8214646911621093, "mean_token_accuracy": 0.7212847074493766, "num_tokens": 5510981.0, "step": 1050 }, { "entropy": 1.5993686743080615, "epoch": 0.6626598859608568, "grad_norm": 1.2890625, "learning_rate": 3.4879288437102924e-06, "loss": 1.6954243469238282, "mean_token_accuracy": 0.7390070861950516, "num_tokens": 5633201.0, "step": 1075 }, { "entropy": 1.667136338762939, "epoch": 0.6780705809832024, "grad_norm": 1.4375, "learning_rate": 3.3290978398983487e-06, "loss": 1.7823049926757812, "mean_token_accuracy": 0.7207983901910484, "num_tokens": 5759874.0, "step": 1100 }, { "entropy": 1.678079522177577, "epoch": 0.6934812760055479, "grad_norm": 1.25, "learning_rate": 3.170266836086404e-06, "loss": 1.7977125549316406, "mean_token_accuracy": 0.7210700345411897, "num_tokens": 5883476.0, "step": 1125 }, { "entropy": 1.6764129892736674, "epoch": 0.7088919710278934, "grad_norm": 1.2578125, "learning_rate": 3.0114358322744603e-06, "loss": 1.8482670593261719, "mean_token_accuracy": 0.7182181442528963, "num_tokens": 6011086.0, "step": 1150 }, { "entropy": 1.7141736481338739, "epoch": 0.7243026660502389, "grad_norm": 2.078125, "learning_rate": 2.852604828462516e-06, "loss": 1.8182452392578126, "mean_token_accuracy": 0.7161370900273323, "num_tokens": 6146546.0, "step": 1175 }, { "entropy": 1.641965696439147, "epoch": 0.7397133610725843, "grad_norm": 1.375, "learning_rate": 2.693773824650572e-06, "loss": 1.779376220703125, "mean_token_accuracy": 0.7270625644922256, "num_tokens": 6281309.0, "step": 1200 }, { "entropy": 1.6535117710381746, "epoch": 0.7551240560949298, "grad_norm": 1.5703125, "learning_rate": 2.534942820838628e-06, "loss": 1.762202606201172, "mean_token_accuracy": 0.7227025451511144, "num_tokens": 6407111.0, "step": 1225 }, { "entropy": 1.5914642249792814, "epoch": 0.7705347511172754, "grad_norm": 1.2265625, "learning_rate": 2.376111817026684e-06, "loss": 1.7256475830078124, "mean_token_accuracy": 0.7357470904290676, "num_tokens": 6537257.0, "step": 1250 }, { "entropy": 1.7124161531031132, "epoch": 0.7859454461396209, "grad_norm": 1.2890625, "learning_rate": 2.21728081321474e-06, "loss": 1.8616783142089843, "mean_token_accuracy": 0.7149319493025541, "num_tokens": 6662719.0, "step": 1275 }, { "entropy": 1.6728860459476709, "epoch": 0.8013561411619664, "grad_norm": 1.0390625, "learning_rate": 2.0584498094027953e-06, "loss": 1.8268055725097656, "mean_token_accuracy": 0.7221832738444209, "num_tokens": 6798456.0, "step": 1300 }, { "entropy": 1.6639322647452355, "epoch": 0.8167668361843119, "grad_norm": 1.421875, "learning_rate": 1.8996188055908516e-06, "loss": 1.7712481689453126, "mean_token_accuracy": 0.72425989869982, "num_tokens": 6928111.0, "step": 1325 }, { "entropy": 1.6321870504319669, "epoch": 0.8321775312066574, "grad_norm": 1.4609375, "learning_rate": 1.7407878017789074e-06, "loss": 1.8233981323242188, "mean_token_accuracy": 0.728203468695283, "num_tokens": 7062876.0, "step": 1350 }, { "entropy": 1.8206283743306995, "epoch": 0.847588226229003, "grad_norm": 1.3671875, "learning_rate": 1.5819567979669634e-06, "loss": 1.9631840515136718, "mean_token_accuracy": 0.7005566702410578, "num_tokens": 7198189.0, "step": 1375 }, { "entropy": 1.6183030263334512, "epoch": 0.8629989212513485, "grad_norm": 1.171875, "learning_rate": 1.4231257941550193e-06, "loss": 1.7469532775878907, "mean_token_accuracy": 0.7299554903805255, "num_tokens": 7331299.0, "step": 1400 }, { "entropy": 1.6894471324980258, "epoch": 0.878409616273694, "grad_norm": 0.95703125, "learning_rate": 1.2642947903430749e-06, "loss": 1.819949951171875, "mean_token_accuracy": 0.7225298710912466, "num_tokens": 7468273.0, "step": 1425 }, { "entropy": 1.6117998372018336, "epoch": 0.8938203112960394, "grad_norm": 1.3125, "learning_rate": 1.105463786531131e-06, "loss": 1.7713958740234375, "mean_token_accuracy": 0.7342760527133941, "num_tokens": 7598957.0, "step": 1450 }, { "entropy": 1.7378646701574325, "epoch": 0.9092310063183849, "grad_norm": 1.109375, "learning_rate": 9.466327827191868e-07, "loss": 1.8703489685058594, "mean_token_accuracy": 0.7118765298649669, "num_tokens": 7731099.0, "step": 1475 }, { "entropy": 1.6067133033648133, "epoch": 0.9246417013407304, "grad_norm": 0.9453125, "learning_rate": 7.878017789072427e-07, "loss": 1.7261012268066407, "mean_token_accuracy": 0.7332322986423969, "num_tokens": 7858562.0, "step": 1500 }, { "entropy": 1.6211172859743237, "epoch": 0.940052396363076, "grad_norm": 1.1875, "learning_rate": 6.289707750952986e-07, "loss": 1.801383514404297, "mean_token_accuracy": 0.7289227614179253, "num_tokens": 7990016.0, "step": 1525 }, { "entropy": 1.6162557833641769, "epoch": 0.9554630913854215, "grad_norm": 1.125, "learning_rate": 4.7013977128335456e-07, "loss": 1.7632601928710938, "mean_token_accuracy": 0.7340098781138659, "num_tokens": 8128392.0, "step": 1550 }, { "entropy": 1.638201398998499, "epoch": 0.970873786407767, "grad_norm": 1.0546875, "learning_rate": 3.1130876747141044e-07, "loss": 1.7828684997558595, "mean_token_accuracy": 0.7265522088482976, "num_tokens": 8255611.0, "step": 1575 }, { "entropy": 1.742434518635273, "epoch": 0.9862844814301125, "grad_norm": 1.0546875, "learning_rate": 1.5247776365946635e-07, "loss": 1.8525875854492186, "mean_token_accuracy": 0.7128269827365875, "num_tokens": 8385670.0, "step": 1600 } ], "logging_steps": 25, "max_steps": 1623, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0231669797680909e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }