{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2011, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.8093053457140923, "epoch": 0.02486325211337643, "grad_norm": 3.875, "learning_rate": 7.277227722772277e-06, "loss": 1.7423, "mean_token_accuracy": 0.671543929874897, "num_tokens": 12997394.0, "step": 50 }, { "entropy": 1.2584100252389907, "epoch": 0.04972650422675286, "grad_norm": 0.62109375, "learning_rate": 1.4702970297029704e-05, "loss": 1.3055, "mean_token_accuracy": 0.7016283966600895, "num_tokens": 25982466.0, "step": 100 }, { "entropy": 1.1815398697555066, "epoch": 0.07458975634012929, "grad_norm": 0.5625, "learning_rate": 2.212871287128713e-05, "loss": 1.1874, "mean_token_accuracy": 0.7216299413144589, "num_tokens": 38965230.0, "step": 150 }, { "entropy": 1.1402550649642944, "epoch": 0.09945300845350571, "grad_norm": 0.5390625, "learning_rate": 2.9554455445544555e-05, "loss": 1.142, "mean_token_accuracy": 0.7295443145930767, "num_tokens": 51948935.0, "step": 200 }, { "entropy": 1.110574235022068, "epoch": 0.12431626056688215, "grad_norm": 0.5546875, "learning_rate": 2.9950061158395005e-05, "loss": 1.1098, "mean_token_accuracy": 0.7351739694178104, "num_tokens": 64951043.0, "step": 250 }, { "entropy": 1.0846257837116717, "epoch": 0.14917951268025859, "grad_norm": 0.5390625, "learning_rate": 2.9787675503104344e-05, "loss": 1.0836, "mean_token_accuracy": 0.7398869113624096, "num_tokens": 77936279.0, "step": 300 }, { "entropy": 1.0673179541528226, "epoch": 0.174042764793635, "grad_norm": 0.54296875, "learning_rate": 2.9513863052820796e-05, "loss": 1.0648, "mean_token_accuracy": 0.7432914447784423, "num_tokens": 90932336.0, "step": 350 }, { "entropy": 1.052229733467102, "epoch": 0.19890601690701143, "grad_norm": 0.5703125, "learning_rate": 2.913068701509568e-05, "loss": 1.049, "mean_token_accuracy": 0.746162725687027, "num_tokens": 103921567.0, "step": 400 }, { "entropy": 1.0345054648816585, "epoch": 0.22376926902038788, "grad_norm": 0.53515625, "learning_rate": 2.864103466438891e-05, "loss": 1.0312, "mean_token_accuracy": 0.7494943365454674, "num_tokens": 116912063.0, "step": 450 }, { "entropy": 1.0316937543451785, "epoch": 0.2486325211337643, "grad_norm": 0.53125, "learning_rate": 2.8048595586131855e-05, "loss": 1.0284, "mean_token_accuracy": 0.7500163938105107, "num_tokens": 129905218.0, "step": 500 }, { "entropy": 1.0241920906305313, "epoch": 0.27349577324714075, "grad_norm": 0.55078125, "learning_rate": 2.7357833875286894e-05, "loss": 1.0203, "mean_token_accuracy": 0.7514490978419781, "num_tokens": 142885565.0, "step": 550 }, { "entropy": 1.0306694743037224, "epoch": 0.29835902536051717, "grad_norm": 0.54296875, "learning_rate": 2.6573954498890738e-05, "loss": 1.0269, "mean_token_accuracy": 0.750104363411665, "num_tokens": 155872064.0, "step": 600 }, { "entropy": 1.0152570338547229, "epoch": 0.3232222774738936, "grad_norm": 0.5234375, "learning_rate": 2.5702864076043493e-05, "loss": 1.0108, "mean_token_accuracy": 0.7531800110638142, "num_tokens": 168867280.0, "step": 650 }, { "entropy": 1.0094216690957547, "epoch": 0.34808552958727, "grad_norm": 0.53125, "learning_rate": 2.4751126370870668e-05, "loss": 1.005, "mean_token_accuracy": 0.7541881857812405, "num_tokens": 181860562.0, "step": 700 }, { "entropy": 1.0075355672836304, "epoch": 0.37294878170064644, "grad_norm": 0.5703125, "learning_rate": 2.3725912833823623e-05, "loss": 1.0029, "mean_token_accuracy": 0.7544908618927002, "num_tokens": 194857393.0, "step": 750 }, { "entropy": 1.0026545779407023, "epoch": 0.39781203381402286, "grad_norm": 0.54296875, "learning_rate": 2.263494856399534e-05, "loss": 0.9983, "mean_token_accuracy": 0.7555115695297718, "num_tokens": 207830748.0, "step": 800 }, { "entropy": 0.9955900900065899, "epoch": 0.4226752859273993, "grad_norm": 0.55859375, "learning_rate": 2.148645409963155e-05, "loss": 0.991, "mean_token_accuracy": 0.7569138373434544, "num_tokens": 220811854.0, "step": 850 }, { "entropy": 0.9966490264236927, "epoch": 0.44753853804077576, "grad_norm": 0.5234375, "learning_rate": 2.0289083475452206e-05, "loss": 0.9917, "mean_token_accuracy": 0.7566816800832749, "num_tokens": 233809152.0, "step": 900 }, { "entropy": 0.9946993951499462, "epoch": 0.4724017901541522, "grad_norm": 0.5703125, "learning_rate": 1.9051859013528333e-05, "loss": 0.9902, "mean_token_accuracy": 0.7567966990172863, "num_tokens": 246795198.0, "step": 950 }, { "entropy": 0.9998035056889057, "epoch": 0.4972650422675286, "grad_norm": 0.50390625, "learning_rate": 1.7784103339072398e-05, "loss": 0.9943, "mean_token_accuracy": 0.7560001449286937, "num_tokens": 259785941.0, "step": 1000 }, { "entropy": 0.991586543917656, "epoch": 0.522128294380905, "grad_norm": 0.515625, "learning_rate": 1.649536913341075e-05, "loss": 0.9866, "mean_token_accuracy": 0.7574710394442081, "num_tokens": 272778072.0, "step": 1050 }, { "entropy": 0.9825779674947261, "epoch": 0.5469915464942815, "grad_norm": 0.5234375, "learning_rate": 1.5195367153457442e-05, "loss": 0.9782, "mean_token_accuracy": 0.7593892233073711, "num_tokens": 285763198.0, "step": 1100 }, { "entropy": 0.9868022166192532, "epoch": 0.5718547986076579, "grad_norm": 0.52734375, "learning_rate": 1.3893893060070753e-05, "loss": 0.982, "mean_token_accuracy": 0.7586962369084358, "num_tokens": 298755470.0, "step": 1150 }, { "entropy": 0.9881981492042542, "epoch": 0.5967180507210343, "grad_norm": 0.51953125, "learning_rate": 1.260075360664893e-05, "loss": 0.983, "mean_token_accuracy": 0.7583484600484371, "num_tokens": 311732964.0, "step": 1200 }, { "entropy": 0.990956412255764, "epoch": 0.6215813028344107, "grad_norm": 0.54296875, "learning_rate": 1.1325692744142444e-05, "loss": 0.9855, "mean_token_accuracy": 0.7576845416426659, "num_tokens": 324722969.0, "step": 1250 }, { "entropy": 0.9845810621976853, "epoch": 0.6464445549477872, "grad_norm": 0.5234375, "learning_rate": 1.0078318199289694e-05, "loss": 0.98, "mean_token_accuracy": 0.7590868937969207, "num_tokens": 337708515.0, "step": 1300 }, { "entropy": 0.9848950608074665, "epoch": 0.6713078070611636, "grad_norm": 0.51171875, "learning_rate": 8.868029079317466e-06, "loss": 0.9794, "mean_token_accuracy": 0.7590427026152611, "num_tokens": 350694304.0, "step": 1350 }, { "entropy": 0.9880955889821053, "epoch": 0.69617105917454, "grad_norm": 0.5234375, "learning_rate": 7.703945048612838e-06, "loss": 0.9829, "mean_token_accuracy": 0.758459353595972, "num_tokens": 363687831.0, "step": 1400 }, { "entropy": 0.9795867702364922, "epoch": 0.7210343112879165, "grad_norm": 0.5, "learning_rate": 6.594837611028224e-06, "loss": 0.9748, "mean_token_accuracy": 0.760101655125618, "num_tokens": 376685856.0, "step": 1450 }, { "entropy": 0.9814675351977349, "epoch": 0.7458975634012929, "grad_norm": 0.51171875, "learning_rate": 5.549064015615166e-06, "loss": 0.9759, "mean_token_accuracy": 0.7595324893295765, "num_tokens": 389669946.0, "step": 1500 }, { "entropy": 0.9784578867256641, "epoch": 0.7707608155146694, "grad_norm": 0.515625, "learning_rate": 4.574504283814536e-06, "loss": 0.9735, "mean_token_accuracy": 0.7601935516297817, "num_tokens": 402664698.0, "step": 1550 }, { "entropy": 0.9806207512319088, "epoch": 0.7956240676280457, "grad_norm": 0.52734375, "learning_rate": 3.67850183261035e-06, "loss": 0.9754, "mean_token_accuracy": 0.7597657778859138, "num_tokens": 415650519.0, "step": 1600 }, { "entropy": 0.981166479587555, "epoch": 0.8204873197414222, "grad_norm": 0.498046875, "learning_rate": 2.8678081410584267e-06, "loss": 0.9759, "mean_token_accuracy": 0.7597530463337898, "num_tokens": 428641539.0, "step": 1650 }, { "entropy": 0.9809097257256508, "epoch": 0.8453505718547986, "grad_norm": 0.50390625, "learning_rate": 2.1485318771337776e-06, "loss": 0.9757, "mean_token_accuracy": 0.7594838063418865, "num_tokens": 441636832.0, "step": 1700 }, { "entropy": 0.9838246862590313, "epoch": 0.870213823968175, "grad_norm": 0.4921875, "learning_rate": 1.5260928682316267e-06, "loss": 0.9788, "mean_token_accuracy": 0.7591392694413662, "num_tokens": 454630338.0, "step": 1750 }, { "entropy": 0.9816241288185119, "epoch": 0.8950770760815515, "grad_norm": 0.51171875, "learning_rate": 1.0051812621595896e-06, "loss": 0.9763, "mean_token_accuracy": 0.7593851044774056, "num_tokens": 467615642.0, "step": 1800 }, { "entropy": 0.9775546994805336, "epoch": 0.9199403281949279, "grad_norm": 0.51953125, "learning_rate": 5.89722186347399e-07, "loss": 0.9727, "mean_token_accuracy": 0.7600829027593136, "num_tokens": 480607840.0, "step": 1850 }, { "entropy": 0.9843708150088787, "epoch": 0.9448035803083044, "grad_norm": 0.498046875, "learning_rate": 2.828461715710806e-07, "loss": 0.9798, "mean_token_accuracy": 0.7588261432945729, "num_tokens": 493591064.0, "step": 1900 }, { "entropy": 0.9855945162475109, "epoch": 0.9696668324216807, "grad_norm": 0.5234375, "learning_rate": 8.686556305214144e-08, "loss": 0.9812, "mean_token_accuracy": 0.7586447390913963, "num_tokens": 506570590.0, "step": 1950 }, { "entropy": 0.9880490954220295, "epoch": 0.9945300845350572, "grad_norm": 0.51171875, "learning_rate": 3.2570966767209166e-09, "loss": 0.9837, "mean_token_accuracy": 0.7583207984268665, "num_tokens": 519553398.0, "step": 2000 } ], "logging_steps": 50, "max_steps": 2011, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.417734959016444e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }