936 lines
27 KiB
JSON
936 lines
27 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 500,
|
|
"global_step": 897,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 0.8498557328246534,
|
|
"epoch": 0.011154489682097044,
|
|
"grad_norm": 44.5,
|
|
"learning_rate": 1e-05,
|
|
"loss": 3.704855728149414,
|
|
"mean_token_accuracy": 0.5304131177254021,
|
|
"num_tokens": 833799.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 1.765550174564123,
|
|
"epoch": 0.022308979364194088,
|
|
"grad_norm": 10.25,
|
|
"learning_rate": 2.111111111111111e-05,
|
|
"loss": 1.9117172241210938,
|
|
"mean_token_accuracy": 0.6469556039199233,
|
|
"num_tokens": 1671910.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 0.8713466321351007,
|
|
"epoch": 0.03346346904629113,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 3.222222222222223e-05,
|
|
"loss": 0.8604758262634278,
|
|
"mean_token_accuracy": 0.8343592453747988,
|
|
"num_tokens": 2510443.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 0.23340068878605963,
|
|
"epoch": 0.044617958728388175,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 4.3333333333333334e-05,
|
|
"loss": 0.24464244842529298,
|
|
"mean_token_accuracy": 0.9471705831587315,
|
|
"num_tokens": 3345229.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 0.16152286342112349,
|
|
"epoch": 0.05577244841048522,
|
|
"grad_norm": 3.171875,
|
|
"learning_rate": 4.9997280790439974e-05,
|
|
"loss": 0.17044107913970946,
|
|
"mean_token_accuracy": 0.9553312979638576,
|
|
"num_tokens": 4184871.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 0.1448873324552551,
|
|
"epoch": 0.06692693809258227,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 4.996669647581318e-05,
|
|
"loss": 0.14741255044937135,
|
|
"mean_token_accuracy": 0.9585917994379998,
|
|
"num_tokens": 5039068.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 0.12383970170631073,
|
|
"epoch": 0.0780814277746793,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 4.990217055187362e-05,
|
|
"loss": 0.12599575519561768,
|
|
"mean_token_accuracy": 0.9621855434030294,
|
|
"num_tokens": 5904036.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 0.11301726293168031,
|
|
"epoch": 0.08923591745677635,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 4.980379074002661e-05,
|
|
"loss": 0.11982399225234985,
|
|
"mean_token_accuracy": 0.964000066742301,
|
|
"num_tokens": 6747251.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 0.10463761446881108,
|
|
"epoch": 0.1003904071388734,
|
|
"grad_norm": 0.71484375,
|
|
"learning_rate": 4.967169078520476e-05,
|
|
"loss": 0.11220132112503052,
|
|
"mean_token_accuracy": 0.9658373668789864,
|
|
"num_tokens": 7572769.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 0.10532618285797071,
|
|
"epoch": 0.11154489682097044,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 4.9506050274045076e-05,
|
|
"loss": 0.11419826745986938,
|
|
"mean_token_accuracy": 0.965171106159687,
|
|
"num_tokens": 8410764.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 0.10305561173590831,
|
|
"epoch": 0.12269938650306748,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 4.930709439074528e-05,
|
|
"loss": 0.10990087985992432,
|
|
"mean_token_accuracy": 0.9659170717000961,
|
|
"num_tokens": 9255104.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 0.10211670671415049,
|
|
"epoch": 0.13385387618516453,
|
|
"grad_norm": 0.76953125,
|
|
"learning_rate": 4.90750936109315e-05,
|
|
"loss": 0.11037271022796631,
|
|
"mean_token_accuracy": 0.966075143776834,
|
|
"num_tokens": 10103000.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 0.09717915701621678,
|
|
"epoch": 0.14500836586726157,
|
|
"grad_norm": 0.73046875,
|
|
"learning_rate": 4.881036333395329e-05,
|
|
"loss": 0.10295262336730956,
|
|
"mean_token_accuracy": 0.9677550513297319,
|
|
"num_tokens": 10945768.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 0.09635820150724612,
|
|
"epoch": 0.1561628555493586,
|
|
"grad_norm": 0.7265625,
|
|
"learning_rate": 4.851326345410594e-05,
|
|
"loss": 0.10199121236801148,
|
|
"mean_token_accuracy": 0.9681327627971769,
|
|
"num_tokens": 11801129.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 0.09002169943414629,
|
|
"epoch": 0.16731734523145567,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 4.818419787136311e-05,
|
|
"loss": 0.09567687511444092,
|
|
"mean_token_accuracy": 0.9701874911785126,
|
|
"num_tokens": 12636424.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 0.09068954657413997,
|
|
"epoch": 0.1784718349135527,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 4.782361394228472e-05,
|
|
"loss": 0.0969263732433319,
|
|
"mean_token_accuracy": 0.9701515214517713,
|
|
"num_tokens": 13490177.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 0.08610689677589108,
|
|
"epoch": 0.18962632459564974,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 4.74320018718467e-05,
|
|
"loss": 0.08986451625823974,
|
|
"mean_token_accuracy": 0.9720516135916114,
|
|
"num_tokens": 14328572.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 0.08233372264367063,
|
|
"epoch": 0.2007808142777468,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 4.700989404701941e-05,
|
|
"loss": 0.08586806058883667,
|
|
"mean_token_accuracy": 0.9734413396567106,
|
|
"num_tokens": 15189703.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 0.07736555864394176,
|
|
"epoch": 0.21193530395984383,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 4.6557864313000695e-05,
|
|
"loss": 0.07913717031478881,
|
|
"mean_token_accuracy": 0.9759283743798732,
|
|
"num_tokens": 16028746.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 0.07013691037718672,
|
|
"epoch": 0.22308979364194087,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 4.60765271930874e-05,
|
|
"loss": 0.07151558399200439,
|
|
"mean_token_accuracy": 0.978206392377615,
|
|
"num_tokens": 16849567.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 0.07075640709081199,
|
|
"epoch": 0.23424428332403793,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 4.55665370532461e-05,
|
|
"loss": 0.07073653936386108,
|
|
"mean_token_accuracy": 0.9784815656021237,
|
|
"num_tokens": 17705656.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 0.06637019099725876,
|
|
"epoch": 0.24539877300613497,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 4.5028587212518705e-05,
|
|
"loss": 0.06597371697425843,
|
|
"mean_token_accuracy": 0.979678837954998,
|
|
"num_tokens": 18565065.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 0.05923618896049447,
|
|
"epoch": 0.25655326268823203,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 4.4463409000472234e-05,
|
|
"loss": 0.058509671688079835,
|
|
"mean_token_accuracy": 0.9822878390550613,
|
|
"num_tokens": 19392725.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 0.05874249367916491,
|
|
"epoch": 0.26770775237032907,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 4.3871770762974306e-05,
|
|
"loss": 0.05813463926315308,
|
|
"mean_token_accuracy": 0.9824939148500562,
|
|
"num_tokens": 20223583.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 0.05689131756371353,
|
|
"epoch": 0.2788622420524261,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 4.325447681764586e-05,
|
|
"loss": 0.055121219158172606,
|
|
"mean_token_accuracy": 0.9830958772450685,
|
|
"num_tokens": 21085157.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 0.05158787120017223,
|
|
"epoch": 0.29001673173452314,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 4.261236636041108e-05,
|
|
"loss": 0.05031982660293579,
|
|
"mean_token_accuracy": 0.984761236794293,
|
|
"num_tokens": 21904752.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 0.05179886775440536,
|
|
"epoch": 0.30117122141662017,
|
|
"grad_norm": 0.78515625,
|
|
"learning_rate": 4.194631232463128e-05,
|
|
"loss": 0.0493065744638443,
|
|
"mean_token_accuracy": 0.9848343567922712,
|
|
"num_tokens": 22765981.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 0.049823664524592456,
|
|
"epoch": 0.3123257110987172,
|
|
"grad_norm": 0.62890625,
|
|
"learning_rate": 4.1257220194373424e-05,
|
|
"loss": 0.04740493595600128,
|
|
"mean_token_accuracy": 0.9855156386271119,
|
|
"num_tokens": 23621926.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 0.04758051415265072,
|
|
"epoch": 0.3234802007808143,
|
|
"grad_norm": 0.5,
|
|
"learning_rate": 4.054602677342684e-05,
|
|
"loss": 0.04637431204319,
|
|
"mean_token_accuracy": 0.9858794504776597,
|
|
"num_tokens": 24465607.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 0.04668422270915471,
|
|
"epoch": 0.33463469046291133,
|
|
"grad_norm": 0.474609375,
|
|
"learning_rate": 3.981369891174155e-05,
|
|
"loss": 0.04507455825805664,
|
|
"mean_token_accuracy": 0.9860366908833385,
|
|
"num_tokens": 25299933.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 0.04640703263867181,
|
|
"epoch": 0.34578918014500837,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 3.906123219101952e-05,
|
|
"loss": 0.04516075849533081,
|
|
"mean_token_accuracy": 0.9859529983252286,
|
|
"num_tokens": 26154681.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 0.04438853256579023,
|
|
"epoch": 0.3569436698271054,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 3.8289649571245885e-05,
|
|
"loss": 0.044096818566322325,
|
|
"mean_token_accuracy": 0.986466808244586,
|
|
"num_tokens": 26983541.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 0.044660908347577785,
|
|
"epoch": 0.36809815950920244,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 3.7500000000000003e-05,
|
|
"loss": 0.043841251730918886,
|
|
"mean_token_accuracy": 0.986228640563786,
|
|
"num_tokens": 27833414.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 0.043841811595484614,
|
|
"epoch": 0.3792526491912995,
|
|
"grad_norm": 0.328125,
|
|
"learning_rate": 3.669335698643704e-05,
|
|
"loss": 0.04326000213623047,
|
|
"mean_token_accuracy": 0.9865183688700199,
|
|
"num_tokens": 28677577.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 0.04371235728031024,
|
|
"epoch": 0.39040713887339656,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 3.587081714187874e-05,
|
|
"loss": 0.043233224749565126,
|
|
"mean_token_accuracy": 0.9865481941029429,
|
|
"num_tokens": 29512895.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 0.04327065504912753,
|
|
"epoch": 0.4015616285554936,
|
|
"grad_norm": 0.330078125,
|
|
"learning_rate": 3.503349868899722e-05,
|
|
"loss": 0.04288822710514069,
|
|
"mean_token_accuracy": 0.986665309779346,
|
|
"num_tokens": 30343018.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 0.04256358170532622,
|
|
"epoch": 0.41271611823759063,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 3.418253994161892e-05,
|
|
"loss": 0.042832252383232114,
|
|
"mean_token_accuracy": 0.9867880517616868,
|
|
"num_tokens": 31158940.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 0.04262932341953274,
|
|
"epoch": 0.42387060791968767,
|
|
"grad_norm": 0.30078125,
|
|
"learning_rate": 3.3319097757214843e-05,
|
|
"loss": 0.04222110211849213,
|
|
"mean_token_accuracy": 0.9866109801456332,
|
|
"num_tokens": 32008434.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 0.04266308699734509,
|
|
"epoch": 0.4350250976017847,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 3.244434596418139e-05,
|
|
"loss": 0.042472487688064574,
|
|
"mean_token_accuracy": 0.9866344403475523,
|
|
"num_tokens": 32854651.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 0.0429983379173791,
|
|
"epoch": 0.44617958728388174,
|
|
"grad_norm": 0.357421875,
|
|
"learning_rate": 3.155947376604948e-05,
|
|
"loss": 0.04324407577514648,
|
|
"mean_token_accuracy": 0.9865379808470607,
|
|
"num_tokens": 33705423.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 0.04282604140753392,
|
|
"epoch": 0.45733407696597883,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 3.066568412479167e-05,
|
|
"loss": 0.04259026348590851,
|
|
"mean_token_accuracy": 0.9867103593423963,
|
|
"num_tokens": 34538632.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 0.04212904951127712,
|
|
"epoch": 0.46848856664807587,
|
|
"grad_norm": 0.4140625,
|
|
"learning_rate": 2.976419212542495e-05,
|
|
"loss": 0.04252048432826996,
|
|
"mean_token_accuracy": 0.9867507757619023,
|
|
"num_tokens": 35381587.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 0.0412595656584017,
|
|
"epoch": 0.4796430563301729,
|
|
"grad_norm": 0.3125,
|
|
"learning_rate": 2.885622332413256e-05,
|
|
"loss": 0.041145503520965576,
|
|
"mean_token_accuracy": 0.9870552903041243,
|
|
"num_tokens": 36234112.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 0.041778112881002014,
|
|
"epoch": 0.49079754601226994,
|
|
"grad_norm": 0.349609375,
|
|
"learning_rate": 2.7943012082150533e-05,
|
|
"loss": 0.041335687041282654,
|
|
"mean_token_accuracy": 0.9867611099034548,
|
|
"num_tokens": 37077497.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 0.04076959296362474,
|
|
"epoch": 0.501952035694367,
|
|
"grad_norm": 0.318359375,
|
|
"learning_rate": 2.7025799887684002e-05,
|
|
"loss": 0.041106203198432924,
|
|
"mean_token_accuracy": 0.9871867259964346,
|
|
"num_tokens": 37919261.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 0.04188558856840245,
|
|
"epoch": 0.5131065253764641,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 2.6105833668134473e-05,
|
|
"loss": 0.041896390914916995,
|
|
"mean_token_accuracy": 0.9867892485111952,
|
|
"num_tokens": 38782505.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 0.04178600409068167,
|
|
"epoch": 0.524261015058561,
|
|
"grad_norm": 0.3359375,
|
|
"learning_rate": 2.518436409493281e-05,
|
|
"loss": 0.04188077747821808,
|
|
"mean_token_accuracy": 0.9868596900254488,
|
|
"num_tokens": 39610893.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 0.04152031776320655,
|
|
"epoch": 0.5354155047406581,
|
|
"grad_norm": 0.267578125,
|
|
"learning_rate": 2.426264388328214e-05,
|
|
"loss": 0.04174352586269379,
|
|
"mean_token_accuracy": 0.9868257040157914,
|
|
"num_tokens": 40427636.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 0.040754605704569256,
|
|
"epoch": 0.5465699944227551,
|
|
"grad_norm": 0.3125,
|
|
"learning_rate": 2.334192608912241e-05,
|
|
"loss": 0.04108997285366058,
|
|
"mean_token_accuracy": 0.9870152780786157,
|
|
"num_tokens": 41252001.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 0.042179943548399025,
|
|
"epoch": 0.5577244841048522,
|
|
"grad_norm": 0.353515625,
|
|
"learning_rate": 2.2423462405631616e-05,
|
|
"loss": 0.04207477867603302,
|
|
"mean_token_accuracy": 0.9866394894197583,
|
|
"num_tokens": 42113694.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 0.04122589101898484,
|
|
"epoch": 0.5688789737869493,
|
|
"grad_norm": 0.330078125,
|
|
"learning_rate": 2.150850146157985e-05,
|
|
"loss": 0.04146281182765961,
|
|
"mean_token_accuracy": 0.9869526766240597,
|
|
"num_tokens": 42941138.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 0.04114197726012207,
|
|
"epoch": 0.5800334634690463,
|
|
"grad_norm": 0.345703125,
|
|
"learning_rate": 2.0598287123849095e-05,
|
|
"loss": 0.040973353385925296,
|
|
"mean_token_accuracy": 0.9871221456676722,
|
|
"num_tokens": 43770822.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 0.04192428553069476,
|
|
"epoch": 0.5911879531511434,
|
|
"grad_norm": 0.302734375,
|
|
"learning_rate": 1.9694056806426928e-05,
|
|
"loss": 0.04169855713844299,
|
|
"mean_token_accuracy": 0.9866715084761382,
|
|
"num_tokens": 44637866.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 0.03977415001136251,
|
|
"epoch": 0.6023424428332403,
|
|
"grad_norm": 0.314453125,
|
|
"learning_rate": 1.879703978817256e-05,
|
|
"loss": 0.04036850333213806,
|
|
"mean_token_accuracy": 0.9872556058689952,
|
|
"num_tokens": 45453923.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 0.04234540155448485,
|
|
"epoch": 0.6134969325153374,
|
|
"grad_norm": 0.33203125,
|
|
"learning_rate": 1.7908455541642584e-05,
|
|
"loss": 0.04180983603000641,
|
|
"mean_token_accuracy": 0.9865613304078579,
|
|
"num_tokens": 46306551.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 0.041263596300268546,
|
|
"epoch": 0.6246514221974344,
|
|
"grad_norm": 0.31640625,
|
|
"learning_rate": 1.7029512075247967e-05,
|
|
"loss": 0.04135525822639465,
|
|
"mean_token_accuracy": 0.986899808421731,
|
|
"num_tokens": 47143518.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 0.04119314953277353,
|
|
"epoch": 0.6358059118795315,
|
|
"grad_norm": 0.3203125,
|
|
"learning_rate": 1.6161404290996412e-05,
|
|
"loss": 0.04146760106086731,
|
|
"mean_token_accuracy": 0.9868535120040178,
|
|
"num_tokens": 47992113.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 0.04120078657870181,
|
|
"epoch": 0.6469604015616286,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 1.5305312360052442e-05,
|
|
"loss": 0.0413068950176239,
|
|
"mean_token_accuracy": 0.986842698045075,
|
|
"num_tokens": 48843712.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 0.04128208919428289,
|
|
"epoch": 0.6581148912437256,
|
|
"grad_norm": 0.326171875,
|
|
"learning_rate": 1.4462400118323798e-05,
|
|
"loss": 0.041500210762023926,
|
|
"mean_token_accuracy": 0.9869369497522712,
|
|
"num_tokens": 49688129.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 0.04088454471202567,
|
|
"epoch": 0.6692693809258227,
|
|
"grad_norm": 0.33203125,
|
|
"learning_rate": 1.3633813484255131e-05,
|
|
"loss": 0.041133826971054076,
|
|
"mean_token_accuracy": 0.9869873868301511,
|
|
"num_tokens": 50520741.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 0.04227327090920881,
|
|
"epoch": 0.6804238706079196,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 1.2820678900980093e-05,
|
|
"loss": 0.04190162420272827,
|
|
"mean_token_accuracy": 0.9865590412169695,
|
|
"num_tokens": 51392294.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 0.04065559499140363,
|
|
"epoch": 0.6915783602900167,
|
|
"grad_norm": 0.328125,
|
|
"learning_rate": 1.2024101804949638e-05,
|
|
"loss": 0.04115171730518341,
|
|
"mean_token_accuracy": 0.9869800698012113,
|
|
"num_tokens": 52240430.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 0.042412614575005135,
|
|
"epoch": 0.7027328499721138,
|
|
"grad_norm": 0.3203125,
|
|
"learning_rate": 1.124516512311836e-05,
|
|
"loss": 0.04237264692783356,
|
|
"mean_token_accuracy": 0.9865791719406843,
|
|
"num_tokens": 53087953.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 0.04051031620183494,
|
|
"epoch": 0.7138873396542108,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 1.0484927800731984e-05,
|
|
"loss": 0.040881377458572385,
|
|
"mean_token_accuracy": 0.9869993204250932,
|
|
"num_tokens": 53927989.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 0.04193990352796391,
|
|
"epoch": 0.7250418293363079,
|
|
"grad_norm": 0.298828125,
|
|
"learning_rate": 9.744423361717323e-06,
|
|
"loss": 0.04187402129173279,
|
|
"mean_token_accuracy": 0.9866631610319019,
|
|
"num_tokens": 54774541.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 0.03943951329856645,
|
|
"epoch": 0.7361963190184049,
|
|
"grad_norm": 0.3125,
|
|
"learning_rate": 9.024658503631967e-06,
|
|
"loss": 0.04017325043678284,
|
|
"mean_token_accuracy": 0.9874097904190421,
|
|
"num_tokens": 55607613.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 0.041415746443090026,
|
|
"epoch": 0.747350808700502,
|
|
"grad_norm": 0.296875,
|
|
"learning_rate": 8.32661172908373e-06,
|
|
"loss": 0.04164916574954987,
|
|
"mean_token_accuracy": 0.9868578946217894,
|
|
"num_tokens": 56444089.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 0.04108071085065603,
|
|
"epoch": 0.758505298382599,
|
|
"grad_norm": 0.265625,
|
|
"learning_rate": 7.651232015480462e-06,
|
|
"loss": 0.04107390642166138,
|
|
"mean_token_accuracy": 0.986830660328269,
|
|
"num_tokens": 57290368.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 0.04229205273441039,
|
|
"epoch": 0.769659788064696,
|
|
"grad_norm": 0.306640625,
|
|
"learning_rate": 6.99943752491857e-06,
|
|
"loss": 0.04177336990833282,
|
|
"mean_token_accuracy": 0.9866125296801329,
|
|
"num_tokens": 58128968.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 0.041548075363971294,
|
|
"epoch": 0.7808142777467931,
|
|
"grad_norm": 0.279296875,
|
|
"learning_rate": 6.372114355964293e-06,
|
|
"loss": 0.04167112410068512,
|
|
"mean_token_accuracy": 0.9867573702707887,
|
|
"num_tokens": 58984460.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 0.0403200296277646,
|
|
"epoch": 0.7919687674288901,
|
|
"grad_norm": 0.306640625,
|
|
"learning_rate": 5.770115339024484e-06,
|
|
"loss": 0.04050106704235077,
|
|
"mean_token_accuracy": 0.9871903322637081,
|
|
"num_tokens": 59827367.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 0.0406710500101326,
|
|
"epoch": 0.8031232571109872,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 5.194258876944705e-06,
|
|
"loss": 0.04084862470626831,
|
|
"mean_token_accuracy": 0.9871157312765717,
|
|
"num_tokens": 60654050.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 0.040411298532853836,
|
|
"epoch": 0.8142777467930842,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 4.645327832410648e-06,
|
|
"loss": 0.040474030375480655,
|
|
"mean_token_accuracy": 0.9871165057644248,
|
|
"num_tokens": 61488530.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 0.04196117307874374,
|
|
"epoch": 0.8254322364751813,
|
|
"grad_norm": 0.310546875,
|
|
"learning_rate": 4.12406846366562e-06,
|
|
"loss": 0.04189785122871399,
|
|
"mean_token_accuracy": 0.9867719961330295,
|
|
"num_tokens": 62326744.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 0.040386362894787455,
|
|
"epoch": 0.8365867261572784,
|
|
"grad_norm": 0.2734375,
|
|
"learning_rate": 3.631189409990815e-06,
|
|
"loss": 0.04039705097675324,
|
|
"mean_token_accuracy": 0.9871017251163721,
|
|
"num_tokens": 63170753.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 0.040684799235896206,
|
|
"epoch": 0.8477412158393753,
|
|
"grad_norm": 0.265625,
|
|
"learning_rate": 3.1673607283276813e-06,
|
|
"loss": 0.04109015464782715,
|
|
"mean_token_accuracy": 0.9869557719677686,
|
|
"num_tokens": 64013316.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 0.042057951152673925,
|
|
"epoch": 0.8588957055214724,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 2.733212982351957e-06,
|
|
"loss": 0.04174878001213074,
|
|
"mean_token_accuracy": 0.986495653167367,
|
|
"num_tokens": 64863074.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 0.04044588297838345,
|
|
"epoch": 0.8700501952035694,
|
|
"grad_norm": 0.283203125,
|
|
"learning_rate": 2.3293363852379125e-06,
|
|
"loss": 0.04043938219547272,
|
|
"mean_token_accuracy": 0.9872395290061832,
|
|
"num_tokens": 65709101.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 0.04246396276575979,
|
|
"epoch": 0.8812046848856665,
|
|
"grad_norm": 0.275390625,
|
|
"learning_rate": 1.956279997278043e-06,
|
|
"loss": 0.041996100544929506,
|
|
"mean_token_accuracy": 0.9866539994254708,
|
|
"num_tokens": 66553598.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 0.041515190360951235,
|
|
"epoch": 0.8923591745677635,
|
|
"grad_norm": 0.322265625,
|
|
"learning_rate": 1.6145509794491364e-06,
|
|
"loss": 0.041551712155342105,
|
|
"mean_token_accuracy": 0.9867776447907091,
|
|
"num_tokens": 67405428.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 0.04269145799044054,
|
|
"epoch": 0.9035136642498606,
|
|
"grad_norm": 0.283203125,
|
|
"learning_rate": 1.3046139039394e-06,
|
|
"loss": 0.042556726932525636,
|
|
"mean_token_accuracy": 0.9866596391424537,
|
|
"num_tokens": 68245631.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 0.04082234081579372,
|
|
"epoch": 0.9146681539319577,
|
|
"grad_norm": 0.30859375,
|
|
"learning_rate": 1.026890122573998e-06,
|
|
"loss": 0.04080590307712555,
|
|
"mean_token_accuracy": 0.987078714184463,
|
|
"num_tokens": 69069024.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 0.04289137564774137,
|
|
"epoch": 0.9258226436140546,
|
|
"grad_norm": 0.29296875,
|
|
"learning_rate": 7.817571939976288e-07,
|
|
"loss": 0.04283967912197113,
|
|
"mean_token_accuracy": 0.9864464558660984,
|
|
"num_tokens": 69903803.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 0.041589401010423896,
|
|
"epoch": 0.9369771332961517,
|
|
"grad_norm": 0.2890625,
|
|
"learning_rate": 5.695483703928306e-07,
|
|
"loss": 0.04148242473602295,
|
|
"mean_token_accuracy": 0.9868634788319468,
|
|
"num_tokens": 70747562.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 0.04105772517505102,
|
|
"epoch": 0.9481316229782487,
|
|
"grad_norm": 0.271484375,
|
|
"learning_rate": 3.905521444318605e-07,
|
|
"loss": 0.04133652150630951,
|
|
"mean_token_accuracy": 0.986898991279304,
|
|
"num_tokens": 71601623.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 0.040625668261782266,
|
|
"epoch": 0.9592861126603458,
|
|
"grad_norm": 0.322265625,
|
|
"learning_rate": 2.450118570779786e-07,
|
|
"loss": 0.04106319844722748,
|
|
"mean_token_accuracy": 0.9870263114571571,
|
|
"num_tokens": 72449591.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 0.041737568736425604,
|
|
"epoch": 0.9704406023424428,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 1.3312536676942377e-07,
|
|
"loss": 0.04145269989967346,
|
|
"mean_token_accuracy": 0.9866915429010987,
|
|
"num_tokens": 73305562.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 0.04121337772812694,
|
|
"epoch": 0.9815950920245399,
|
|
"grad_norm": 0.28515625,
|
|
"learning_rate": 5.5044780435722923e-08,
|
|
"loss": 0.04093064665794373,
|
|
"mean_token_accuracy": 0.986900057643652,
|
|
"num_tokens": 74143208.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 0.040503930338309145,
|
|
"epoch": 0.992749581706637,
|
|
"grad_norm": 0.287109375,
|
|
"learning_rate": 1.0876246712074322e-08,
|
|
"loss": 0.0403281182050705,
|
|
"mean_token_accuracy": 0.9871442951261997,
|
|
"num_tokens": 74991508.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"eval_entropy": 0.041321987748146057,
|
|
"eval_loss": 0.03999410942196846,
|
|
"eval_mean_token_accuracy": 0.9868998980522156,
|
|
"eval_num_tokens": 75541075.0,
|
|
"eval_runtime": 50.0483,
|
|
"eval_samples_per_second": 19.981,
|
|
"eval_steps_per_second": 9.99,
|
|
"step": 897
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 897,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 5.495254912950835e+16,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|