5004 lines
133 KiB
JSON
5004 lines
133 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 4.6268656716417915,
|
|
"eval_steps": 500,
|
|
"global_step": 620,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.007462686567164179,
|
|
"grad_norm": 11.35859680736035,
|
|
"learning_rate": 0.0,
|
|
"loss": 1.047095537185669,
|
|
"num_tokens": 940173.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.014925373134328358,
|
|
"grad_norm": 11.310520487616877,
|
|
"learning_rate": 5.263157894736843e-07,
|
|
"loss": 1.0946075916290283,
|
|
"num_tokens": 1940908.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.022388059701492536,
|
|
"grad_norm": 11.106569322922516,
|
|
"learning_rate": 1.0526315789473685e-06,
|
|
"loss": 1.0278105735778809,
|
|
"num_tokens": 2857302.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.029850746268656716,
|
|
"grad_norm": 10.881054443812134,
|
|
"learning_rate": 1.5789473684210526e-06,
|
|
"loss": 1.0398736000061035,
|
|
"num_tokens": 3696299.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.03731343283582089,
|
|
"grad_norm": 10.448295115598174,
|
|
"learning_rate": 2.105263157894737e-06,
|
|
"loss": 1.0615425109863281,
|
|
"num_tokens": 4528104.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.04477611940298507,
|
|
"grad_norm": 10.151241780828355,
|
|
"learning_rate": 2.631578947368421e-06,
|
|
"loss": 1.0268486738204956,
|
|
"num_tokens": 5554518.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.05223880597014925,
|
|
"grad_norm": 8.119312484055971,
|
|
"learning_rate": 3.157894736842105e-06,
|
|
"loss": 0.9329569935798645,
|
|
"num_tokens": 6422948.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.05970149253731343,
|
|
"grad_norm": 7.409758964343402,
|
|
"learning_rate": 3.6842105263157896e-06,
|
|
"loss": 0.8917287588119507,
|
|
"num_tokens": 7201431.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.06716417910447761,
|
|
"grad_norm": 5.971479536888058,
|
|
"learning_rate": 4.210526315789474e-06,
|
|
"loss": 0.8006043434143066,
|
|
"num_tokens": 8128474.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.07462686567164178,
|
|
"grad_norm": 3.4445244902185927,
|
|
"learning_rate": 4.736842105263158e-06,
|
|
"loss": 0.7708431482315063,
|
|
"num_tokens": 9073762.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.08208955223880597,
|
|
"grad_norm": 2.227913040407572,
|
|
"learning_rate": 5.263157894736842e-06,
|
|
"loss": 0.689713716506958,
|
|
"num_tokens": 9950348.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.08955223880597014,
|
|
"grad_norm": 1.8665254369252244,
|
|
"learning_rate": 5.789473684210527e-06,
|
|
"loss": 0.7132350206375122,
|
|
"num_tokens": 10884740.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.09701492537313433,
|
|
"grad_norm": 2.952404437976229,
|
|
"learning_rate": 6.31578947368421e-06,
|
|
"loss": 0.713362455368042,
|
|
"num_tokens": 11697616.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.1044776119402985,
|
|
"grad_norm": 2.826605099421276,
|
|
"learning_rate": 6.842105263157896e-06,
|
|
"loss": 0.6958507895469666,
|
|
"num_tokens": 12632232.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.11194029850746269,
|
|
"grad_norm": 2.4454572403082926,
|
|
"learning_rate": 7.368421052631579e-06,
|
|
"loss": 0.6733378171920776,
|
|
"num_tokens": 13568493.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.11940298507462686,
|
|
"grad_norm": 2.0537063830263924,
|
|
"learning_rate": 7.894736842105265e-06,
|
|
"loss": 0.6741904020309448,
|
|
"num_tokens": 14533820.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.12686567164179105,
|
|
"grad_norm": 1.4727507656008452,
|
|
"learning_rate": 8.421052631578948e-06,
|
|
"loss": 0.6536232829093933,
|
|
"num_tokens": 15435498.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.13432835820895522,
|
|
"grad_norm": 1.054376608380898,
|
|
"learning_rate": 8.947368421052632e-06,
|
|
"loss": 0.6000441312789917,
|
|
"num_tokens": 16351791.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.1417910447761194,
|
|
"grad_norm": 0.9835940111044099,
|
|
"learning_rate": 9.473684210526315e-06,
|
|
"loss": 0.6027337312698364,
|
|
"num_tokens": 17276920.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.14925373134328357,
|
|
"grad_norm": 0.916308840098788,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6199864149093628,
|
|
"num_tokens": 18270172.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.15671641791044777,
|
|
"grad_norm": 0.6212633844448718,
|
|
"learning_rate": 9.999938520216343e-06,
|
|
"loss": 0.5760895609855652,
|
|
"num_tokens": 19308005.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.16417910447761194,
|
|
"grad_norm": 0.5315615385439493,
|
|
"learning_rate": 9.999754082545261e-06,
|
|
"loss": 0.5423388481140137,
|
|
"num_tokens": 20162217.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.17164179104477612,
|
|
"grad_norm": 0.5852277738108399,
|
|
"learning_rate": 9.999446692026396e-06,
|
|
"loss": 0.5618520975112915,
|
|
"num_tokens": 20980497.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.1791044776119403,
|
|
"grad_norm": 0.5256536336611786,
|
|
"learning_rate": 9.999016357058996e-06,
|
|
"loss": 0.5482994914054871,
|
|
"num_tokens": 21857362.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.1865671641791045,
|
|
"grad_norm": 0.436253543862231,
|
|
"learning_rate": 9.99846308940168e-06,
|
|
"loss": 0.5038638710975647,
|
|
"num_tokens": 22792620.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.19402985074626866,
|
|
"grad_norm": 0.47872306271108794,
|
|
"learning_rate": 9.997786904172126e-06,
|
|
"loss": 0.5729074478149414,
|
|
"num_tokens": 23723110.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.20149253731343283,
|
|
"grad_norm": 0.3887165593913177,
|
|
"learning_rate": 9.996987819846656e-06,
|
|
"loss": 0.5251473188400269,
|
|
"num_tokens": 24725024.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.208955223880597,
|
|
"grad_norm": 0.4864210479565411,
|
|
"learning_rate": 9.996065858259729e-06,
|
|
"loss": 0.560759425163269,
|
|
"num_tokens": 25729987.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.21641791044776118,
|
|
"grad_norm": 0.4545327828204722,
|
|
"learning_rate": 9.995021044603343e-06,
|
|
"loss": 0.5304505825042725,
|
|
"num_tokens": 26557013.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.22388059701492538,
|
|
"grad_norm": 0.369912070212526,
|
|
"learning_rate": 9.993853407426353e-06,
|
|
"loss": 0.5103640556335449,
|
|
"num_tokens": 27503464.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.23134328358208955,
|
|
"grad_norm": 0.32843421942348455,
|
|
"learning_rate": 9.99256297863368e-06,
|
|
"loss": 0.5005761384963989,
|
|
"num_tokens": 28533732.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.23880597014925373,
|
|
"grad_norm": 0.36571377121484666,
|
|
"learning_rate": 9.991149793485453e-06,
|
|
"loss": 0.5339782238006592,
|
|
"num_tokens": 29340667.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.2462686567164179,
|
|
"grad_norm": 0.3706600251055638,
|
|
"learning_rate": 9.989613890596034e-06,
|
|
"loss": 0.5353128910064697,
|
|
"num_tokens": 30210961.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.2537313432835821,
|
|
"grad_norm": 0.3689913973205178,
|
|
"learning_rate": 9.987955311932968e-06,
|
|
"loss": 0.5166599750518799,
|
|
"num_tokens": 31101886.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.26119402985074625,
|
|
"grad_norm": 0.33967789101967927,
|
|
"learning_rate": 9.986174102815837e-06,
|
|
"loss": 0.5018597841262817,
|
|
"num_tokens": 31897310.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.26865671641791045,
|
|
"grad_norm": 0.34077171626781105,
|
|
"learning_rate": 9.984270311915019e-06,
|
|
"loss": 0.48667871952056885,
|
|
"num_tokens": 32540943.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.27611940298507465,
|
|
"grad_norm": 0.3621091474207233,
|
|
"learning_rate": 9.982243991250359e-06,
|
|
"loss": 0.5088210105895996,
|
|
"num_tokens": 33542067.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.2835820895522388,
|
|
"grad_norm": 0.3534080682731624,
|
|
"learning_rate": 9.980095196189748e-06,
|
|
"loss": 0.4913540482521057,
|
|
"num_tokens": 34504224.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.291044776119403,
|
|
"grad_norm": 0.34385148887540573,
|
|
"learning_rate": 9.977823985447613e-06,
|
|
"loss": 0.5291423797607422,
|
|
"num_tokens": 35410799.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.29850746268656714,
|
|
"grad_norm": 0.3614616882970318,
|
|
"learning_rate": 9.975430421083307e-06,
|
|
"loss": 0.5238292217254639,
|
|
"num_tokens": 36306291.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.30597014925373134,
|
|
"grad_norm": 0.34380854428467267,
|
|
"learning_rate": 9.972914568499412e-06,
|
|
"loss": 0.49555328488349915,
|
|
"num_tokens": 37195796.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.31343283582089554,
|
|
"grad_norm": 0.32872739996760125,
|
|
"learning_rate": 9.970276496439967e-06,
|
|
"loss": 0.48128455877304077,
|
|
"num_tokens": 38111088.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.3208955223880597,
|
|
"grad_norm": 0.32224419409640415,
|
|
"learning_rate": 9.967516276988569e-06,
|
|
"loss": 0.47381213307380676,
|
|
"num_tokens": 38854783.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.3283582089552239,
|
|
"grad_norm": 0.313605152437139,
|
|
"learning_rate": 9.964633985566412e-06,
|
|
"loss": 0.4922352433204651,
|
|
"num_tokens": 39832057.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.3358208955223881,
|
|
"grad_norm": 0.3221801938329887,
|
|
"learning_rate": 9.961629700930236e-06,
|
|
"loss": 0.5065716505050659,
|
|
"num_tokens": 40758959.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.34328358208955223,
|
|
"grad_norm": 0.34336243037288433,
|
|
"learning_rate": 9.958503505170158e-06,
|
|
"loss": 0.4985169470310211,
|
|
"num_tokens": 41744543.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.35074626865671643,
|
|
"grad_norm": 0.323405267106758,
|
|
"learning_rate": 9.95525548370744e-06,
|
|
"loss": 0.4811803996562958,
|
|
"num_tokens": 42685398.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.3582089552238806,
|
|
"grad_norm": 0.3472754733495145,
|
|
"learning_rate": 9.951885725292152e-06,
|
|
"loss": 0.4971832036972046,
|
|
"num_tokens": 43509328.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.3656716417910448,
|
|
"grad_norm": 0.30314939517994505,
|
|
"learning_rate": 9.948394322000747e-06,
|
|
"loss": 0.4676430821418762,
|
|
"num_tokens": 44360961.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.373134328358209,
|
|
"grad_norm": 0.3115400700181878,
|
|
"learning_rate": 9.944781369233544e-06,
|
|
"loss": 0.4450893700122833,
|
|
"num_tokens": 45215408.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.3805970149253731,
|
|
"grad_norm": 0.3274967224701377,
|
|
"learning_rate": 9.941046965712124e-06,
|
|
"loss": 0.4661027491092682,
|
|
"num_tokens": 46008801.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.3880597014925373,
|
|
"grad_norm": 0.3185260501598265,
|
|
"learning_rate": 9.937191213476627e-06,
|
|
"loss": 0.45998284220695496,
|
|
"num_tokens": 46857304.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.39552238805970147,
|
|
"grad_norm": 0.3187630499897143,
|
|
"learning_rate": 9.933214217882973e-06,
|
|
"loss": 0.49932676553726196,
|
|
"num_tokens": 47835515.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.40298507462686567,
|
|
"grad_norm": 0.3126440220395918,
|
|
"learning_rate": 9.929116087599973e-06,
|
|
"loss": 0.49588972330093384,
|
|
"num_tokens": 48834826.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.41044776119402987,
|
|
"grad_norm": 0.31909099806625735,
|
|
"learning_rate": 9.924896934606365e-06,
|
|
"loss": 0.49547284841537476,
|
|
"num_tokens": 49858718.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.417910447761194,
|
|
"grad_norm": 0.2999327415505548,
|
|
"learning_rate": 9.920556874187757e-06,
|
|
"loss": 0.45831602811813354,
|
|
"num_tokens": 50784650.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.4253731343283582,
|
|
"grad_norm": 0.33478138187870804,
|
|
"learning_rate": 9.91609602493347e-06,
|
|
"loss": 0.44470953941345215,
|
|
"num_tokens": 51788903.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.43283582089552236,
|
|
"grad_norm": 0.3098385124963181,
|
|
"learning_rate": 9.911514508733307e-06,
|
|
"loss": 0.48413345217704773,
|
|
"num_tokens": 52740886.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.44029850746268656,
|
|
"grad_norm": 0.31570000266376347,
|
|
"learning_rate": 9.906812450774207e-06,
|
|
"loss": 0.5016104578971863,
|
|
"num_tokens": 53671576.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.44776119402985076,
|
|
"grad_norm": 0.3184241179650494,
|
|
"learning_rate": 9.901989979536841e-06,
|
|
"loss": 0.4333784580230713,
|
|
"num_tokens": 54565325.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.4552238805970149,
|
|
"grad_norm": 0.3257766657124954,
|
|
"learning_rate": 9.897047226792093e-06,
|
|
"loss": 0.47651222348213196,
|
|
"num_tokens": 55458901.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.4626865671641791,
|
|
"grad_norm": 0.2817242291155619,
|
|
"learning_rate": 9.891984327597462e-06,
|
|
"loss": 0.4714818000793457,
|
|
"num_tokens": 56519373.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.4701492537313433,
|
|
"grad_norm": 0.32585513855646564,
|
|
"learning_rate": 9.886801420293365e-06,
|
|
"loss": 0.4708700180053711,
|
|
"num_tokens": 57420562.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.47761194029850745,
|
|
"grad_norm": 0.32958409535328365,
|
|
"learning_rate": 9.88149864649937e-06,
|
|
"loss": 0.49606209993362427,
|
|
"num_tokens": 58259052.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.48507462686567165,
|
|
"grad_norm": 0.31230811419608556,
|
|
"learning_rate": 9.876076151110313e-06,
|
|
"loss": 0.4840630888938904,
|
|
"num_tokens": 59121922.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.4925373134328358,
|
|
"grad_norm": 0.31050271225919246,
|
|
"learning_rate": 9.870534082292349e-06,
|
|
"loss": 0.4600119888782501,
|
|
"num_tokens": 60031785.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 0.2885380845506061,
|
|
"learning_rate": 9.864872591478895e-06,
|
|
"loss": 0.44136810302734375,
|
|
"num_tokens": 60972704.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.5074626865671642,
|
|
"grad_norm": 0.28887203572406756,
|
|
"learning_rate": 9.859091833366498e-06,
|
|
"loss": 0.4619043469429016,
|
|
"num_tokens": 61912202.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.5149253731343284,
|
|
"grad_norm": 0.297913211640831,
|
|
"learning_rate": 9.853191965910606e-06,
|
|
"loss": 0.48681432008743286,
|
|
"num_tokens": 62799081.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.5223880597014925,
|
|
"grad_norm": 0.2978081791490928,
|
|
"learning_rate": 9.847173150321252e-06,
|
|
"loss": 0.4710129499435425,
|
|
"num_tokens": 63821360.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.5298507462686567,
|
|
"grad_norm": 0.33901428896502994,
|
|
"learning_rate": 9.84103555105865e-06,
|
|
"loss": 0.46070268750190735,
|
|
"num_tokens": 64698236.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.5373134328358209,
|
|
"grad_norm": 0.2863724536535567,
|
|
"learning_rate": 9.8347793358287e-06,
|
|
"loss": 0.43551623821258545,
|
|
"num_tokens": 65531533.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.5447761194029851,
|
|
"grad_norm": 0.30884498358581325,
|
|
"learning_rate": 9.828404675578405e-06,
|
|
"loss": 0.43174412846565247,
|
|
"num_tokens": 66409682.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.5522388059701493,
|
|
"grad_norm": 0.39653106497260543,
|
|
"learning_rate": 9.821911744491203e-06,
|
|
"loss": 0.47224926948547363,
|
|
"num_tokens": 67201739.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.5597014925373134,
|
|
"grad_norm": 0.34427781009373076,
|
|
"learning_rate": 9.815300719982204e-06,
|
|
"loss": 0.46234217286109924,
|
|
"num_tokens": 68054610.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.5671641791044776,
|
|
"grad_norm": 0.28593313207513976,
|
|
"learning_rate": 9.808571782693345e-06,
|
|
"loss": 0.4445508122444153,
|
|
"num_tokens": 68905436.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.5746268656716418,
|
|
"grad_norm": 0.27754253103287374,
|
|
"learning_rate": 9.80172511648845e-06,
|
|
"loss": 0.4535985291004181,
|
|
"num_tokens": 69815159.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.582089552238806,
|
|
"grad_norm": 0.2751626726169941,
|
|
"learning_rate": 9.794760908448215e-06,
|
|
"loss": 0.4778493642807007,
|
|
"num_tokens": 70800960.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.5895522388059702,
|
|
"grad_norm": 0.2878195146653705,
|
|
"learning_rate": 9.787679348865082e-06,
|
|
"loss": 0.43559134006500244,
|
|
"num_tokens": 71706284.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.5970149253731343,
|
|
"grad_norm": 0.3046702186252135,
|
|
"learning_rate": 9.780480631238052e-06,
|
|
"loss": 0.45745372772216797,
|
|
"num_tokens": 72585611.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.6044776119402985,
|
|
"grad_norm": 0.2580161347993156,
|
|
"learning_rate": 9.773164952267394e-06,
|
|
"loss": 0.44172853231430054,
|
|
"num_tokens": 73603712.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.6119402985074627,
|
|
"grad_norm": 0.31823458045045494,
|
|
"learning_rate": 9.765732511849269e-06,
|
|
"loss": 0.4543741047382355,
|
|
"num_tokens": 74510353.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.6194029850746269,
|
|
"grad_norm": 0.3262276808903542,
|
|
"learning_rate": 9.758183513070266e-06,
|
|
"loss": 0.48102468252182007,
|
|
"num_tokens": 75426311.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.6268656716417911,
|
|
"grad_norm": 0.298246592306743,
|
|
"learning_rate": 9.750518162201858e-06,
|
|
"loss": 0.45155635476112366,
|
|
"num_tokens": 76290512.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.6343283582089553,
|
|
"grad_norm": 0.30840978846450423,
|
|
"learning_rate": 9.74273666869476e-06,
|
|
"loss": 0.4398882985115051,
|
|
"num_tokens": 77207410.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.6417910447761194,
|
|
"grad_norm": 0.2986447882814022,
|
|
"learning_rate": 9.734839245173213e-06,
|
|
"loss": 0.43722379207611084,
|
|
"num_tokens": 78061170.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.6492537313432836,
|
|
"grad_norm": 0.3213308600234638,
|
|
"learning_rate": 9.726826107429168e-06,
|
|
"loss": 0.44796180725097656,
|
|
"num_tokens": 78868118.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.6567164179104478,
|
|
"grad_norm": 0.3249532753373927,
|
|
"learning_rate": 9.71869747441639e-06,
|
|
"loss": 0.4503297805786133,
|
|
"num_tokens": 79869363.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.664179104477612,
|
|
"grad_norm": 0.5892356895414527,
|
|
"learning_rate": 9.71045356824448e-06,
|
|
"loss": 0.4414302110671997,
|
|
"num_tokens": 80709876.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.6716417910447762,
|
|
"grad_norm": 0.32884534307528746,
|
|
"learning_rate": 9.7020946141728e-06,
|
|
"loss": 0.42054399847984314,
|
|
"num_tokens": 81535856.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.6791044776119403,
|
|
"grad_norm": 0.2754517512669749,
|
|
"learning_rate": 9.693620840604326e-06,
|
|
"loss": 0.4349040985107422,
|
|
"num_tokens": 82583455.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.6865671641791045,
|
|
"grad_norm": 0.3190387165435769,
|
|
"learning_rate": 9.685032479079394e-06,
|
|
"loss": 0.44351187348365784,
|
|
"num_tokens": 83425036.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.6940298507462687,
|
|
"grad_norm": 0.29203678336341016,
|
|
"learning_rate": 9.676329764269385e-06,
|
|
"loss": 0.4587559103965759,
|
|
"num_tokens": 84446952.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.7014925373134329,
|
|
"grad_norm": 0.2977218953461726,
|
|
"learning_rate": 9.667512933970315e-06,
|
|
"loss": 0.429887980222702,
|
|
"num_tokens": 85254048.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.7089552238805971,
|
|
"grad_norm": 0.319328445980617,
|
|
"learning_rate": 9.65858222909632e-06,
|
|
"loss": 0.4590649902820587,
|
|
"num_tokens": 86163467.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.7164179104477612,
|
|
"grad_norm": 0.5444784762173913,
|
|
"learning_rate": 9.649537893673096e-06,
|
|
"loss": 0.4472053647041321,
|
|
"num_tokens": 86980140.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.7238805970149254,
|
|
"grad_norm": 0.33070572527793457,
|
|
"learning_rate": 9.640380174831209e-06,
|
|
"loss": 0.44589415192604065,
|
|
"num_tokens": 87928454.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.7313432835820896,
|
|
"grad_norm": 0.31480720093895037,
|
|
"learning_rate": 9.631109322799362e-06,
|
|
"loss": 0.45890533924102783,
|
|
"num_tokens": 88687125.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.7388059701492538,
|
|
"grad_norm": 0.3045515849614143,
|
|
"learning_rate": 9.621725590897544e-06,
|
|
"loss": 0.4472447633743286,
|
|
"num_tokens": 89545040.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.746268656716418,
|
|
"grad_norm": 0.31053505819411625,
|
|
"learning_rate": 9.61222923553011e-06,
|
|
"loss": 0.44827064871788025,
|
|
"num_tokens": 90294885.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.753731343283582,
|
|
"grad_norm": 0.3029175634429252,
|
|
"learning_rate": 9.60262051617879e-06,
|
|
"loss": 0.4412766695022583,
|
|
"num_tokens": 91184198.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.7611940298507462,
|
|
"grad_norm": 0.31643279761949383,
|
|
"learning_rate": 9.592899695395569e-06,
|
|
"loss": 0.4483514428138733,
|
|
"num_tokens": 91984545.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.7686567164179104,
|
|
"grad_norm": 0.29772953486777926,
|
|
"learning_rate": 9.583067038795547e-06,
|
|
"loss": 0.48575955629348755,
|
|
"num_tokens": 92895986.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.7761194029850746,
|
|
"grad_norm": 0.3103900650504769,
|
|
"learning_rate": 9.57312281504965e-06,
|
|
"loss": 0.4450864791870117,
|
|
"num_tokens": 93788383.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.7835820895522388,
|
|
"grad_norm": 0.2842262724404981,
|
|
"learning_rate": 9.563067295877319e-06,
|
|
"loss": 0.4178208112716675,
|
|
"num_tokens": 94636525.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.7910447761194029,
|
|
"grad_norm": 0.318233292303752,
|
|
"learning_rate": 9.552900756039057e-06,
|
|
"loss": 0.48816001415252686,
|
|
"num_tokens": 95397416.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.7985074626865671,
|
|
"grad_norm": 0.3031459599411157,
|
|
"learning_rate": 9.54262347332894e-06,
|
|
"loss": 0.4687079191207886,
|
|
"num_tokens": 96224288.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.8059701492537313,
|
|
"grad_norm": 0.3044834471531261,
|
|
"learning_rate": 9.532235728567025e-06,
|
|
"loss": 0.4333556890487671,
|
|
"num_tokens": 97053744.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.8134328358208955,
|
|
"grad_norm": 0.382174488436462,
|
|
"learning_rate": 9.521737805591662e-06,
|
|
"loss": 0.45386844873428345,
|
|
"num_tokens": 97941243.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.8208955223880597,
|
|
"grad_norm": 0.29853935870773984,
|
|
"learning_rate": 9.511129991251755e-06,
|
|
"loss": 0.4180367588996887,
|
|
"num_tokens": 98814023.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.8283582089552238,
|
|
"grad_norm": 0.3152812743712433,
|
|
"learning_rate": 9.500412575398923e-06,
|
|
"loss": 0.45900076627731323,
|
|
"num_tokens": 99770911.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.835820895522388,
|
|
"grad_norm": 0.2798327916645599,
|
|
"learning_rate": 9.489585850879565e-06,
|
|
"loss": 0.4589983820915222,
|
|
"num_tokens": 100802886.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.8432835820895522,
|
|
"grad_norm": 0.3302819245429099,
|
|
"learning_rate": 9.478650113526875e-06,
|
|
"loss": 0.44858676195144653,
|
|
"num_tokens": 101744970.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.8507462686567164,
|
|
"grad_norm": 0.29962088349132515,
|
|
"learning_rate": 9.467605662152746e-06,
|
|
"loss": 0.4746031165122986,
|
|
"num_tokens": 102730722.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.8582089552238806,
|
|
"grad_norm": 0.2939144591705004,
|
|
"learning_rate": 9.456452798539617e-06,
|
|
"loss": 0.4174093008041382,
|
|
"num_tokens": 103574949.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.8656716417910447,
|
|
"grad_norm": 0.3825239836099086,
|
|
"learning_rate": 9.445191827432216e-06,
|
|
"loss": 0.439868301153183,
|
|
"num_tokens": 104504791.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.8731343283582089,
|
|
"grad_norm": 0.30386076772048964,
|
|
"learning_rate": 9.433823056529241e-06,
|
|
"loss": 0.47291260957717896,
|
|
"num_tokens": 105479834.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.8805970149253731,
|
|
"grad_norm": 0.2762720558491326,
|
|
"learning_rate": 9.42234679647495e-06,
|
|
"loss": 0.4426780045032501,
|
|
"num_tokens": 106438084.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.8880597014925373,
|
|
"grad_norm": 0.3057508592926945,
|
|
"learning_rate": 9.410763360850666e-06,
|
|
"loss": 0.4623616933822632,
|
|
"num_tokens": 107262750.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.8955223880597015,
|
|
"grad_norm": 0.3127855621446368,
|
|
"learning_rate": 9.399073066166218e-06,
|
|
"loss": 0.4572855234146118,
|
|
"num_tokens": 108143548.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.9029850746268657,
|
|
"grad_norm": 0.32166755849704814,
|
|
"learning_rate": 9.387276231851292e-06,
|
|
"loss": 0.4610549211502075,
|
|
"num_tokens": 109031239.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.9104477611940298,
|
|
"grad_norm": 0.308391680528446,
|
|
"learning_rate": 9.375373180246698e-06,
|
|
"loss": 0.4695647358894348,
|
|
"num_tokens": 109986382.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.917910447761194,
|
|
"grad_norm": 0.2975657588114746,
|
|
"learning_rate": 9.363364236595561e-06,
|
|
"loss": 0.47796621918678284,
|
|
"num_tokens": 110966120.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.9253731343283582,
|
|
"grad_norm": 0.31052979583373397,
|
|
"learning_rate": 9.351249729034441e-06,
|
|
"loss": 0.46253445744514465,
|
|
"num_tokens": 111841748.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.9328358208955224,
|
|
"grad_norm": 0.30804176635348807,
|
|
"learning_rate": 9.339029988584364e-06,
|
|
"loss": 0.45033249258995056,
|
|
"num_tokens": 112797621.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.9402985074626866,
|
|
"grad_norm": 0.2896323126815727,
|
|
"learning_rate": 9.326705349141772e-06,
|
|
"loss": 0.46928197145462036,
|
|
"num_tokens": 113854322.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.9477611940298507,
|
|
"grad_norm": 0.2863377703738466,
|
|
"learning_rate": 9.31427614746941e-06,
|
|
"loss": 0.44036608934402466,
|
|
"num_tokens": 114797592.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.9552238805970149,
|
|
"grad_norm": 0.3136460841921916,
|
|
"learning_rate": 9.301742723187106e-06,
|
|
"loss": 0.4462299644947052,
|
|
"num_tokens": 115756574.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.9626865671641791,
|
|
"grad_norm": 0.30712216569223755,
|
|
"learning_rate": 9.289105418762512e-06,
|
|
"loss": 0.46634775400161743,
|
|
"num_tokens": 116620827.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.9701492537313433,
|
|
"grad_norm": 0.30150157073298506,
|
|
"learning_rate": 9.276364579501743e-06,
|
|
"loss": 0.4525374174118042,
|
|
"num_tokens": 117496028.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.9776119402985075,
|
|
"grad_norm": 0.2863498319159055,
|
|
"learning_rate": 9.263520553539919e-06,
|
|
"loss": 0.43308988213539124,
|
|
"num_tokens": 118326101.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.9850746268656716,
|
|
"grad_norm": 0.31739713823558746,
|
|
"learning_rate": 9.250573691831688e-06,
|
|
"loss": 0.4591742753982544,
|
|
"num_tokens": 119217901.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.9925373134328358,
|
|
"grad_norm": 0.3107389978804748,
|
|
"learning_rate": 9.2375243481416e-06,
|
|
"loss": 0.4491395056247711,
|
|
"num_tokens": 120120192.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.29934735002842794,
|
|
"learning_rate": 9.224372879034471e-06,
|
|
"loss": 0.44749873876571655,
|
|
"num_tokens": 121051485.0,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 1.007462686567164,
|
|
"grad_norm": 0.33488387869414854,
|
|
"learning_rate": 9.211119643865626e-06,
|
|
"loss": 0.4307776689529419,
|
|
"num_tokens": 121991896.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 1.0149253731343284,
|
|
"grad_norm": 0.32499655410029626,
|
|
"learning_rate": 9.197765004771074e-06,
|
|
"loss": 0.4204443097114563,
|
|
"num_tokens": 122819690.0,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 1.0223880597014925,
|
|
"grad_norm": 0.34181089478733623,
|
|
"learning_rate": 9.184309326657627e-06,
|
|
"loss": 0.41079288721084595,
|
|
"num_tokens": 123657032.0,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 1.0298507462686568,
|
|
"grad_norm": 0.5825488788426431,
|
|
"learning_rate": 9.17075297719292e-06,
|
|
"loss": 0.4082901179790497,
|
|
"num_tokens": 124550556.0,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 1.037313432835821,
|
|
"grad_norm": 1.1799244713672623,
|
|
"learning_rate": 9.157096326795369e-06,
|
|
"loss": 0.42325854301452637,
|
|
"num_tokens": 125328617.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 1.044776119402985,
|
|
"grad_norm": 0.3981431547057968,
|
|
"learning_rate": 9.143339748624044e-06,
|
|
"loss": 0.40712812542915344,
|
|
"num_tokens": 126306594.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 1.0522388059701493,
|
|
"grad_norm": 0.32884099051410826,
|
|
"learning_rate": 9.129483618568478e-06,
|
|
"loss": 0.4147931933403015,
|
|
"num_tokens": 127215038.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 1.0597014925373134,
|
|
"grad_norm": 0.3071551975535917,
|
|
"learning_rate": 9.115528315238396e-06,
|
|
"loss": 0.4247783422470093,
|
|
"num_tokens": 128054129.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 1.0671641791044777,
|
|
"grad_norm": 0.3132240777032372,
|
|
"learning_rate": 9.101474219953367e-06,
|
|
"loss": 0.4133056104183197,
|
|
"num_tokens": 128952014.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 1.0746268656716418,
|
|
"grad_norm": 0.31895939410654406,
|
|
"learning_rate": 9.087321716732384e-06,
|
|
"loss": 0.4213321805000305,
|
|
"num_tokens": 129774041.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 1.0820895522388059,
|
|
"grad_norm": 0.32304487832880724,
|
|
"learning_rate": 9.073071192283374e-06,
|
|
"loss": 0.4195047616958618,
|
|
"num_tokens": 130656187.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 1.0895522388059702,
|
|
"grad_norm": 0.31668877560620456,
|
|
"learning_rate": 9.058723035992632e-06,
|
|
"loss": 0.4216320514678955,
|
|
"num_tokens": 131546421.0,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 1.0970149253731343,
|
|
"grad_norm": 0.30109857359574926,
|
|
"learning_rate": 9.044277639914177e-06,
|
|
"loss": 0.4255885183811188,
|
|
"num_tokens": 132482644.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 1.1044776119402986,
|
|
"grad_norm": 0.28611352244816046,
|
|
"learning_rate": 9.029735398759044e-06,
|
|
"loss": 0.4004859924316406,
|
|
"num_tokens": 133363098.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 1.1119402985074627,
|
|
"grad_norm": 0.3246541214309705,
|
|
"learning_rate": 9.015096709884493e-06,
|
|
"loss": 0.41801226139068604,
|
|
"num_tokens": 134281169.0,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 1.1194029850746268,
|
|
"grad_norm": 0.39523810160114464,
|
|
"learning_rate": 9.00036197328316e-06,
|
|
"loss": 0.39403271675109863,
|
|
"num_tokens": 135132326.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 1.126865671641791,
|
|
"grad_norm": 0.3372219635650443,
|
|
"learning_rate": 8.985531591572117e-06,
|
|
"loss": 0.40995997190475464,
|
|
"num_tokens": 136009199.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 1.1343283582089552,
|
|
"grad_norm": 0.2880187226242739,
|
|
"learning_rate": 8.97060596998188e-06,
|
|
"loss": 0.44250696897506714,
|
|
"num_tokens": 136974761.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 1.1417910447761195,
|
|
"grad_norm": 0.2840439662929065,
|
|
"learning_rate": 8.955585516345333e-06,
|
|
"loss": 0.41125112771987915,
|
|
"num_tokens": 137953131.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 1.1492537313432836,
|
|
"grad_norm": 0.30854018310336556,
|
|
"learning_rate": 8.940470641086583e-06,
|
|
"loss": 0.41466018557548523,
|
|
"num_tokens": 138890202.0,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 1.1567164179104479,
|
|
"grad_norm": 0.2861522107018775,
|
|
"learning_rate": 8.925261757209744e-06,
|
|
"loss": 0.4421645998954773,
|
|
"num_tokens": 139921851.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 1.164179104477612,
|
|
"grad_norm": 0.30184466401361404,
|
|
"learning_rate": 8.909959280287657e-06,
|
|
"loss": 0.41726770997047424,
|
|
"num_tokens": 140840212.0,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 1.171641791044776,
|
|
"grad_norm": 0.29786414496705443,
|
|
"learning_rate": 8.894563628450534e-06,
|
|
"loss": 0.4137997627258301,
|
|
"num_tokens": 141681181.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 1.1791044776119404,
|
|
"grad_norm": 0.27612956474353256,
|
|
"learning_rate": 8.879075222374522e-06,
|
|
"loss": 0.3967845141887665,
|
|
"num_tokens": 142603331.0,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 1.1865671641791045,
|
|
"grad_norm": 0.2936198747641151,
|
|
"learning_rate": 8.863494485270228e-06,
|
|
"loss": 0.3882240355014801,
|
|
"num_tokens": 143438386.0,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 1.1940298507462686,
|
|
"grad_norm": 0.28750782577222145,
|
|
"learning_rate": 8.847821842871137e-06,
|
|
"loss": 0.42263633012771606,
|
|
"num_tokens": 144352522.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.2014925373134329,
|
|
"grad_norm": 0.32255178451364774,
|
|
"learning_rate": 8.832057723421989e-06,
|
|
"loss": 0.42398497462272644,
|
|
"num_tokens": 145160558.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 1.208955223880597,
|
|
"grad_norm": 0.32016607068719616,
|
|
"learning_rate": 8.816202557667076e-06,
|
|
"loss": 0.40889400243759155,
|
|
"num_tokens": 145970221.0,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 1.2164179104477613,
|
|
"grad_norm": 0.30212941397274007,
|
|
"learning_rate": 8.800256778838468e-06,
|
|
"loss": 0.3960338234901428,
|
|
"num_tokens": 146893310.0,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 1.2238805970149254,
|
|
"grad_norm": 0.31197303744834676,
|
|
"learning_rate": 8.78422082264418e-06,
|
|
"loss": 0.44305476546287537,
|
|
"num_tokens": 147701963.0,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 1.2313432835820897,
|
|
"grad_norm": 0.2823293130053843,
|
|
"learning_rate": 8.768095127256263e-06,
|
|
"loss": 0.3833114206790924,
|
|
"num_tokens": 148634179.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 1.2388059701492538,
|
|
"grad_norm": 0.2811151003410808,
|
|
"learning_rate": 8.751880133298834e-06,
|
|
"loss": 0.4171923100948334,
|
|
"num_tokens": 149594443.0,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 1.2462686567164178,
|
|
"grad_norm": 0.31565679619489956,
|
|
"learning_rate": 8.735576283836039e-06,
|
|
"loss": 0.43264657258987427,
|
|
"num_tokens": 150495465.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 1.2537313432835822,
|
|
"grad_norm": 0.3023001398731657,
|
|
"learning_rate": 8.719184024359935e-06,
|
|
"loss": 0.4185860753059387,
|
|
"num_tokens": 151402535.0,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 1.2611940298507462,
|
|
"grad_norm": 0.3114367097991156,
|
|
"learning_rate": 8.702703802778332e-06,
|
|
"loss": 0.444894403219223,
|
|
"num_tokens": 152354215.0,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 1.2686567164179103,
|
|
"grad_norm": 0.3130958107073367,
|
|
"learning_rate": 8.686136069402542e-06,
|
|
"loss": 0.3862420916557312,
|
|
"num_tokens": 153135819.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 1.2761194029850746,
|
|
"grad_norm": 0.32026467648986173,
|
|
"learning_rate": 8.669481276935085e-06,
|
|
"loss": 0.43771523237228394,
|
|
"num_tokens": 154060950.0,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 1.2835820895522387,
|
|
"grad_norm": 0.33753040760769915,
|
|
"learning_rate": 8.652739880457309e-06,
|
|
"loss": 0.4314393401145935,
|
|
"num_tokens": 154999582.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 1.291044776119403,
|
|
"grad_norm": 0.31404977555481944,
|
|
"learning_rate": 8.635912337416963e-06,
|
|
"loss": 0.4238457679748535,
|
|
"num_tokens": 155889540.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 1.2985074626865671,
|
|
"grad_norm": 0.2917828706410469,
|
|
"learning_rate": 8.618999107615694e-06,
|
|
"loss": 0.4157620072364807,
|
|
"num_tokens": 156887223.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 1.3059701492537314,
|
|
"grad_norm": 0.2929002597150211,
|
|
"learning_rate": 8.602000653196484e-06,
|
|
"loss": 0.4093779921531677,
|
|
"num_tokens": 157776705.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 1.3134328358208955,
|
|
"grad_norm": 0.2981368517552101,
|
|
"learning_rate": 8.584917438631022e-06,
|
|
"loss": 0.4151228070259094,
|
|
"num_tokens": 158724790.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 1.3208955223880596,
|
|
"grad_norm": 0.307459834676784,
|
|
"learning_rate": 8.567749930707012e-06,
|
|
"loss": 0.42905163764953613,
|
|
"num_tokens": 159719326.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 1.328358208955224,
|
|
"grad_norm": 0.3174851983597954,
|
|
"learning_rate": 8.55049859851542e-06,
|
|
"loss": 0.44639986753463745,
|
|
"num_tokens": 160650411.0,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 1.335820895522388,
|
|
"grad_norm": 0.37310729673210785,
|
|
"learning_rate": 8.533163913437657e-06,
|
|
"loss": 0.4070381820201874,
|
|
"num_tokens": 161685151.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 1.3432835820895521,
|
|
"grad_norm": 0.34243880652688075,
|
|
"learning_rate": 8.515746349132693e-06,
|
|
"loss": 0.40524742007255554,
|
|
"num_tokens": 162668291.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 1.3507462686567164,
|
|
"grad_norm": 0.3314697629279733,
|
|
"learning_rate": 8.498246381524123e-06,
|
|
"loss": 0.39374542236328125,
|
|
"num_tokens": 163602019.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 1.3582089552238805,
|
|
"grad_norm": 0.39714424307879675,
|
|
"learning_rate": 8.480664488787157e-06,
|
|
"loss": 0.41536325216293335,
|
|
"num_tokens": 164374987.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 1.3656716417910448,
|
|
"grad_norm": 0.30470654817019394,
|
|
"learning_rate": 8.463001151335556e-06,
|
|
"loss": 0.420206755399704,
|
|
"num_tokens": 165277351.0,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 1.373134328358209,
|
|
"grad_norm": 0.30147269826178413,
|
|
"learning_rate": 8.445256851808504e-06,
|
|
"loss": 0.40577423572540283,
|
|
"num_tokens": 166179864.0,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 1.3805970149253732,
|
|
"grad_norm": 0.3160553991473881,
|
|
"learning_rate": 8.427432075057422e-06,
|
|
"loss": 0.3979928195476532,
|
|
"num_tokens": 167127067.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 1.3880597014925373,
|
|
"grad_norm": 0.31665903933128287,
|
|
"learning_rate": 8.409527308132717e-06,
|
|
"loss": 0.4436604976654053,
|
|
"num_tokens": 168100947.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 1.3955223880597014,
|
|
"grad_norm": 0.296181555140025,
|
|
"learning_rate": 8.391543040270477e-06,
|
|
"loss": 0.42373591661453247,
|
|
"num_tokens": 168977100.0,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 1.4029850746268657,
|
|
"grad_norm": 0.340781706854354,
|
|
"learning_rate": 8.373479762879104e-06,
|
|
"loss": 0.4242423474788666,
|
|
"num_tokens": 169809036.0,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 1.4104477611940298,
|
|
"grad_norm": 0.2912347476979519,
|
|
"learning_rate": 8.355337969525876e-06,
|
|
"loss": 0.3881043791770935,
|
|
"num_tokens": 170799001.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 1.417910447761194,
|
|
"grad_norm": 0.3167891630018227,
|
|
"learning_rate": 8.337118155923474e-06,
|
|
"loss": 0.417064368724823,
|
|
"num_tokens": 171563636.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 1.4253731343283582,
|
|
"grad_norm": 0.32116936347486175,
|
|
"learning_rate": 8.318820819916433e-06,
|
|
"loss": 0.40856266021728516,
|
|
"num_tokens": 172297711.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 1.4328358208955223,
|
|
"grad_norm": 0.3019887016574649,
|
|
"learning_rate": 8.300446461467533e-06,
|
|
"loss": 0.4446168541908264,
|
|
"num_tokens": 173246434.0,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 1.4402985074626866,
|
|
"grad_norm": 0.3138769818399579,
|
|
"learning_rate": 8.281995582644145e-06,
|
|
"loss": 0.4181920289993286,
|
|
"num_tokens": 174149904.0,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 1.4477611940298507,
|
|
"grad_norm": 0.313975344503838,
|
|
"learning_rate": 8.263468687604508e-06,
|
|
"loss": 0.4371890425682068,
|
|
"num_tokens": 174963687.0,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 1.455223880597015,
|
|
"grad_norm": 0.29628794439446526,
|
|
"learning_rate": 8.244866282583957e-06,
|
|
"loss": 0.43816518783569336,
|
|
"num_tokens": 175988598.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 1.462686567164179,
|
|
"grad_norm": 0.2963583065242463,
|
|
"learning_rate": 8.226188875881082e-06,
|
|
"loss": 0.41185736656188965,
|
|
"num_tokens": 176960311.0,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 1.4701492537313432,
|
|
"grad_norm": 0.2991189293307387,
|
|
"learning_rate": 8.20743697784385e-06,
|
|
"loss": 0.46473461389541626,
|
|
"num_tokens": 177889691.0,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 1.4776119402985075,
|
|
"grad_norm": 0.26573849496019714,
|
|
"learning_rate": 8.188611100855656e-06,
|
|
"loss": 0.3865639567375183,
|
|
"num_tokens": 178835508.0,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 1.4850746268656716,
|
|
"grad_norm": 0.28471866573069565,
|
|
"learning_rate": 8.169711759321318e-06,
|
|
"loss": 0.4254840612411499,
|
|
"num_tokens": 179780829.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 1.4925373134328357,
|
|
"grad_norm": 0.27591064975620333,
|
|
"learning_rate": 8.150739469653026e-06,
|
|
"loss": 0.3821393847465515,
|
|
"num_tokens": 180675259.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 1.5,
|
|
"grad_norm": 0.2912891463065521,
|
|
"learning_rate": 8.131694750256234e-06,
|
|
"loss": 0.4260258972644806,
|
|
"num_tokens": 181593083.0,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 1.5074626865671643,
|
|
"grad_norm": 0.3470505245514532,
|
|
"learning_rate": 8.112578121515485e-06,
|
|
"loss": 0.42295166850090027,
|
|
"num_tokens": 182453649.0,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 1.5149253731343284,
|
|
"grad_norm": 0.333624297966994,
|
|
"learning_rate": 8.0933901057802e-06,
|
|
"loss": 0.4165676534175873,
|
|
"num_tokens": 183252908.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 1.5223880597014925,
|
|
"grad_norm": 0.2999450247966616,
|
|
"learning_rate": 8.074131227350408e-06,
|
|
"loss": 0.42348137497901917,
|
|
"num_tokens": 184218061.0,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 1.5298507462686568,
|
|
"grad_norm": 0.33075885588759496,
|
|
"learning_rate": 8.05480201246241e-06,
|
|
"loss": 0.4413604140281677,
|
|
"num_tokens": 185123701.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 1.537313432835821,
|
|
"grad_norm": 0.3236918821990334,
|
|
"learning_rate": 8.035402989274402e-06,
|
|
"loss": 0.4267103970050812,
|
|
"num_tokens": 186020054.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 1.544776119402985,
|
|
"grad_norm": 0.28545115313146596,
|
|
"learning_rate": 8.015934687852053e-06,
|
|
"loss": 0.4010322690010071,
|
|
"num_tokens": 186957926.0,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 1.5522388059701493,
|
|
"grad_norm": 0.33525388932605726,
|
|
"learning_rate": 7.996397640154012e-06,
|
|
"loss": 0.43479830026626587,
|
|
"num_tokens": 187967937.0,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 1.5597014925373134,
|
|
"grad_norm": 0.2852110581692416,
|
|
"learning_rate": 7.976792380017374e-06,
|
|
"loss": 0.3835904002189636,
|
|
"num_tokens": 188699883.0,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 1.5671641791044775,
|
|
"grad_norm": 0.38746256380732114,
|
|
"learning_rate": 7.957119443143093e-06,
|
|
"loss": 0.43473392724990845,
|
|
"num_tokens": 189533459.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 1.5746268656716418,
|
|
"grad_norm": 0.30040372660742176,
|
|
"learning_rate": 7.937379367081356e-06,
|
|
"loss": 0.4094908535480499,
|
|
"num_tokens": 190331401.0,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 1.582089552238806,
|
|
"grad_norm": 0.35097170028371405,
|
|
"learning_rate": 7.917572691216868e-06,
|
|
"loss": 0.44787487387657166,
|
|
"num_tokens": 191163315.0,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 1.5895522388059702,
|
|
"grad_norm": 0.29035162522974023,
|
|
"learning_rate": 7.897699956754142e-06,
|
|
"loss": 0.41564756631851196,
|
|
"num_tokens": 192105809.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 1.5970149253731343,
|
|
"grad_norm": 0.3234055460991543,
|
|
"learning_rate": 7.877761706702698e-06,
|
|
"loss": 0.42737478017807007,
|
|
"num_tokens": 193098168.0,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 1.6044776119402986,
|
|
"grad_norm": 0.3181366599415042,
|
|
"learning_rate": 7.85775848586222e-06,
|
|
"loss": 0.4263436794281006,
|
|
"num_tokens": 193975959.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 1.6119402985074627,
|
|
"grad_norm": 0.3047597849777916,
|
|
"learning_rate": 7.837690840807688e-06,
|
|
"loss": 0.4356343150138855,
|
|
"num_tokens": 194828963.0,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 1.6194029850746268,
|
|
"grad_norm": 0.2953366209904587,
|
|
"learning_rate": 7.817559319874417e-06,
|
|
"loss": 0.39498403668403625,
|
|
"num_tokens": 195757337.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 1.626865671641791,
|
|
"grad_norm": 0.2936401683412748,
|
|
"learning_rate": 7.797364473143105e-06,
|
|
"loss": 0.4154474139213562,
|
|
"num_tokens": 196731181.0,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 1.6343283582089554,
|
|
"grad_norm": 0.2898185408597091,
|
|
"learning_rate": 7.77710685242477e-06,
|
|
"loss": 0.42473846673965454,
|
|
"num_tokens": 197621017.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 1.6417910447761193,
|
|
"grad_norm": 0.29114088952907274,
|
|
"learning_rate": 7.7567870112457e-06,
|
|
"loss": 0.4433613419532776,
|
|
"num_tokens": 198631859.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.6492537313432836,
|
|
"grad_norm": 0.31287064287880717,
|
|
"learning_rate": 7.736405504832314e-06,
|
|
"loss": 0.4322376549243927,
|
|
"num_tokens": 199557498.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 1.6567164179104479,
|
|
"grad_norm": 0.3031132335175992,
|
|
"learning_rate": 7.715962890095988e-06,
|
|
"loss": 0.41872939467430115,
|
|
"num_tokens": 200455519.0,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 1.664179104477612,
|
|
"grad_norm": 0.5127084447985639,
|
|
"learning_rate": 7.695459725617851e-06,
|
|
"loss": 0.4426816999912262,
|
|
"num_tokens": 201364168.0,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 1.671641791044776,
|
|
"grad_norm": 0.36355358662257686,
|
|
"learning_rate": 7.674896571633507e-06,
|
|
"loss": 0.3920941650867462,
|
|
"num_tokens": 202272665.0,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 1.6791044776119404,
|
|
"grad_norm": 0.2918543179655489,
|
|
"learning_rate": 7.654273990017742e-06,
|
|
"loss": 0.3865686058998108,
|
|
"num_tokens": 203236852.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 1.6865671641791045,
|
|
"grad_norm": 0.29443958475831755,
|
|
"learning_rate": 7.633592544269152e-06,
|
|
"loss": 0.41160887479782104,
|
|
"num_tokens": 204144409.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 1.6940298507462686,
|
|
"grad_norm": 0.29368087510062574,
|
|
"learning_rate": 7.61285279949477e-06,
|
|
"loss": 0.41996899247169495,
|
|
"num_tokens": 205087641.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 1.7014925373134329,
|
|
"grad_norm": 0.2981876720268518,
|
|
"learning_rate": 7.592055322394602e-06,
|
|
"loss": 0.4322773814201355,
|
|
"num_tokens": 205964269.0,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 1.7089552238805972,
|
|
"grad_norm": 0.3032205060654827,
|
|
"learning_rate": 7.5712006812461595e-06,
|
|
"loss": 0.4357481002807617,
|
|
"num_tokens": 206853325.0,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 1.716417910447761,
|
|
"grad_norm": 0.30382769873452287,
|
|
"learning_rate": 7.5502894458889154e-06,
|
|
"loss": 0.42187392711639404,
|
|
"num_tokens": 207780456.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 1.7238805970149254,
|
|
"grad_norm": 0.28458753280851,
|
|
"learning_rate": 7.529322187708752e-06,
|
|
"loss": 0.4417547583580017,
|
|
"num_tokens": 208692271.0,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 1.7313432835820897,
|
|
"grad_norm": 0.28678480761878283,
|
|
"learning_rate": 7.5082994796223355e-06,
|
|
"loss": 0.4000692367553711,
|
|
"num_tokens": 209542301.0,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 1.7388059701492538,
|
|
"grad_norm": 0.3105804034516556,
|
|
"learning_rate": 7.487221896061458e-06,
|
|
"loss": 0.43237993121147156,
|
|
"num_tokens": 210462903.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 1.7462686567164178,
|
|
"grad_norm": 0.3069476203994755,
|
|
"learning_rate": 7.466090012957361e-06,
|
|
"loss": 0.4426308274269104,
|
|
"num_tokens": 211451379.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 1.7537313432835822,
|
|
"grad_norm": 0.29187302592713965,
|
|
"learning_rate": 7.444904407724973e-06,
|
|
"loss": 0.4144989252090454,
|
|
"num_tokens": 212341336.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 1.7611940298507462,
|
|
"grad_norm": 0.2715020106858522,
|
|
"learning_rate": 7.423665659247154e-06,
|
|
"loss": 0.4140280485153198,
|
|
"num_tokens": 213184565.0,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 1.7686567164179103,
|
|
"grad_norm": 0.3042751492929567,
|
|
"learning_rate": 7.402374347858862e-06,
|
|
"loss": 0.4220738708972931,
|
|
"num_tokens": 214162910.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 1.7761194029850746,
|
|
"grad_norm": 0.283596579410495,
|
|
"learning_rate": 7.381031055331306e-06,
|
|
"loss": 0.43350133299827576,
|
|
"num_tokens": 215182240.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 1.783582089552239,
|
|
"grad_norm": 0.29114085647177373,
|
|
"learning_rate": 7.3596363648560445e-06,
|
|
"loss": 0.4327085316181183,
|
|
"num_tokens": 216074554.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 1.7910447761194028,
|
|
"grad_norm": 0.28379283338161987,
|
|
"learning_rate": 7.338190861029052e-06,
|
|
"loss": 0.4293884038925171,
|
|
"num_tokens": 216989156.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.7985074626865671,
|
|
"grad_norm": 0.31407525298001004,
|
|
"learning_rate": 7.316695129834744e-06,
|
|
"loss": 0.4033690392971039,
|
|
"num_tokens": 217859754.0,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 1.8059701492537314,
|
|
"grad_norm": 0.3013707320804031,
|
|
"learning_rate": 7.2951497586299665e-06,
|
|
"loss": 0.415780246257782,
|
|
"num_tokens": 218674048.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 1.8134328358208955,
|
|
"grad_norm": 0.3130414485143585,
|
|
"learning_rate": 7.273555336127948e-06,
|
|
"loss": 0.4289485216140747,
|
|
"num_tokens": 219544627.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 1.8208955223880596,
|
|
"grad_norm": 0.271886252549519,
|
|
"learning_rate": 7.251912452382206e-06,
|
|
"loss": 0.4117184579372406,
|
|
"num_tokens": 220510777.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 1.828358208955224,
|
|
"grad_norm": 0.3095984364408915,
|
|
"learning_rate": 7.2302216987704395e-06,
|
|
"loss": 0.40528762340545654,
|
|
"num_tokens": 221358648.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 1.835820895522388,
|
|
"grad_norm": 0.28537942146166506,
|
|
"learning_rate": 7.208483667978351e-06,
|
|
"loss": 0.37842410802841187,
|
|
"num_tokens": 222227328.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 1.8432835820895521,
|
|
"grad_norm": 0.3285002711937223,
|
|
"learning_rate": 7.186698953983466e-06,
|
|
"loss": 0.4463423192501068,
|
|
"num_tokens": 223216379.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 1.8507462686567164,
|
|
"grad_norm": 0.29900827070350944,
|
|
"learning_rate": 7.164868152038899e-06,
|
|
"loss": 0.42675986886024475,
|
|
"num_tokens": 224109870.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 1.8582089552238807,
|
|
"grad_norm": 0.27490080435841,
|
|
"learning_rate": 7.1429918586570815e-06,
|
|
"loss": 0.4331856667995453,
|
|
"num_tokens": 225101205.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 1.8656716417910446,
|
|
"grad_norm": 0.2935787072389711,
|
|
"learning_rate": 7.121070671593477e-06,
|
|
"loss": 0.4262286424636841,
|
|
"num_tokens": 226119986.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 1.873134328358209,
|
|
"grad_norm": 0.3045861994484339,
|
|
"learning_rate": 7.099105189830235e-06,
|
|
"loss": 0.4218306541442871,
|
|
"num_tokens": 226995732.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 1.8805970149253732,
|
|
"grad_norm": 0.27595409032706397,
|
|
"learning_rate": 7.077096013559831e-06,
|
|
"loss": 0.4189199209213257,
|
|
"num_tokens": 227872634.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 1.8880597014925373,
|
|
"grad_norm": 0.289326233334052,
|
|
"learning_rate": 7.055043744168658e-06,
|
|
"loss": 0.44568511843681335,
|
|
"num_tokens": 228843256.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 1.8955223880597014,
|
|
"grad_norm": 0.3108178596802667,
|
|
"learning_rate": 7.032948984220611e-06,
|
|
"loss": 0.39977630972862244,
|
|
"num_tokens": 229749232.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 1.9029850746268657,
|
|
"grad_norm": 0.3029945133044889,
|
|
"learning_rate": 7.0108123374406046e-06,
|
|
"loss": 0.41192835569381714,
|
|
"num_tokens": 230524739.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 1.9104477611940298,
|
|
"grad_norm": 0.25289759257512634,
|
|
"learning_rate": 6.988634408698083e-06,
|
|
"loss": 0.38565781712532043,
|
|
"num_tokens": 231455850.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 1.917910447761194,
|
|
"grad_norm": 0.298108417839461,
|
|
"learning_rate": 6.966415803990501e-06,
|
|
"loss": 0.4397220015525818,
|
|
"num_tokens": 232349234.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 1.9253731343283582,
|
|
"grad_norm": 0.30576254773905986,
|
|
"learning_rate": 6.944157130426745e-06,
|
|
"loss": 0.43654486536979675,
|
|
"num_tokens": 233187315.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 1.9328358208955225,
|
|
"grad_norm": 0.28668295683966216,
|
|
"learning_rate": 6.9218589962105695e-06,
|
|
"loss": 0.40597644448280334,
|
|
"num_tokens": 234091956.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 1.9402985074626866,
|
|
"grad_norm": 0.2807573548073224,
|
|
"learning_rate": 6.899522010623959e-06,
|
|
"loss": 0.42698317766189575,
|
|
"num_tokens": 235133005.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.9477611940298507,
|
|
"grad_norm": 0.2676937710994811,
|
|
"learning_rate": 6.877146784010486e-06,
|
|
"loss": 0.4118936061859131,
|
|
"num_tokens": 235967243.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 1.955223880597015,
|
|
"grad_norm": 0.29199333652094117,
|
|
"learning_rate": 6.854733927758636e-06,
|
|
"loss": 0.42816537618637085,
|
|
"num_tokens": 236876001.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 1.962686567164179,
|
|
"grad_norm": 0.3572922506463511,
|
|
"learning_rate": 6.832284054285101e-06,
|
|
"loss": 0.43847325444221497,
|
|
"num_tokens": 237876952.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 1.9701492537313432,
|
|
"grad_norm": 0.2960985809182997,
|
|
"learning_rate": 6.809797777018041e-06,
|
|
"loss": 0.43155139684677124,
|
|
"num_tokens": 238704164.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 1.9776119402985075,
|
|
"grad_norm": 0.3169980642916318,
|
|
"learning_rate": 6.78727571038033e-06,
|
|
"loss": 0.4308193027973175,
|
|
"num_tokens": 239595870.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 1.9850746268656716,
|
|
"grad_norm": 0.3191747061655072,
|
|
"learning_rate": 6.764718469772759e-06,
|
|
"loss": 0.4188956022262573,
|
|
"num_tokens": 240337386.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 1.9925373134328357,
|
|
"grad_norm": 0.28286588606011187,
|
|
"learning_rate": 6.7421266715572275e-06,
|
|
"loss": 0.40036123991012573,
|
|
"num_tokens": 241215348.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.2981753233991589,
|
|
"learning_rate": 6.719500933039898e-06,
|
|
"loss": 0.41549932956695557,
|
|
"num_tokens": 242121111.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 2.0074626865671643,
|
|
"grad_norm": 0.33640737374184443,
|
|
"learning_rate": 6.696841872454332e-06,
|
|
"loss": 0.4132290482521057,
|
|
"num_tokens": 243025320.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 2.014925373134328,
|
|
"grad_norm": 0.2822051764181089,
|
|
"learning_rate": 6.674150108944593e-06,
|
|
"loss": 0.37781068682670593,
|
|
"num_tokens": 243793916.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 2.0223880597014925,
|
|
"grad_norm": 0.38987929902231017,
|
|
"learning_rate": 6.651426262548326e-06,
|
|
"loss": 0.40918004512786865,
|
|
"num_tokens": 244799351.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 2.029850746268657,
|
|
"grad_norm": 0.348061447310908,
|
|
"learning_rate": 6.62867095417983e-06,
|
|
"loss": 0.3939589858055115,
|
|
"num_tokens": 245795313.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 2.0373134328358207,
|
|
"grad_norm": 0.3046732710135438,
|
|
"learning_rate": 6.605884805613073e-06,
|
|
"loss": 0.36584192514419556,
|
|
"num_tokens": 246732184.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 2.044776119402985,
|
|
"grad_norm": 0.3664198494618375,
|
|
"learning_rate": 6.583068439464716e-06,
|
|
"loss": 0.4081302881240845,
|
|
"num_tokens": 247606091.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 2.0522388059701493,
|
|
"grad_norm": 0.3112614984470978,
|
|
"learning_rate": 6.560222479177095e-06,
|
|
"loss": 0.3947848081588745,
|
|
"num_tokens": 248474307.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 2.0597014925373136,
|
|
"grad_norm": 0.3268123714386943,
|
|
"learning_rate": 6.537347549001184e-06,
|
|
"loss": 0.39627498388290405,
|
|
"num_tokens": 249293743.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 2.0671641791044775,
|
|
"grad_norm": 0.30038025917744793,
|
|
"learning_rate": 6.514444273979544e-06,
|
|
"loss": 0.3961779773235321,
|
|
"num_tokens": 250164041.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 2.074626865671642,
|
|
"grad_norm": 0.30941665860783496,
|
|
"learning_rate": 6.491513279929238e-06,
|
|
"loss": 0.3704898953437805,
|
|
"num_tokens": 251063865.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 2.082089552238806,
|
|
"grad_norm": 0.2822311579038674,
|
|
"learning_rate": 6.468555193424736e-06,
|
|
"loss": 0.3888505697250366,
|
|
"num_tokens": 251954121.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 2.08955223880597,
|
|
"grad_norm": 0.2838966427637005,
|
|
"learning_rate": 6.445570641780786e-06,
|
|
"loss": 0.3732953667640686,
|
|
"num_tokens": 252767775.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 2.0970149253731343,
|
|
"grad_norm": 0.30198287700287857,
|
|
"learning_rate": 6.422560253035287e-06,
|
|
"loss": 0.3989664614200592,
|
|
"num_tokens": 253671573.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 2.1044776119402986,
|
|
"grad_norm": 0.3143195160978541,
|
|
"learning_rate": 6.399524655932111e-06,
|
|
"loss": 0.4071004390716553,
|
|
"num_tokens": 254540226.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 2.111940298507463,
|
|
"grad_norm": 0.29633039155095714,
|
|
"learning_rate": 6.376464479903938e-06,
|
|
"loss": 0.3590371012687683,
|
|
"num_tokens": 255292355.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 2.1194029850746268,
|
|
"grad_norm": 0.2746728490799242,
|
|
"learning_rate": 6.353380355055051e-06,
|
|
"loss": 0.38884416222572327,
|
|
"num_tokens": 256176530.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 2.126865671641791,
|
|
"grad_norm": 0.2951568696719758,
|
|
"learning_rate": 6.330272912144116e-06,
|
|
"loss": 0.42871013283729553,
|
|
"num_tokens": 257090645.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 2.1343283582089554,
|
|
"grad_norm": 0.2902093873074645,
|
|
"learning_rate": 6.307142782566952e-06,
|
|
"loss": 0.3986203670501709,
|
|
"num_tokens": 258131119.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 2.1417910447761193,
|
|
"grad_norm": 0.3900114303550773,
|
|
"learning_rate": 6.283990598339274e-06,
|
|
"loss": 0.390123188495636,
|
|
"num_tokens": 258880552.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 2.1492537313432836,
|
|
"grad_norm": 0.2806374479908933,
|
|
"learning_rate": 6.2608169920794314e-06,
|
|
"loss": 0.36130136251449585,
|
|
"num_tokens": 259758999.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 2.156716417910448,
|
|
"grad_norm": 0.2942927245657638,
|
|
"learning_rate": 6.237622596991106e-06,
|
|
"loss": 0.40030941367149353,
|
|
"num_tokens": 260602559.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 2.1641791044776117,
|
|
"grad_norm": 0.3214957885578966,
|
|
"learning_rate": 6.214408046846034e-06,
|
|
"loss": 0.39499810338020325,
|
|
"num_tokens": 261439646.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 2.171641791044776,
|
|
"grad_norm": 0.27240683635483437,
|
|
"learning_rate": 6.191173975966669e-06,
|
|
"loss": 0.3880019783973694,
|
|
"num_tokens": 262474020.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 2.1791044776119404,
|
|
"grad_norm": 0.34023027676143563,
|
|
"learning_rate": 6.167921019208851e-06,
|
|
"loss": 0.42268985509872437,
|
|
"num_tokens": 263528820.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 2.1865671641791047,
|
|
"grad_norm": 0.287848829860692,
|
|
"learning_rate": 6.144649811944474e-06,
|
|
"loss": 0.3913387656211853,
|
|
"num_tokens": 264372315.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 2.1940298507462686,
|
|
"grad_norm": 0.29220713499868917,
|
|
"learning_rate": 6.121360990044107e-06,
|
|
"loss": 0.40157270431518555,
|
|
"num_tokens": 265188957.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 2.201492537313433,
|
|
"grad_norm": 0.286455151799939,
|
|
"learning_rate": 6.098055189859634e-06,
|
|
"loss": 0.3945062756538391,
|
|
"num_tokens": 266184697.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 2.208955223880597,
|
|
"grad_norm": 0.289286738435993,
|
|
"learning_rate": 6.074733048206852e-06,
|
|
"loss": 0.3945891559123993,
|
|
"num_tokens": 267190971.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 2.216417910447761,
|
|
"grad_norm": 0.27448176767847715,
|
|
"learning_rate": 6.051395202348089e-06,
|
|
"loss": 0.3953642249107361,
|
|
"num_tokens": 268121281.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 2.2238805970149254,
|
|
"grad_norm": 0.297149102735408,
|
|
"learning_rate": 6.028042289974768e-06,
|
|
"loss": 0.3815913796424866,
|
|
"num_tokens": 269026334.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 2.2313432835820897,
|
|
"grad_norm": 0.29135459719595014,
|
|
"learning_rate": 6.004674949190004e-06,
|
|
"loss": 0.3744094967842102,
|
|
"num_tokens": 269848673.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 2.2388059701492535,
|
|
"grad_norm": 0.3163386130777747,
|
|
"learning_rate": 5.981293818491153e-06,
|
|
"loss": 0.411973237991333,
|
|
"num_tokens": 270729219.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 2.246268656716418,
|
|
"grad_norm": 0.2996160649578529,
|
|
"learning_rate": 5.957899536752373e-06,
|
|
"loss": 0.4180707335472107,
|
|
"num_tokens": 271647605.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"epoch": 2.253731343283582,
|
|
"grad_norm": 0.2744717376139136,
|
|
"learning_rate": 5.934492743207168e-06,
|
|
"loss": 0.36764925718307495,
|
|
"num_tokens": 272444857.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 2.2611940298507465,
|
|
"grad_norm": 0.3051287913390687,
|
|
"learning_rate": 5.911074077430917e-06,
|
|
"loss": 0.3950934410095215,
|
|
"num_tokens": 273313831.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"epoch": 2.2686567164179103,
|
|
"grad_norm": 0.2740805047822694,
|
|
"learning_rate": 5.887644179323403e-06,
|
|
"loss": 0.38602137565612793,
|
|
"num_tokens": 274151817.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"epoch": 2.2761194029850746,
|
|
"grad_norm": 0.2811027592780593,
|
|
"learning_rate": 5.864203689091316e-06,
|
|
"loss": 0.40490180253982544,
|
|
"num_tokens": 275023603.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 2.283582089552239,
|
|
"grad_norm": 0.37103511230501807,
|
|
"learning_rate": 5.840753247230781e-06,
|
|
"loss": 0.39756178855895996,
|
|
"num_tokens": 275922951.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"epoch": 2.291044776119403,
|
|
"grad_norm": 0.260165834106451,
|
|
"learning_rate": 5.817293494509836e-06,
|
|
"loss": 0.3657914996147156,
|
|
"num_tokens": 276733073.0,
|
|
"step": 307
|
|
},
|
|
{
|
|
"epoch": 2.298507462686567,
|
|
"grad_norm": 0.2676322746611296,
|
|
"learning_rate": 5.793825071950936e-06,
|
|
"loss": 0.3826783299446106,
|
|
"num_tokens": 277699551.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 2.3059701492537314,
|
|
"grad_norm": 0.3171630796152734,
|
|
"learning_rate": 5.770348620813433e-06,
|
|
"loss": 0.38245660066604614,
|
|
"num_tokens": 278695133.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"epoch": 2.3134328358208958,
|
|
"grad_norm": 0.2749216503608562,
|
|
"learning_rate": 5.746864782576054e-06,
|
|
"loss": 0.38771188259124756,
|
|
"num_tokens": 279483451.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 2.3208955223880596,
|
|
"grad_norm": 0.34619757766961257,
|
|
"learning_rate": 5.723374198919376e-06,
|
|
"loss": 0.40358829498291016,
|
|
"num_tokens": 280316518.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"epoch": 2.328358208955224,
|
|
"grad_norm": 0.2628421365077709,
|
|
"learning_rate": 5.699877511708285e-06,
|
|
"loss": 0.37161552906036377,
|
|
"num_tokens": 281300113.0,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 2.3358208955223883,
|
|
"grad_norm": 0.2865924626367908,
|
|
"learning_rate": 5.67637536297445e-06,
|
|
"loss": 0.3707822561264038,
|
|
"num_tokens": 282213553.0,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 2.343283582089552,
|
|
"grad_norm": 0.2782360921000711,
|
|
"learning_rate": 5.652868394898766e-06,
|
|
"loss": 0.38021302223205566,
|
|
"num_tokens": 283069634.0,
|
|
"step": 314
|
|
},
|
|
{
|
|
"epoch": 2.3507462686567164,
|
|
"grad_norm": 0.274968159536365,
|
|
"learning_rate": 5.6293572497938165e-06,
|
|
"loss": 0.4070481061935425,
|
|
"num_tokens": 284055909.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 2.3582089552238807,
|
|
"grad_norm": 0.25137582516547385,
|
|
"learning_rate": 5.605842570086321e-06,
|
|
"loss": 0.38819169998168945,
|
|
"num_tokens": 285072190.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"epoch": 2.3656716417910446,
|
|
"grad_norm": 0.27416935469654424,
|
|
"learning_rate": 5.582324998299573e-06,
|
|
"loss": 0.3976019620895386,
|
|
"num_tokens": 285997942.0,
|
|
"step": 317
|
|
},
|
|
{
|
|
"epoch": 2.373134328358209,
|
|
"grad_norm": 0.28976153755834105,
|
|
"learning_rate": 5.558805177035902e-06,
|
|
"loss": 0.39910900592803955,
|
|
"num_tokens": 286957228.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"epoch": 2.3805970149253732,
|
|
"grad_norm": 0.3526174425898886,
|
|
"learning_rate": 5.53528374895909e-06,
|
|
"loss": 0.37735995650291443,
|
|
"num_tokens": 287834123.0,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 2.388059701492537,
|
|
"grad_norm": 0.2753135236966283,
|
|
"learning_rate": 5.511761356776834e-06,
|
|
"loss": 0.3974205553531647,
|
|
"num_tokens": 288755581.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 2.3955223880597014,
|
|
"grad_norm": 0.2836500955971764,
|
|
"learning_rate": 5.488238643223167e-06,
|
|
"loss": 0.4040617346763611,
|
|
"num_tokens": 289616887.0,
|
|
"step": 321
|
|
},
|
|
{
|
|
"epoch": 2.4029850746268657,
|
|
"grad_norm": 0.3001483066578534,
|
|
"learning_rate": 5.464716251040911e-06,
|
|
"loss": 0.39118584990501404,
|
|
"num_tokens": 290466034.0,
|
|
"step": 322
|
|
},
|
|
{
|
|
"epoch": 2.41044776119403,
|
|
"grad_norm": 0.29609458212755346,
|
|
"learning_rate": 5.4411948229641e-06,
|
|
"loss": 0.4012300372123718,
|
|
"num_tokens": 291327531.0,
|
|
"step": 323
|
|
},
|
|
{
|
|
"epoch": 2.417910447761194,
|
|
"grad_norm": 0.282307409973888,
|
|
"learning_rate": 5.417675001700428e-06,
|
|
"loss": 0.39297211170196533,
|
|
"num_tokens": 292249211.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 2.425373134328358,
|
|
"grad_norm": 0.31947796875203593,
|
|
"learning_rate": 5.394157429913681e-06,
|
|
"loss": 0.43389707803726196,
|
|
"num_tokens": 293154262.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 2.4328358208955225,
|
|
"grad_norm": 0.2806921837500959,
|
|
"learning_rate": 5.370642750206184e-06,
|
|
"loss": 0.4193563461303711,
|
|
"num_tokens": 294190925.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"epoch": 2.4402985074626864,
|
|
"grad_norm": 0.28217215862589007,
|
|
"learning_rate": 5.347131605101237e-06,
|
|
"loss": 0.42073380947113037,
|
|
"num_tokens": 295155201.0,
|
|
"step": 327
|
|
},
|
|
{
|
|
"epoch": 2.4477611940298507,
|
|
"grad_norm": 0.2595127351145338,
|
|
"learning_rate": 5.323624637025552e-06,
|
|
"loss": 0.38413190841674805,
|
|
"num_tokens": 296039941.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"epoch": 2.455223880597015,
|
|
"grad_norm": 0.27537880701127315,
|
|
"learning_rate": 5.300122488291717e-06,
|
|
"loss": 0.3896210193634033,
|
|
"num_tokens": 296897125.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"epoch": 2.4626865671641793,
|
|
"grad_norm": 0.2806456708250513,
|
|
"learning_rate": 5.276625801080626e-06,
|
|
"loss": 0.40547412633895874,
|
|
"num_tokens": 297829206.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 2.470149253731343,
|
|
"grad_norm": 0.3233513262930407,
|
|
"learning_rate": 5.253135217423948e-06,
|
|
"loss": 0.3998452425003052,
|
|
"num_tokens": 298813976.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"epoch": 2.4776119402985075,
|
|
"grad_norm": 0.2870679405386201,
|
|
"learning_rate": 5.229651379186569e-06,
|
|
"loss": 0.41445013880729675,
|
|
"num_tokens": 299755392.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"epoch": 2.485074626865672,
|
|
"grad_norm": 0.2623639243435129,
|
|
"learning_rate": 5.206174928049066e-06,
|
|
"loss": 0.3996489644050598,
|
|
"num_tokens": 300745461.0,
|
|
"step": 333
|
|
},
|
|
{
|
|
"epoch": 2.4925373134328357,
|
|
"grad_norm": 0.2657883700801823,
|
|
"learning_rate": 5.182706505490166e-06,
|
|
"loss": 0.3919597864151001,
|
|
"num_tokens": 301635785.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"epoch": 2.5,
|
|
"grad_norm": 0.2730887704012263,
|
|
"learning_rate": 5.15924675276922e-06,
|
|
"loss": 0.37381941080093384,
|
|
"num_tokens": 302529314.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 2.5074626865671643,
|
|
"grad_norm": 0.27926647905507407,
|
|
"learning_rate": 5.135796310908685e-06,
|
|
"loss": 0.4020169675350189,
|
|
"num_tokens": 303325140.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"epoch": 2.5149253731343286,
|
|
"grad_norm": 0.2573449307599577,
|
|
"learning_rate": 5.1123558206766e-06,
|
|
"loss": 0.3959069848060608,
|
|
"num_tokens": 304291697.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"epoch": 2.5223880597014925,
|
|
"grad_norm": 0.2713627052957801,
|
|
"learning_rate": 5.088925922569084e-06,
|
|
"loss": 0.4036637246608734,
|
|
"num_tokens": 305167326.0,
|
|
"step": 338
|
|
},
|
|
{
|
|
"epoch": 2.529850746268657,
|
|
"grad_norm": 0.29137688284390684,
|
|
"learning_rate": 5.065507256792833e-06,
|
|
"loss": 0.40749210119247437,
|
|
"num_tokens": 306083413.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"epoch": 2.5373134328358207,
|
|
"grad_norm": 0.27645786153124524,
|
|
"learning_rate": 5.04210046324763e-06,
|
|
"loss": 0.3960036039352417,
|
|
"num_tokens": 306930925.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 2.544776119402985,
|
|
"grad_norm": 0.2959257579408876,
|
|
"learning_rate": 5.018706181508851e-06,
|
|
"loss": 0.40943804383277893,
|
|
"num_tokens": 307667223.0,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 2.5522388059701493,
|
|
"grad_norm": 0.2941768147406628,
|
|
"learning_rate": 4.995325050809999e-06,
|
|
"loss": 0.42352843284606934,
|
|
"num_tokens": 308548843.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"epoch": 2.5597014925373136,
|
|
"grad_norm": 0.3093404075043933,
|
|
"learning_rate": 4.971957710025235e-06,
|
|
"loss": 0.4167254567146301,
|
|
"num_tokens": 309456869.0,
|
|
"step": 343
|
|
},
|
|
{
|
|
"epoch": 2.5671641791044775,
|
|
"grad_norm": 0.285830294036988,
|
|
"learning_rate": 4.948604797651914e-06,
|
|
"loss": 0.41970574855804443,
|
|
"num_tokens": 310374426.0,
|
|
"step": 344
|
|
},
|
|
{
|
|
"epoch": 2.574626865671642,
|
|
"grad_norm": 0.2822303940696211,
|
|
"learning_rate": 4.925266951793149e-06,
|
|
"loss": 0.39743444323539734,
|
|
"num_tokens": 311185331.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 2.582089552238806,
|
|
"grad_norm": 0.2722209732746419,
|
|
"learning_rate": 4.90194481014037e-06,
|
|
"loss": 0.4093334674835205,
|
|
"num_tokens": 312287344.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 2.58955223880597,
|
|
"grad_norm": 0.3685744506907742,
|
|
"learning_rate": 4.878639009955896e-06,
|
|
"loss": 0.3837957978248596,
|
|
"num_tokens": 313203808.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"epoch": 2.5970149253731343,
|
|
"grad_norm": 0.26210814461472964,
|
|
"learning_rate": 4.855350188055528e-06,
|
|
"loss": 0.374228835105896,
|
|
"num_tokens": 314127724.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"epoch": 2.6044776119402986,
|
|
"grad_norm": 0.26577422679986124,
|
|
"learning_rate": 4.83207898079115e-06,
|
|
"loss": 0.3950842022895813,
|
|
"num_tokens": 315094649.0,
|
|
"step": 349
|
|
},
|
|
{
|
|
"epoch": 2.611940298507463,
|
|
"grad_norm": 0.2694330124125045,
|
|
"learning_rate": 4.808826024033334e-06,
|
|
"loss": 0.3894980251789093,
|
|
"num_tokens": 315902867.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 2.6194029850746268,
|
|
"grad_norm": 0.30012143917049156,
|
|
"learning_rate": 4.785591953153966e-06,
|
|
"loss": 0.3923467695713043,
|
|
"num_tokens": 316809248.0,
|
|
"step": 351
|
|
},
|
|
{
|
|
"epoch": 2.626865671641791,
|
|
"grad_norm": 0.27202743774586025,
|
|
"learning_rate": 4.762377403008895e-06,
|
|
"loss": 0.40671366453170776,
|
|
"num_tokens": 317806785.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 2.6343283582089554,
|
|
"grad_norm": 0.2663498979647159,
|
|
"learning_rate": 4.739183007920572e-06,
|
|
"loss": 0.40148887038230896,
|
|
"num_tokens": 318773135.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"epoch": 2.6417910447761193,
|
|
"grad_norm": 0.26964724456667694,
|
|
"learning_rate": 4.716009401660728e-06,
|
|
"loss": 0.36810237169265747,
|
|
"num_tokens": 319712540.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"epoch": 2.6492537313432836,
|
|
"grad_norm": 0.2745583940218022,
|
|
"learning_rate": 4.69285721743305e-06,
|
|
"loss": 0.3969258666038513,
|
|
"num_tokens": 320623524.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 2.656716417910448,
|
|
"grad_norm": 0.2691069702675602,
|
|
"learning_rate": 4.669727087855886e-06,
|
|
"loss": 0.39531204104423523,
|
|
"num_tokens": 321558026.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"epoch": 2.664179104477612,
|
|
"grad_norm": 0.2790488198361277,
|
|
"learning_rate": 4.646619644944951e-06,
|
|
"loss": 0.3691323399543762,
|
|
"num_tokens": 322457137.0,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 2.671641791044776,
|
|
"grad_norm": 0.25676092193729705,
|
|
"learning_rate": 4.623535520096063e-06,
|
|
"loss": 0.3830498456954956,
|
|
"num_tokens": 323406835.0,
|
|
"step": 358
|
|
},
|
|
{
|
|
"epoch": 2.6791044776119404,
|
|
"grad_norm": 0.27765790893840286,
|
|
"learning_rate": 4.6004753440678894e-06,
|
|
"loss": 0.38582926988601685,
|
|
"num_tokens": 324270762.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"epoch": 2.6865671641791042,
|
|
"grad_norm": 0.2578194748970744,
|
|
"learning_rate": 4.577439746964715e-06,
|
|
"loss": 0.39646175503730774,
|
|
"num_tokens": 325172716.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 2.6940298507462686,
|
|
"grad_norm": 0.26611474982215905,
|
|
"learning_rate": 4.554429358219214e-06,
|
|
"loss": 0.38044852018356323,
|
|
"num_tokens": 326161663.0,
|
|
"step": 361
|
|
},
|
|
{
|
|
"epoch": 2.701492537313433,
|
|
"grad_norm": 0.2670566328628317,
|
|
"learning_rate": 4.531444806575266e-06,
|
|
"loss": 0.40564393997192383,
|
|
"num_tokens": 327106201.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"epoch": 2.708955223880597,
|
|
"grad_norm": 0.274772662861299,
|
|
"learning_rate": 4.508486720070761e-06,
|
|
"loss": 0.39564812183380127,
|
|
"num_tokens": 328050673.0,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 2.716417910447761,
|
|
"grad_norm": 0.3094439511198801,
|
|
"learning_rate": 4.485555726020455e-06,
|
|
"loss": 0.3800423741340637,
|
|
"num_tokens": 328859100.0,
|
|
"step": 364
|
|
},
|
|
{
|
|
"epoch": 2.7238805970149254,
|
|
"grad_norm": 0.2875993414193674,
|
|
"learning_rate": 4.462652450998816e-06,
|
|
"loss": 0.4001840353012085,
|
|
"num_tokens": 329666962.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 2.7313432835820897,
|
|
"grad_norm": 0.27308262203119327,
|
|
"learning_rate": 4.439777520822905e-06,
|
|
"loss": 0.39083579182624817,
|
|
"num_tokens": 330477732.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"epoch": 2.7388059701492535,
|
|
"grad_norm": 0.2708315720399402,
|
|
"learning_rate": 4.416931560535284e-06,
|
|
"loss": 0.39352381229400635,
|
|
"num_tokens": 331330359.0,
|
|
"step": 367
|
|
},
|
|
{
|
|
"epoch": 2.746268656716418,
|
|
"grad_norm": 0.2678850422820554,
|
|
"learning_rate": 4.394115194386928e-06,
|
|
"loss": 0.38045477867126465,
|
|
"num_tokens": 332347647.0,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 2.753731343283582,
|
|
"grad_norm": 0.2753212357157175,
|
|
"learning_rate": 4.371329045820172e-06,
|
|
"loss": 0.3969570994377136,
|
|
"num_tokens": 333284873.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"epoch": 2.7611940298507465,
|
|
"grad_norm": 0.28683339254512785,
|
|
"learning_rate": 4.3485737374516745e-06,
|
|
"loss": 0.4235033392906189,
|
|
"num_tokens": 334098107.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 2.7686567164179103,
|
|
"grad_norm": 0.2698726522878529,
|
|
"learning_rate": 4.3258498910554095e-06,
|
|
"loss": 0.38629546761512756,
|
|
"num_tokens": 334979408.0,
|
|
"step": 371
|
|
},
|
|
{
|
|
"epoch": 2.7761194029850746,
|
|
"grad_norm": 0.2615554761622241,
|
|
"learning_rate": 4.303158127545669e-06,
|
|
"loss": 0.3924221694469452,
|
|
"num_tokens": 335891381.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"epoch": 2.783582089552239,
|
|
"grad_norm": 0.26064429917011145,
|
|
"learning_rate": 4.280499066960102e-06,
|
|
"loss": 0.3906182050704956,
|
|
"num_tokens": 336949128.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"epoch": 2.791044776119403,
|
|
"grad_norm": 0.27127505364411514,
|
|
"learning_rate": 4.257873328442774e-06,
|
|
"loss": 0.3783274292945862,
|
|
"num_tokens": 337776659.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 2.798507462686567,
|
|
"grad_norm": 0.27410164043945023,
|
|
"learning_rate": 4.2352815302272425e-06,
|
|
"loss": 0.3829938471317291,
|
|
"num_tokens": 338685204.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 2.8059701492537314,
|
|
"grad_norm": 0.2706332327188829,
|
|
"learning_rate": 4.212724289619672e-06,
|
|
"loss": 0.37140512466430664,
|
|
"num_tokens": 339492119.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"epoch": 2.8134328358208958,
|
|
"grad_norm": 0.29552966231342986,
|
|
"learning_rate": 4.190202222981959e-06,
|
|
"loss": 0.41518405079841614,
|
|
"num_tokens": 340414044.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"epoch": 2.8208955223880596,
|
|
"grad_norm": 0.4384124363415056,
|
|
"learning_rate": 4.1677159457149005e-06,
|
|
"loss": 0.3670823574066162,
|
|
"num_tokens": 341275739.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"epoch": 2.828358208955224,
|
|
"grad_norm": 0.2818008385366561,
|
|
"learning_rate": 4.145266072241365e-06,
|
|
"loss": 0.38579511642456055,
|
|
"num_tokens": 342203284.0,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 2.835820895522388,
|
|
"grad_norm": 0.26814078006971265,
|
|
"learning_rate": 4.122853215989515e-06,
|
|
"loss": 0.4062846899032593,
|
|
"num_tokens": 343206534.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 2.843283582089552,
|
|
"grad_norm": 0.27452179515826414,
|
|
"learning_rate": 4.1004779893760424e-06,
|
|
"loss": 0.397432416677475,
|
|
"num_tokens": 344154341.0,
|
|
"step": 381
|
|
},
|
|
{
|
|
"epoch": 2.8507462686567164,
|
|
"grad_norm": 0.27288188181425943,
|
|
"learning_rate": 4.078141003789431e-06,
|
|
"loss": 0.391731858253479,
|
|
"num_tokens": 345024971.0,
|
|
"step": 382
|
|
},
|
|
{
|
|
"epoch": 2.8582089552238807,
|
|
"grad_norm": 0.2967872715212152,
|
|
"learning_rate": 4.055842869573256e-06,
|
|
"loss": 0.400160551071167,
|
|
"num_tokens": 345812228.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"epoch": 2.8656716417910446,
|
|
"grad_norm": 0.27985989099065167,
|
|
"learning_rate": 4.0335841960095025e-06,
|
|
"loss": 0.3944920599460602,
|
|
"num_tokens": 346769134.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 2.873134328358209,
|
|
"grad_norm": 0.2548795141867926,
|
|
"learning_rate": 4.011365591301918e-06,
|
|
"loss": 0.404415488243103,
|
|
"num_tokens": 347740543.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 2.8805970149253732,
|
|
"grad_norm": 0.2353554630176529,
|
|
"learning_rate": 3.989187662559397e-06,
|
|
"loss": 0.3925011157989502,
|
|
"num_tokens": 348799551.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"epoch": 2.888059701492537,
|
|
"grad_norm": 0.4371240438139863,
|
|
"learning_rate": 3.967051015779389e-06,
|
|
"loss": 0.394489049911499,
|
|
"num_tokens": 349833256.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"epoch": 2.8955223880597014,
|
|
"grad_norm": 0.492017294414543,
|
|
"learning_rate": 3.944956255831342e-06,
|
|
"loss": 0.3901214003562927,
|
|
"num_tokens": 350675901.0,
|
|
"step": 388
|
|
},
|
|
{
|
|
"epoch": 2.9029850746268657,
|
|
"grad_norm": 0.28604462735158265,
|
|
"learning_rate": 3.922903986440171e-06,
|
|
"loss": 0.3956416845321655,
|
|
"num_tokens": 351593161.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"epoch": 2.91044776119403,
|
|
"grad_norm": 0.3019009320890686,
|
|
"learning_rate": 3.900894810169766e-06,
|
|
"loss": 0.4037666618824005,
|
|
"num_tokens": 352556035.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 2.917910447761194,
|
|
"grad_norm": 0.2929989612906795,
|
|
"learning_rate": 3.878929328406524e-06,
|
|
"loss": 0.38326603174209595,
|
|
"num_tokens": 353175046.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"epoch": 2.925373134328358,
|
|
"grad_norm": 0.2811533155158446,
|
|
"learning_rate": 3.857008141342921e-06,
|
|
"loss": 0.3970789909362793,
|
|
"num_tokens": 354040412.0,
|
|
"step": 392
|
|
},
|
|
{
|
|
"epoch": 2.9328358208955225,
|
|
"grad_norm": 0.2642763742866724,
|
|
"learning_rate": 3.8351318479611045e-06,
|
|
"loss": 0.40754109621047974,
|
|
"num_tokens": 354957977.0,
|
|
"step": 393
|
|
},
|
|
{
|
|
"epoch": 2.9402985074626864,
|
|
"grad_norm": 0.2553969638436942,
|
|
"learning_rate": 3.8133010460165364e-06,
|
|
"loss": 0.3917849361896515,
|
|
"num_tokens": 355897000.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"epoch": 2.9477611940298507,
|
|
"grad_norm": 0.3227768986284808,
|
|
"learning_rate": 3.791516332021651e-06,
|
|
"loss": 0.38059675693511963,
|
|
"num_tokens": 356775946.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 2.955223880597015,
|
|
"grad_norm": 0.26373506539724473,
|
|
"learning_rate": 3.769778301229562e-06,
|
|
"loss": 0.392505407333374,
|
|
"num_tokens": 357732570.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 2.9626865671641793,
|
|
"grad_norm": 0.27141559638214446,
|
|
"learning_rate": 3.748087547617795e-06,
|
|
"loss": 0.38036075234413147,
|
|
"num_tokens": 358510667.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"epoch": 2.970149253731343,
|
|
"grad_norm": 0.24786828522735252,
|
|
"learning_rate": 3.7264446638720542e-06,
|
|
"loss": 0.37426790595054626,
|
|
"num_tokens": 359444745.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"epoch": 2.9776119402985075,
|
|
"grad_norm": 0.25219066519802286,
|
|
"learning_rate": 3.704850241370035e-06,
|
|
"loss": 0.3932304382324219,
|
|
"num_tokens": 360351403.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"epoch": 2.9850746268656714,
|
|
"grad_norm": 0.2314040595153558,
|
|
"learning_rate": 3.6833048701652574e-06,
|
|
"loss": 0.3921104669570923,
|
|
"num_tokens": 361414260.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 2.9925373134328357,
|
|
"grad_norm": 0.24531758323496658,
|
|
"learning_rate": 3.661809138970951e-06,
|
|
"loss": 0.39479339122772217,
|
|
"num_tokens": 362313539.0,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 0.269225436814872,
|
|
"learning_rate": 3.6403636351439577e-06,
|
|
"loss": 0.39549848437309265,
|
|
"num_tokens": 363114852.0,
|
|
"step": 402
|
|
},
|
|
{
|
|
"epoch": 3.0074626865671643,
|
|
"grad_norm": 0.28662511975668975,
|
|
"learning_rate": 3.618968944668696e-06,
|
|
"loss": 0.35942816734313965,
|
|
"num_tokens": 363883703.0,
|
|
"step": 403
|
|
},
|
|
{
|
|
"epoch": 3.014925373134328,
|
|
"grad_norm": 0.2897343949926782,
|
|
"learning_rate": 3.5976256521411402e-06,
|
|
"loss": 0.37709563970565796,
|
|
"num_tokens": 364726957.0,
|
|
"step": 404
|
|
},
|
|
{
|
|
"epoch": 3.0223880597014925,
|
|
"grad_norm": 0.25819303755354905,
|
|
"learning_rate": 3.576334340752847e-06,
|
|
"loss": 0.3720802664756775,
|
|
"num_tokens": 365712205.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 3.029850746268657,
|
|
"grad_norm": 0.28009429409591957,
|
|
"learning_rate": 3.5550955922750275e-06,
|
|
"loss": 0.3992989659309387,
|
|
"num_tokens": 366502371.0,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 3.0373134328358207,
|
|
"grad_norm": 0.2764674226920931,
|
|
"learning_rate": 3.533909987042642e-06,
|
|
"loss": 0.39246252179145813,
|
|
"num_tokens": 367405016.0,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 3.044776119402985,
|
|
"grad_norm": 0.30985373317019865,
|
|
"learning_rate": 3.512778103938542e-06,
|
|
"loss": 0.4023834466934204,
|
|
"num_tokens": 368186973.0,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 3.0522388059701493,
|
|
"grad_norm": 0.28547534212425507,
|
|
"learning_rate": 3.491700520377667e-06,
|
|
"loss": 0.38294538855552673,
|
|
"num_tokens": 369054384.0,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 3.0597014925373136,
|
|
"grad_norm": 0.2749822220227637,
|
|
"learning_rate": 3.470677812291248e-06,
|
|
"loss": 0.3690488636493683,
|
|
"num_tokens": 370021137.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 3.0671641791044775,
|
|
"grad_norm": 0.2617585883370724,
|
|
"learning_rate": 3.4497105541110847e-06,
|
|
"loss": 0.39320921897888184,
|
|
"num_tokens": 370954131.0,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 3.074626865671642,
|
|
"grad_norm": 0.276121676089303,
|
|
"learning_rate": 3.4287993187538445e-06,
|
|
"loss": 0.3605678975582123,
|
|
"num_tokens": 371779138.0,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 3.082089552238806,
|
|
"grad_norm": 0.3190227559580631,
|
|
"learning_rate": 3.407944677605399e-06,
|
|
"loss": 0.408037006855011,
|
|
"num_tokens": 372652437.0,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 3.08955223880597,
|
|
"grad_norm": 0.3764832484269211,
|
|
"learning_rate": 3.387147200505232e-06,
|
|
"loss": 0.38565126061439514,
|
|
"num_tokens": 373477902.0,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 3.0970149253731343,
|
|
"grad_norm": 0.28107007973769577,
|
|
"learning_rate": 3.366407455730849e-06,
|
|
"loss": 0.414955735206604,
|
|
"num_tokens": 374298186.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 3.1044776119402986,
|
|
"grad_norm": 0.2538068604333711,
|
|
"learning_rate": 3.345726009982262e-06,
|
|
"loss": 0.3739873766899109,
|
|
"num_tokens": 375232722.0,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 3.111940298507463,
|
|
"grad_norm": 0.25345140165817104,
|
|
"learning_rate": 3.3251034283664945e-06,
|
|
"loss": 0.39425763487815857,
|
|
"num_tokens": 376192544.0,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 3.1194029850746268,
|
|
"grad_norm": 0.26126693334804235,
|
|
"learning_rate": 3.304540274382151e-06,
|
|
"loss": 0.3673323094844818,
|
|
"num_tokens": 377142524.0,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 3.126865671641791,
|
|
"grad_norm": 0.2718425837582604,
|
|
"learning_rate": 3.284037109904013e-06,
|
|
"loss": 0.38800495862960815,
|
|
"num_tokens": 378076354.0,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 3.1343283582089554,
|
|
"grad_norm": 0.24762599606026042,
|
|
"learning_rate": 3.263594495167688e-06,
|
|
"loss": 0.3551333248615265,
|
|
"num_tokens": 378966330.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 3.1417910447761193,
|
|
"grad_norm": 0.3979931015660995,
|
|
"learning_rate": 3.2432129887543026e-06,
|
|
"loss": 0.3955429196357727,
|
|
"num_tokens": 379888904.0,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 3.1492537313432836,
|
|
"grad_norm": 0.27409522127657593,
|
|
"learning_rate": 3.2228931475752323e-06,
|
|
"loss": 0.35347574949264526,
|
|
"num_tokens": 380738966.0,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 3.156716417910448,
|
|
"grad_norm": 0.26157991571638095,
|
|
"learning_rate": 3.2026355268568987e-06,
|
|
"loss": 0.35351991653442383,
|
|
"num_tokens": 381614529.0,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 3.1641791044776117,
|
|
"grad_norm": 0.253961852327095,
|
|
"learning_rate": 3.1824406801255836e-06,
|
|
"loss": 0.36370548605918884,
|
|
"num_tokens": 382513458.0,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 3.171641791044776,
|
|
"grad_norm": 0.24868042189319053,
|
|
"learning_rate": 3.162309159192316e-06,
|
|
"loss": 0.3607192635536194,
|
|
"num_tokens": 383449861.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 3.1791044776119404,
|
|
"grad_norm": 0.26485700184898936,
|
|
"learning_rate": 3.1422415141377815e-06,
|
|
"loss": 0.3481111228466034,
|
|
"num_tokens": 384253017.0,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 3.1865671641791047,
|
|
"grad_norm": 0.28281284316278155,
|
|
"learning_rate": 3.122238293297305e-06,
|
|
"loss": 0.3816152811050415,
|
|
"num_tokens": 385257443.0,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 3.1940298507462686,
|
|
"grad_norm": 0.2628707804556158,
|
|
"learning_rate": 3.10230004324586e-06,
|
|
"loss": 0.349966824054718,
|
|
"num_tokens": 386017753.0,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 3.201492537313433,
|
|
"grad_norm": 0.2606711695382564,
|
|
"learning_rate": 3.0824273087831335e-06,
|
|
"loss": 0.38945478200912476,
|
|
"num_tokens": 386978912.0,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 3.208955223880597,
|
|
"grad_norm": 0.2747230623623624,
|
|
"learning_rate": 3.062620632918648e-06,
|
|
"loss": 0.3638556897640228,
|
|
"num_tokens": 387852467.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 3.216417910447761,
|
|
"grad_norm": 0.2803007110615389,
|
|
"learning_rate": 3.0428805568569076e-06,
|
|
"loss": 0.38482367992401123,
|
|
"num_tokens": 388658923.0,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 3.2238805970149254,
|
|
"grad_norm": 0.2645967994643593,
|
|
"learning_rate": 3.023207619982629e-06,
|
|
"loss": 0.36384740471839905,
|
|
"num_tokens": 389508858.0,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 3.2313432835820897,
|
|
"grad_norm": 0.27202749711662244,
|
|
"learning_rate": 3.0036023598459895e-06,
|
|
"loss": 0.39492571353912354,
|
|
"num_tokens": 390450838.0,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 3.2388059701492535,
|
|
"grad_norm": 0.2858842639475798,
|
|
"learning_rate": 2.9840653121479478e-06,
|
|
"loss": 0.3738439679145813,
|
|
"num_tokens": 391283207.0,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 3.246268656716418,
|
|
"grad_norm": 0.24793258551891303,
|
|
"learning_rate": 2.9645970107255997e-06,
|
|
"loss": 0.35694074630737305,
|
|
"num_tokens": 392285292.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 3.253731343283582,
|
|
"grad_norm": 0.2819278079547717,
|
|
"learning_rate": 2.9451979875375913e-06,
|
|
"loss": 0.3710547387599945,
|
|
"num_tokens": 393145041.0,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 3.2611940298507465,
|
|
"grad_norm": 0.2631494530375275,
|
|
"learning_rate": 2.925868772649591e-06,
|
|
"loss": 0.3825373351573944,
|
|
"num_tokens": 394022264.0,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 3.2686567164179103,
|
|
"grad_norm": 0.2555768738323888,
|
|
"learning_rate": 2.9066098942197995e-06,
|
|
"loss": 0.36353516578674316,
|
|
"num_tokens": 394892104.0,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 3.2761194029850746,
|
|
"grad_norm": 0.252531665467931,
|
|
"learning_rate": 2.887421878484516e-06,
|
|
"loss": 0.38284653425216675,
|
|
"num_tokens": 395835092.0,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 3.283582089552239,
|
|
"grad_norm": 0.2845282411268666,
|
|
"learning_rate": 2.8683052497437665e-06,
|
|
"loss": 0.3927590548992157,
|
|
"num_tokens": 396725722.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 3.291044776119403,
|
|
"grad_norm": 0.26812964504110554,
|
|
"learning_rate": 2.8492605303469732e-06,
|
|
"loss": 0.37616321444511414,
|
|
"num_tokens": 397618546.0,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 3.298507462686567,
|
|
"grad_norm": 0.25144632819615587,
|
|
"learning_rate": 2.8302882406786817e-06,
|
|
"loss": 0.382343053817749,
|
|
"num_tokens": 398571441.0,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 3.3059701492537314,
|
|
"grad_norm": 0.29981470486255846,
|
|
"learning_rate": 2.811388899144345e-06,
|
|
"loss": 0.3775964379310608,
|
|
"num_tokens": 399409770.0,
|
|
"step": 443
|
|
},
|
|
{
|
|
"epoch": 3.3134328358208958,
|
|
"grad_norm": 0.37890241833609745,
|
|
"learning_rate": 2.7925630221561506e-06,
|
|
"loss": 0.37770912051200867,
|
|
"num_tokens": 400392695.0,
|
|
"step": 444
|
|
},
|
|
{
|
|
"epoch": 3.3208955223880596,
|
|
"grad_norm": 0.2755196059695862,
|
|
"learning_rate": 2.7738111241189185e-06,
|
|
"loss": 0.3694460690021515,
|
|
"num_tokens": 401345623.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 3.328358208955224,
|
|
"grad_norm": 0.2680514510877795,
|
|
"learning_rate": 2.755133717416043e-06,
|
|
"loss": 0.3776453137397766,
|
|
"num_tokens": 402260500.0,
|
|
"step": 446
|
|
},
|
|
{
|
|
"epoch": 3.3358208955223883,
|
|
"grad_norm": 0.24171155783588386,
|
|
"learning_rate": 2.7365313123954916e-06,
|
|
"loss": 0.3985833525657654,
|
|
"num_tokens": 403276687.0,
|
|
"step": 447
|
|
},
|
|
{
|
|
"epoch": 3.343283582089552,
|
|
"grad_norm": 0.2569574542175994,
|
|
"learning_rate": 2.718004417355855e-06,
|
|
"loss": 0.3654242157936096,
|
|
"num_tokens": 404190134.0,
|
|
"step": 448
|
|
},
|
|
{
|
|
"epoch": 3.3507462686567164,
|
|
"grad_norm": 0.2493193792220214,
|
|
"learning_rate": 2.699553538532467e-06,
|
|
"loss": 0.3807545006275177,
|
|
"num_tokens": 405215802.0,
|
|
"step": 449
|
|
},
|
|
{
|
|
"epoch": 3.3582089552238807,
|
|
"grad_norm": 0.35218345161133224,
|
|
"learning_rate": 2.6811791800835684e-06,
|
|
"loss": 0.37028026580810547,
|
|
"num_tokens": 406189028.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 3.3656716417910446,
|
|
"grad_norm": 0.2630947673101325,
|
|
"learning_rate": 2.662881844076527e-06,
|
|
"loss": 0.3866269886493683,
|
|
"num_tokens": 407112961.0,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 3.373134328358209,
|
|
"grad_norm": 0.24502727833486168,
|
|
"learning_rate": 2.6446620304741267e-06,
|
|
"loss": 0.3389516770839691,
|
|
"num_tokens": 407955720.0,
|
|
"step": 452
|
|
},
|
|
{
|
|
"epoch": 3.3805970149253732,
|
|
"grad_norm": 0.29642792873153473,
|
|
"learning_rate": 2.6265202371208985e-06,
|
|
"loss": 0.3727038502693176,
|
|
"num_tokens": 408861534.0,
|
|
"step": 453
|
|
},
|
|
{
|
|
"epoch": 3.388059701492537,
|
|
"grad_norm": 0.2729055281837691,
|
|
"learning_rate": 2.6084569597295227e-06,
|
|
"loss": 0.37226539850234985,
|
|
"num_tokens": 409769033.0,
|
|
"step": 454
|
|
},
|
|
{
|
|
"epoch": 3.3955223880597014,
|
|
"grad_norm": 0.2591963730270879,
|
|
"learning_rate": 2.590472691867284e-06,
|
|
"loss": 0.3665540814399719,
|
|
"num_tokens": 410734429.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 3.4029850746268657,
|
|
"grad_norm": 0.24603379464247438,
|
|
"learning_rate": 2.57256792494258e-06,
|
|
"loss": 0.3557394742965698,
|
|
"num_tokens": 411668760.0,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 3.41044776119403,
|
|
"grad_norm": 0.26710941082613454,
|
|
"learning_rate": 2.5547431481914973e-06,
|
|
"loss": 0.3810808062553406,
|
|
"num_tokens": 412593612.0,
|
|
"step": 457
|
|
},
|
|
{
|
|
"epoch": 3.417910447761194,
|
|
"grad_norm": 0.24517969588523647,
|
|
"learning_rate": 2.536998848664445e-06,
|
|
"loss": 0.36506032943725586,
|
|
"num_tokens": 413566574.0,
|
|
"step": 458
|
|
},
|
|
{
|
|
"epoch": 3.425373134328358,
|
|
"grad_norm": 0.26080308677293107,
|
|
"learning_rate": 2.5193355112128436e-06,
|
|
"loss": 0.375240683555603,
|
|
"num_tokens": 414490201.0,
|
|
"step": 459
|
|
},
|
|
{
|
|
"epoch": 3.4328358208955225,
|
|
"grad_norm": 0.2507828857248926,
|
|
"learning_rate": 2.501753618475877e-06,
|
|
"loss": 0.3682469129562378,
|
|
"num_tokens": 415392501.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 3.4402985074626864,
|
|
"grad_norm": 0.2692051104680508,
|
|
"learning_rate": 2.4842536508673087e-06,
|
|
"loss": 0.37688201665878296,
|
|
"num_tokens": 416317197.0,
|
|
"step": 461
|
|
},
|
|
{
|
|
"epoch": 3.4477611940298507,
|
|
"grad_norm": 0.2549135365443059,
|
|
"learning_rate": 2.466836086562345e-06,
|
|
"loss": 0.36603114008903503,
|
|
"num_tokens": 417156988.0,
|
|
"step": 462
|
|
},
|
|
{
|
|
"epoch": 3.455223880597015,
|
|
"grad_norm": 0.2453940193702825,
|
|
"learning_rate": 2.4495014014845807e-06,
|
|
"loss": 0.3681268095970154,
|
|
"num_tokens": 418076187.0,
|
|
"step": 463
|
|
},
|
|
{
|
|
"epoch": 3.4626865671641793,
|
|
"grad_norm": 0.2725566155237126,
|
|
"learning_rate": 2.432250069292989e-06,
|
|
"loss": 0.37921467423439026,
|
|
"num_tokens": 418901462.0,
|
|
"step": 464
|
|
},
|
|
{
|
|
"epoch": 3.470149253731343,
|
|
"grad_norm": 0.25907951334448015,
|
|
"learning_rate": 2.415082561368979e-06,
|
|
"loss": 0.39200738072395325,
|
|
"num_tokens": 419804291.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 3.4776119402985075,
|
|
"grad_norm": 0.26406315997541896,
|
|
"learning_rate": 2.397999346803518e-06,
|
|
"loss": 0.39208582043647766,
|
|
"num_tokens": 420712064.0,
|
|
"step": 466
|
|
},
|
|
{
|
|
"epoch": 3.485074626865672,
|
|
"grad_norm": 0.23773901962622077,
|
|
"learning_rate": 2.3810008923843077e-06,
|
|
"loss": 0.37207821011543274,
|
|
"num_tokens": 421699792.0,
|
|
"step": 467
|
|
},
|
|
{
|
|
"epoch": 3.4925373134328357,
|
|
"grad_norm": 0.2479152678036227,
|
|
"learning_rate": 2.3640876625830385e-06,
|
|
"loss": 0.37208831310272217,
|
|
"num_tokens": 422643169.0,
|
|
"step": 468
|
|
},
|
|
{
|
|
"epoch": 3.5,
|
|
"grad_norm": 0.2563637058550244,
|
|
"learning_rate": 2.347260119542692e-06,
|
|
"loss": 0.378294974565506,
|
|
"num_tokens": 423633161.0,
|
|
"step": 469
|
|
},
|
|
{
|
|
"epoch": 3.5074626865671643,
|
|
"grad_norm": 0.2606090648417702,
|
|
"learning_rate": 2.3305187230649177e-06,
|
|
"loss": 0.3819723129272461,
|
|
"num_tokens": 424556814.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 3.5149253731343286,
|
|
"grad_norm": 0.251352839735243,
|
|
"learning_rate": 2.3138639305974596e-06,
|
|
"loss": 0.37940090894699097,
|
|
"num_tokens": 425479906.0,
|
|
"step": 471
|
|
},
|
|
{
|
|
"epoch": 3.5223880597014925,
|
|
"grad_norm": 0.24680617583096287,
|
|
"learning_rate": 2.2972961972216703e-06,
|
|
"loss": 0.3712913393974304,
|
|
"num_tokens": 426446651.0,
|
|
"step": 472
|
|
},
|
|
{
|
|
"epoch": 3.529850746268657,
|
|
"grad_norm": 0.25068376553010957,
|
|
"learning_rate": 2.2808159756400667e-06,
|
|
"loss": 0.36781617999076843,
|
|
"num_tokens": 427310770.0,
|
|
"step": 473
|
|
},
|
|
{
|
|
"epoch": 3.5373134328358207,
|
|
"grad_norm": 0.2575081517211329,
|
|
"learning_rate": 2.264423716163962e-06,
|
|
"loss": 0.38692015409469604,
|
|
"num_tokens": 428270355.0,
|
|
"step": 474
|
|
},
|
|
{
|
|
"epoch": 3.544776119402985,
|
|
"grad_norm": 0.26353888813152593,
|
|
"learning_rate": 2.2481198667011675e-06,
|
|
"loss": 0.4076312184333801,
|
|
"num_tokens": 429240026.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 3.5522388059701493,
|
|
"grad_norm": 0.24599771689956054,
|
|
"learning_rate": 2.231904872743739e-06,
|
|
"loss": 0.3803725838661194,
|
|
"num_tokens": 430167582.0,
|
|
"step": 476
|
|
},
|
|
{
|
|
"epoch": 3.5597014925373136,
|
|
"grad_norm": 0.24639104277677626,
|
|
"learning_rate": 2.2157791773558222e-06,
|
|
"loss": 0.3705400228500366,
|
|
"num_tokens": 431118645.0,
|
|
"step": 477
|
|
},
|
|
{
|
|
"epoch": 3.5671641791044775,
|
|
"grad_norm": 0.25169470907126656,
|
|
"learning_rate": 2.199743221161533e-06,
|
|
"loss": 0.40112996101379395,
|
|
"num_tokens": 432105903.0,
|
|
"step": 478
|
|
},
|
|
{
|
|
"epoch": 3.574626865671642,
|
|
"grad_norm": 0.2494015959735466,
|
|
"learning_rate": 2.1837974423329254e-06,
|
|
"loss": 0.37427645921707153,
|
|
"num_tokens": 432968989.0,
|
|
"step": 479
|
|
},
|
|
{
|
|
"epoch": 3.582089552238806,
|
|
"grad_norm": 0.2510312087393284,
|
|
"learning_rate": 2.1679422765780115e-06,
|
|
"loss": 0.3761802613735199,
|
|
"num_tokens": 433879607.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 3.58955223880597,
|
|
"grad_norm": 0.2472662036793444,
|
|
"learning_rate": 2.152178157128865e-06,
|
|
"loss": 0.37739771604537964,
|
|
"num_tokens": 434793981.0,
|
|
"step": 481
|
|
},
|
|
{
|
|
"epoch": 3.5970149253731343,
|
|
"grad_norm": 0.2526868205714577,
|
|
"learning_rate": 2.136505514729774e-06,
|
|
"loss": 0.3701442778110504,
|
|
"num_tokens": 435697283.0,
|
|
"step": 482
|
|
},
|
|
{
|
|
"epoch": 3.6044776119402986,
|
|
"grad_norm": 0.2516013371512894,
|
|
"learning_rate": 2.1209247776254795e-06,
|
|
"loss": 0.3924868106842041,
|
|
"num_tokens": 436627533.0,
|
|
"step": 483
|
|
},
|
|
{
|
|
"epoch": 3.611940298507463,
|
|
"grad_norm": 0.24502502103917492,
|
|
"learning_rate": 2.1054363715494695e-06,
|
|
"loss": 0.34178441762924194,
|
|
"num_tokens": 437481939.0,
|
|
"step": 484
|
|
},
|
|
{
|
|
"epoch": 3.6194029850746268,
|
|
"grad_norm": 0.26412516983013123,
|
|
"learning_rate": 2.0900407197123444e-06,
|
|
"loss": 0.3800678253173828,
|
|
"num_tokens": 438276274.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 3.626865671641791,
|
|
"grad_norm": 0.2650046240456923,
|
|
"learning_rate": 2.0747382427902574e-06,
|
|
"loss": 0.4031677544116974,
|
|
"num_tokens": 439089480.0,
|
|
"step": 486
|
|
},
|
|
{
|
|
"epoch": 3.6343283582089554,
|
|
"grad_norm": 0.2587379164469128,
|
|
"learning_rate": 2.059529358913418e-06,
|
|
"loss": 0.37271153926849365,
|
|
"num_tokens": 439983559.0,
|
|
"step": 487
|
|
},
|
|
{
|
|
"epoch": 3.6417910447761193,
|
|
"grad_norm": 0.2540913543502109,
|
|
"learning_rate": 2.0444144836546684e-06,
|
|
"loss": 0.3822531998157501,
|
|
"num_tokens": 440850324.0,
|
|
"step": 488
|
|
},
|
|
{
|
|
"epoch": 3.6492537313432836,
|
|
"grad_norm": 0.27783327558446214,
|
|
"learning_rate": 2.0293940300181216e-06,
|
|
"loss": 0.3831808269023895,
|
|
"num_tokens": 441605590.0,
|
|
"step": 489
|
|
},
|
|
{
|
|
"epoch": 3.656716417910448,
|
|
"grad_norm": 0.2796967269697153,
|
|
"learning_rate": 2.0144684084278847e-06,
|
|
"loss": 0.3709692060947418,
|
|
"num_tokens": 442348946.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 3.664179104477612,
|
|
"grad_norm": 0.2465314987769435,
|
|
"learning_rate": 1.999638026716842e-06,
|
|
"loss": 0.35937702655792236,
|
|
"num_tokens": 443300971.0,
|
|
"step": 491
|
|
},
|
|
{
|
|
"epoch": 3.671641791044776,
|
|
"grad_norm": 0.24683809772269052,
|
|
"learning_rate": 1.9849032901155075e-06,
|
|
"loss": 0.39329999685287476,
|
|
"num_tokens": 444301774.0,
|
|
"step": 492
|
|
},
|
|
{
|
|
"epoch": 3.6791044776119404,
|
|
"grad_norm": 0.23859031932692393,
|
|
"learning_rate": 1.970264601240958e-06,
|
|
"loss": 0.3722185492515564,
|
|
"num_tokens": 445224414.0,
|
|
"step": 493
|
|
},
|
|
{
|
|
"epoch": 3.6865671641791042,
|
|
"grad_norm": 0.2707873095537451,
|
|
"learning_rate": 1.955722360085824e-06,
|
|
"loss": 0.38121020793914795,
|
|
"num_tokens": 446138719.0,
|
|
"step": 494
|
|
},
|
|
{
|
|
"epoch": 3.6940298507462686,
|
|
"grad_norm": 0.27134162465622047,
|
|
"learning_rate": 1.941276964007369e-06,
|
|
"loss": 0.41704389452934265,
|
|
"num_tokens": 447027595.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 3.701492537313433,
|
|
"grad_norm": 0.27102779980384334,
|
|
"learning_rate": 1.9269288077166264e-06,
|
|
"loss": 0.41601014137268066,
|
|
"num_tokens": 447918016.0,
|
|
"step": 496
|
|
},
|
|
{
|
|
"epoch": 3.708955223880597,
|
|
"grad_norm": 0.2795343486481597,
|
|
"learning_rate": 1.9126782832676175e-06,
|
|
"loss": 0.37963247299194336,
|
|
"num_tokens": 448782123.0,
|
|
"step": 497
|
|
},
|
|
{
|
|
"epoch": 3.716417910447761,
|
|
"grad_norm": 0.24533152172023073,
|
|
"learning_rate": 1.898525780046635e-06,
|
|
"loss": 0.37255096435546875,
|
|
"num_tokens": 449735295.0,
|
|
"step": 498
|
|
},
|
|
{
|
|
"epoch": 3.7238805970149254,
|
|
"grad_norm": 0.25068064600084505,
|
|
"learning_rate": 1.8844716847616053e-06,
|
|
"loss": 0.3953704237937927,
|
|
"num_tokens": 450703519.0,
|
|
"step": 499
|
|
},
|
|
{
|
|
"epoch": 3.7313432835820897,
|
|
"grad_norm": 0.27826952511668485,
|
|
"learning_rate": 1.870516381431523e-06,
|
|
"loss": 0.37893447279930115,
|
|
"num_tokens": 451523722.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 3.7388059701492535,
|
|
"grad_norm": 0.2470705912133386,
|
|
"learning_rate": 1.8566602513759573e-06,
|
|
"loss": 0.36960500478744507,
|
|
"num_tokens": 452496914.0,
|
|
"step": 501
|
|
},
|
|
{
|
|
"epoch": 3.746268656716418,
|
|
"grad_norm": 0.2380353729045607,
|
|
"learning_rate": 1.8429036732046328e-06,
|
|
"loss": 0.3598456084728241,
|
|
"num_tokens": 453486873.0,
|
|
"step": 502
|
|
},
|
|
{
|
|
"epoch": 3.753731343283582,
|
|
"grad_norm": 0.24753875738528466,
|
|
"learning_rate": 1.8292470228070808e-06,
|
|
"loss": 0.3775923550128937,
|
|
"num_tokens": 454415514.0,
|
|
"step": 503
|
|
},
|
|
{
|
|
"epoch": 3.7611940298507465,
|
|
"grad_norm": 0.24852622318044526,
|
|
"learning_rate": 1.815690673342374e-06,
|
|
"loss": 0.377275288105011,
|
|
"num_tokens": 455330400.0,
|
|
"step": 504
|
|
},
|
|
{
|
|
"epoch": 3.7686567164179103,
|
|
"grad_norm": 0.24830439594342327,
|
|
"learning_rate": 1.8022349952289275e-06,
|
|
"loss": 0.3592768907546997,
|
|
"num_tokens": 456232858.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 3.7761194029850746,
|
|
"grad_norm": 0.2661718758635726,
|
|
"learning_rate": 1.7888803561343755e-06,
|
|
"loss": 0.3917810320854187,
|
|
"num_tokens": 457091321.0,
|
|
"step": 506
|
|
},
|
|
{
|
|
"epoch": 3.783582089552239,
|
|
"grad_norm": 0.2652414658319871,
|
|
"learning_rate": 1.7756271209655296e-06,
|
|
"loss": 0.41377222537994385,
|
|
"num_tokens": 457990573.0,
|
|
"step": 507
|
|
},
|
|
{
|
|
"epoch": 3.791044776119403,
|
|
"grad_norm": 0.260047413567863,
|
|
"learning_rate": 1.7624756518584015e-06,
|
|
"loss": 0.3786197304725647,
|
|
"num_tokens": 458827375.0,
|
|
"step": 508
|
|
},
|
|
{
|
|
"epoch": 3.798507462686567,
|
|
"grad_norm": 0.24921975710502509,
|
|
"learning_rate": 1.7494263081683134e-06,
|
|
"loss": 0.36924827098846436,
|
|
"num_tokens": 459694321.0,
|
|
"step": 509
|
|
},
|
|
{
|
|
"epoch": 3.8059701492537314,
|
|
"grad_norm": 0.24376539520051552,
|
|
"learning_rate": 1.736479446460081e-06,
|
|
"loss": 0.3597017526626587,
|
|
"num_tokens": 460616396.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 3.8134328358208958,
|
|
"grad_norm": 0.24365917664342365,
|
|
"learning_rate": 1.723635420498259e-06,
|
|
"loss": 0.36935943365097046,
|
|
"num_tokens": 461530829.0,
|
|
"step": 511
|
|
},
|
|
{
|
|
"epoch": 3.8208955223880596,
|
|
"grad_norm": 0.23932370443954964,
|
|
"learning_rate": 1.7108945812374874e-06,
|
|
"loss": 0.387093722820282,
|
|
"num_tokens": 462464505.0,
|
|
"step": 512
|
|
},
|
|
{
|
|
"epoch": 3.828358208955224,
|
|
"grad_norm": 0.257078997056124,
|
|
"learning_rate": 1.6982572768128964e-06,
|
|
"loss": 0.38530057668685913,
|
|
"num_tokens": 463398691.0,
|
|
"step": 513
|
|
},
|
|
{
|
|
"epoch": 3.835820895522388,
|
|
"grad_norm": 0.24718882255890465,
|
|
"learning_rate": 1.6857238525305924e-06,
|
|
"loss": 0.3774847388267517,
|
|
"num_tokens": 464295344.0,
|
|
"step": 514
|
|
},
|
|
{
|
|
"epoch": 3.843283582089552,
|
|
"grad_norm": 0.23460337795500458,
|
|
"learning_rate": 1.6732946508582288e-06,
|
|
"loss": 0.3643302619457245,
|
|
"num_tokens": 465251963.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 3.8507462686567164,
|
|
"grad_norm": 0.23769889055431628,
|
|
"learning_rate": 1.6609700114156368e-06,
|
|
"loss": 0.3710617423057556,
|
|
"num_tokens": 466213047.0,
|
|
"step": 516
|
|
},
|
|
{
|
|
"epoch": 3.8582089552238807,
|
|
"grad_norm": 0.23396843344896867,
|
|
"learning_rate": 1.6487502709655591e-06,
|
|
"loss": 0.382940411567688,
|
|
"num_tokens": 467245768.0,
|
|
"step": 517
|
|
},
|
|
{
|
|
"epoch": 3.8656716417910446,
|
|
"grad_norm": 0.23909686285484746,
|
|
"learning_rate": 1.6366357634044406e-06,
|
|
"loss": 0.3723403215408325,
|
|
"num_tokens": 468129089.0,
|
|
"step": 518
|
|
},
|
|
{
|
|
"epoch": 3.873134328358209,
|
|
"grad_norm": 0.2688981703218909,
|
|
"learning_rate": 1.6246268197533046e-06,
|
|
"loss": 0.3829047381877899,
|
|
"num_tokens": 468938058.0,
|
|
"step": 519
|
|
},
|
|
{
|
|
"epoch": 3.8805970149253732,
|
|
"grad_norm": 0.25519957310879615,
|
|
"learning_rate": 1.6127237681487096e-06,
|
|
"loss": 0.39446866512298584,
|
|
"num_tokens": 469847619.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 3.888059701492537,
|
|
"grad_norm": 0.2486366229197261,
|
|
"learning_rate": 1.6009269338337832e-06,
|
|
"loss": 0.3983200788497925,
|
|
"num_tokens": 470791148.0,
|
|
"step": 521
|
|
},
|
|
{
|
|
"epoch": 3.8955223880597014,
|
|
"grad_norm": 0.24830658428540756,
|
|
"learning_rate": 1.5892366391493363e-06,
|
|
"loss": 0.38877153396606445,
|
|
"num_tokens": 471735636.0,
|
|
"step": 522
|
|
},
|
|
{
|
|
"epoch": 3.9029850746268657,
|
|
"grad_norm": 0.24923977507707654,
|
|
"learning_rate": 1.5776532035250513e-06,
|
|
"loss": 0.37799936532974243,
|
|
"num_tokens": 472685312.0,
|
|
"step": 523
|
|
},
|
|
{
|
|
"epoch": 3.91044776119403,
|
|
"grad_norm": 0.23192614084158372,
|
|
"learning_rate": 1.5661769434707585e-06,
|
|
"loss": 0.36345481872558594,
|
|
"num_tokens": 473551908.0,
|
|
"step": 524
|
|
},
|
|
{
|
|
"epoch": 3.917910447761194,
|
|
"grad_norm": 0.2552136498693883,
|
|
"learning_rate": 1.5548081725677843e-06,
|
|
"loss": 0.38905611634254456,
|
|
"num_tokens": 474411763.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 3.925373134328358,
|
|
"grad_norm": 0.24222236291728852,
|
|
"learning_rate": 1.543547201460384e-06,
|
|
"loss": 0.39437806606292725,
|
|
"num_tokens": 475386853.0,
|
|
"step": 526
|
|
},
|
|
{
|
|
"epoch": 3.9328358208955225,
|
|
"grad_norm": 0.24678201558159044,
|
|
"learning_rate": 1.5323943378472547e-06,
|
|
"loss": 0.38338255882263184,
|
|
"num_tokens": 476308351.0,
|
|
"step": 527
|
|
},
|
|
{
|
|
"epoch": 3.9402985074626864,
|
|
"grad_norm": 0.24156858852006619,
|
|
"learning_rate": 1.5213498864731266e-06,
|
|
"loss": 0.3475341796875,
|
|
"num_tokens": 477113932.0,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 3.9477611940298507,
|
|
"grad_norm": 0.2450649252841632,
|
|
"learning_rate": 1.510414149120436e-06,
|
|
"loss": 0.3621699810028076,
|
|
"num_tokens": 477978986.0,
|
|
"step": 529
|
|
},
|
|
{
|
|
"epoch": 3.955223880597015,
|
|
"grad_norm": 0.2615934671849586,
|
|
"learning_rate": 1.4995874246010778e-06,
|
|
"loss": 0.39790230989456177,
|
|
"num_tokens": 478804801.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 3.9626865671641793,
|
|
"grad_norm": 0.23841246285008993,
|
|
"learning_rate": 1.4888700087482447e-06,
|
|
"loss": 0.36489465832710266,
|
|
"num_tokens": 479744154.0,
|
|
"step": 531
|
|
},
|
|
{
|
|
"epoch": 3.970149253731343,
|
|
"grad_norm": 0.23884234571084306,
|
|
"learning_rate": 1.4782621944083395e-06,
|
|
"loss": 0.3676777482032776,
|
|
"num_tokens": 480672910.0,
|
|
"step": 532
|
|
},
|
|
{
|
|
"epoch": 3.9776119402985075,
|
|
"grad_norm": 0.24521642019046497,
|
|
"learning_rate": 1.4677642714329772e-06,
|
|
"loss": 0.36571812629699707,
|
|
"num_tokens": 481542586.0,
|
|
"step": 533
|
|
},
|
|
{
|
|
"epoch": 3.9850746268656714,
|
|
"grad_norm": 0.2490357512875355,
|
|
"learning_rate": 1.45737652667106e-06,
|
|
"loss": 0.3776237964630127,
|
|
"num_tokens": 482388483.0,
|
|
"step": 534
|
|
},
|
|
{
|
|
"epoch": 3.9925373134328357,
|
|
"grad_norm": 0.26895614724288625,
|
|
"learning_rate": 1.4470992439609447e-06,
|
|
"loss": 0.36370331048965454,
|
|
"num_tokens": 483130281.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"grad_norm": 0.23598448132329167,
|
|
"learning_rate": 1.4369327041226832e-06,
|
|
"loss": 0.3770376443862915,
|
|
"num_tokens": 484157211.0,
|
|
"step": 536
|
|
},
|
|
{
|
|
"epoch": 4.007462686567164,
|
|
"grad_norm": 0.2696832935027054,
|
|
"learning_rate": 1.4268771849503507e-06,
|
|
"loss": 0.3495013117790222,
|
|
"num_tokens": 484950425.0,
|
|
"step": 537
|
|
},
|
|
{
|
|
"epoch": 4.014925373134329,
|
|
"grad_norm": 0.2523061504872546,
|
|
"learning_rate": 1.416932961204457e-06,
|
|
"loss": 0.35033246874809265,
|
|
"num_tokens": 485897373.0,
|
|
"step": 538
|
|
},
|
|
{
|
|
"epoch": 4.022388059701493,
|
|
"grad_norm": 0.24871017609979634,
|
|
"learning_rate": 1.4071003046044324e-06,
|
|
"loss": 0.3654225468635559,
|
|
"num_tokens": 486751466.0,
|
|
"step": 539
|
|
},
|
|
{
|
|
"epoch": 4.029850746268656,
|
|
"grad_norm": 0.23941923046022578,
|
|
"learning_rate": 1.3973794838212124e-06,
|
|
"loss": 0.36163097620010376,
|
|
"num_tokens": 487741373.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 4.037313432835821,
|
|
"grad_norm": 0.2662894021736037,
|
|
"learning_rate": 1.3877707644698895e-06,
|
|
"loss": 0.3875274062156677,
|
|
"num_tokens": 488582397.0,
|
|
"step": 541
|
|
},
|
|
{
|
|
"epoch": 4.044776119402985,
|
|
"grad_norm": 0.2747015512526315,
|
|
"learning_rate": 1.3782744091024586e-06,
|
|
"loss": 0.3777075409889221,
|
|
"num_tokens": 489319854.0,
|
|
"step": 542
|
|
},
|
|
{
|
|
"epoch": 4.052238805970149,
|
|
"grad_norm": 0.2525490812172139,
|
|
"learning_rate": 1.3688906772006393e-06,
|
|
"loss": 0.36404550075531006,
|
|
"num_tokens": 490257709.0,
|
|
"step": 543
|
|
},
|
|
{
|
|
"epoch": 4.059701492537314,
|
|
"grad_norm": 0.24847635532681875,
|
|
"learning_rate": 1.359619825168792e-06,
|
|
"loss": 0.36995524168014526,
|
|
"num_tokens": 491153491.0,
|
|
"step": 544
|
|
},
|
|
{
|
|
"epoch": 4.067164179104478,
|
|
"grad_norm": 0.23960840738937653,
|
|
"learning_rate": 1.3504621063269058e-06,
|
|
"loss": 0.36562579870224,
|
|
"num_tokens": 492103168.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 4.074626865671641,
|
|
"grad_norm": 0.25453231573026114,
|
|
"learning_rate": 1.3414177709036802e-06,
|
|
"loss": 0.36385661363601685,
|
|
"num_tokens": 493050344.0,
|
|
"step": 546
|
|
},
|
|
{
|
|
"epoch": 4.082089552238806,
|
|
"grad_norm": 0.2409618977265192,
|
|
"learning_rate": 1.3324870660296869e-06,
|
|
"loss": 0.34029990434646606,
|
|
"num_tokens": 493993937.0,
|
|
"step": 547
|
|
},
|
|
{
|
|
"epoch": 4.08955223880597,
|
|
"grad_norm": 0.23732030399559195,
|
|
"learning_rate": 1.3236702357306157e-06,
|
|
"loss": 0.37044817209243774,
|
|
"num_tokens": 494995752.0,
|
|
"step": 548
|
|
},
|
|
{
|
|
"epoch": 4.097014925373134,
|
|
"grad_norm": 0.27145957936067777,
|
|
"learning_rate": 1.3149675209206086e-06,
|
|
"loss": 0.36308181285858154,
|
|
"num_tokens": 495757177.0,
|
|
"step": 549
|
|
},
|
|
{
|
|
"epoch": 4.104477611940299,
|
|
"grad_norm": 0.2833034141091316,
|
|
"learning_rate": 1.3063791593956758e-06,
|
|
"loss": 0.37331539392471313,
|
|
"num_tokens": 496689668.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 4.111940298507463,
|
|
"grad_norm": 0.240823460931463,
|
|
"learning_rate": 1.2979053858271995e-06,
|
|
"loss": 0.36020007729530334,
|
|
"num_tokens": 497565858.0,
|
|
"step": 551
|
|
},
|
|
{
|
|
"epoch": 4.119402985074627,
|
|
"grad_norm": 0.2594604561644764,
|
|
"learning_rate": 1.2895464317555206e-06,
|
|
"loss": 0.3884323239326477,
|
|
"num_tokens": 498385563.0,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 4.126865671641791,
|
|
"grad_norm": 0.23073517132438157,
|
|
"learning_rate": 1.2813025255836104e-06,
|
|
"loss": 0.349163293838501,
|
|
"num_tokens": 499323100.0,
|
|
"step": 553
|
|
},
|
|
{
|
|
"epoch": 4.134328358208955,
|
|
"grad_norm": 0.2603679958630556,
|
|
"learning_rate": 1.2731738925708328e-06,
|
|
"loss": 0.36741840839385986,
|
|
"num_tokens": 500196622.0,
|
|
"step": 554
|
|
},
|
|
{
|
|
"epoch": 4.141791044776119,
|
|
"grad_norm": 0.24326119145979633,
|
|
"learning_rate": 1.2651607548267873e-06,
|
|
"loss": 0.3810882568359375,
|
|
"num_tokens": 501224710.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 4.149253731343284,
|
|
"grad_norm": 0.22934377798425087,
|
|
"learning_rate": 1.257263331305241e-06,
|
|
"loss": 0.37762486934661865,
|
|
"num_tokens": 502305655.0,
|
|
"step": 556
|
|
},
|
|
{
|
|
"epoch": 4.156716417910448,
|
|
"grad_norm": 0.2399419262393838,
|
|
"learning_rate": 1.249481837798144e-06,
|
|
"loss": 0.360861212015152,
|
|
"num_tokens": 503186087.0,
|
|
"step": 557
|
|
},
|
|
{
|
|
"epoch": 4.164179104477612,
|
|
"grad_norm": 0.2356017748084062,
|
|
"learning_rate": 1.2418164869297353e-06,
|
|
"loss": 0.36369866132736206,
|
|
"num_tokens": 504097376.0,
|
|
"step": 558
|
|
},
|
|
{
|
|
"epoch": 4.1716417910447765,
|
|
"grad_norm": 0.239368624704367,
|
|
"learning_rate": 1.2342674881507327e-06,
|
|
"loss": 0.36475175619125366,
|
|
"num_tokens": 505048926.0,
|
|
"step": 559
|
|
},
|
|
{
|
|
"epoch": 4.17910447761194,
|
|
"grad_norm": 0.24555194813944806,
|
|
"learning_rate": 1.2268350477326073e-06,
|
|
"loss": 0.3852774500846863,
|
|
"num_tokens": 505967694.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 4.186567164179104,
|
|
"grad_norm": 0.24385261062576613,
|
|
"learning_rate": 1.2195193687619505e-06,
|
|
"loss": 0.3750133812427521,
|
|
"num_tokens": 506924348.0,
|
|
"step": 561
|
|
},
|
|
{
|
|
"epoch": 4.1940298507462686,
|
|
"grad_norm": 0.24733441550806298,
|
|
"learning_rate": 1.2123206511349212e-06,
|
|
"loss": 0.36548683047294617,
|
|
"num_tokens": 507837247.0,
|
|
"step": 562
|
|
},
|
|
{
|
|
"epoch": 4.201492537313433,
|
|
"grad_norm": 0.2626516276894915,
|
|
"learning_rate": 1.2052390915517881e-06,
|
|
"loss": 0.36941125988960266,
|
|
"num_tokens": 508615951.0,
|
|
"step": 563
|
|
},
|
|
{
|
|
"epoch": 4.208955223880597,
|
|
"grad_norm": 0.24609691004441409,
|
|
"learning_rate": 1.1982748835115512e-06,
|
|
"loss": 0.3862428665161133,
|
|
"num_tokens": 509598473.0,
|
|
"step": 564
|
|
},
|
|
{
|
|
"epoch": 4.2164179104477615,
|
|
"grad_norm": 0.24842515895556683,
|
|
"learning_rate": 1.1914282173066574e-06,
|
|
"loss": 0.38270822167396545,
|
|
"num_tokens": 510499495.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 4.223880597014926,
|
|
"grad_norm": 0.2407337171765148,
|
|
"learning_rate": 1.1846992800177979e-06,
|
|
"loss": 0.3664012551307678,
|
|
"num_tokens": 511393216.0,
|
|
"step": 566
|
|
},
|
|
{
|
|
"epoch": 4.231343283582089,
|
|
"grad_norm": 0.2442416141258047,
|
|
"learning_rate": 1.1780882555087988e-06,
|
|
"loss": 0.3886314034461975,
|
|
"num_tokens": 512343363.0,
|
|
"step": 567
|
|
},
|
|
{
|
|
"epoch": 4.2388059701492535,
|
|
"grad_norm": 0.2577619381883818,
|
|
"learning_rate": 1.1715953244215964e-06,
|
|
"loss": 0.3437773585319519,
|
|
"num_tokens": 513127609.0,
|
|
"step": 568
|
|
},
|
|
{
|
|
"epoch": 4.246268656716418,
|
|
"grad_norm": 0.25087871697950354,
|
|
"learning_rate": 1.165220664171302e-06,
|
|
"loss": 0.3734786808490753,
|
|
"num_tokens": 514033936.0,
|
|
"step": 569
|
|
},
|
|
{
|
|
"epoch": 4.253731343283582,
|
|
"grad_norm": 0.2392856334846873,
|
|
"learning_rate": 1.1589644489413516e-06,
|
|
"loss": 0.35015231370925903,
|
|
"num_tokens": 514934044.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 4.2611940298507465,
|
|
"grad_norm": 0.23533059380991045,
|
|
"learning_rate": 1.1528268496787498e-06,
|
|
"loss": 0.3818935453891754,
|
|
"num_tokens": 515909265.0,
|
|
"step": 571
|
|
},
|
|
{
|
|
"epoch": 4.268656716417911,
|
|
"grad_norm": 0.28002873497751246,
|
|
"learning_rate": 1.1468080340893958e-06,
|
|
"loss": 0.3613874316215515,
|
|
"num_tokens": 516712628.0,
|
|
"step": 572
|
|
},
|
|
{
|
|
"epoch": 4.276119402985074,
|
|
"grad_norm": 0.26573428139291055,
|
|
"learning_rate": 1.1409081666335035e-06,
|
|
"loss": 0.40466490387916565,
|
|
"num_tokens": 517664539.0,
|
|
"step": 573
|
|
},
|
|
{
|
|
"epoch": 4.2835820895522385,
|
|
"grad_norm": 0.2622221544713941,
|
|
"learning_rate": 1.1351274085211068e-06,
|
|
"loss": 0.36875689029693604,
|
|
"num_tokens": 518492097.0,
|
|
"step": 574
|
|
},
|
|
{
|
|
"epoch": 4.291044776119403,
|
|
"grad_norm": 0.8295997519231081,
|
|
"learning_rate": 1.1294659177076523e-06,
|
|
"loss": 0.343036413192749,
|
|
"num_tokens": 519432536.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 4.298507462686567,
|
|
"grad_norm": 0.26477934459538893,
|
|
"learning_rate": 1.1239238488896875e-06,
|
|
"loss": 0.39276033639907837,
|
|
"num_tokens": 520276253.0,
|
|
"step": 576
|
|
},
|
|
{
|
|
"epoch": 4.3059701492537314,
|
|
"grad_norm": 0.2751291575165678,
|
|
"learning_rate": 1.118501353500631e-06,
|
|
"loss": 0.36554020643234253,
|
|
"num_tokens": 521085557.0,
|
|
"step": 577
|
|
},
|
|
{
|
|
"epoch": 4.313432835820896,
|
|
"grad_norm": 0.26704770077542006,
|
|
"learning_rate": 1.1131985797066364e-06,
|
|
"loss": 0.39840590953826904,
|
|
"num_tokens": 521915761.0,
|
|
"step": 578
|
|
},
|
|
{
|
|
"epoch": 4.32089552238806,
|
|
"grad_norm": 0.267325084112826,
|
|
"learning_rate": 1.1080156724025409e-06,
|
|
"loss": 0.3594783842563629,
|
|
"num_tokens": 522783342.0,
|
|
"step": 579
|
|
},
|
|
{
|
|
"epoch": 4.3283582089552235,
|
|
"grad_norm": 0.23810536176661679,
|
|
"learning_rate": 1.1029527732079084e-06,
|
|
"loss": 0.37440672516822815,
|
|
"num_tokens": 523807264.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 4.335820895522388,
|
|
"grad_norm": 0.27369911060242186,
|
|
"learning_rate": 1.0980100204631604e-06,
|
|
"loss": 0.40351587533950806,
|
|
"num_tokens": 524601938.0,
|
|
"step": 581
|
|
},
|
|
{
|
|
"epoch": 4.343283582089552,
|
|
"grad_norm": 0.23536111609123755,
|
|
"learning_rate": 1.0931875492257946e-06,
|
|
"loss": 0.33745962381362915,
|
|
"num_tokens": 525537212.0,
|
|
"step": 582
|
|
},
|
|
{
|
|
"epoch": 4.350746268656716,
|
|
"grad_norm": 0.2600131581237491,
|
|
"learning_rate": 1.088485491266694e-06,
|
|
"loss": 0.38494178652763367,
|
|
"num_tokens": 526347121.0,
|
|
"step": 583
|
|
},
|
|
{
|
|
"epoch": 4.358208955223881,
|
|
"grad_norm": 0.23219951832538527,
|
|
"learning_rate": 1.0839039750665292e-06,
|
|
"loss": 0.35427361726760864,
|
|
"num_tokens": 527281437.0,
|
|
"step": 584
|
|
},
|
|
{
|
|
"epoch": 4.365671641791045,
|
|
"grad_norm": 0.2489391057817072,
|
|
"learning_rate": 1.079443125812243e-06,
|
|
"loss": 0.3624609708786011,
|
|
"num_tokens": 528208071.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 4.373134328358209,
|
|
"grad_norm": 0.2539695897127002,
|
|
"learning_rate": 1.0751030653936356e-06,
|
|
"loss": 0.3747778534889221,
|
|
"num_tokens": 529032089.0,
|
|
"step": 586
|
|
},
|
|
{
|
|
"epoch": 4.380597014925373,
|
|
"grad_norm": 0.2499880144186626,
|
|
"learning_rate": 1.0708839124000287e-06,
|
|
"loss": 0.38273054361343384,
|
|
"num_tokens": 529947287.0,
|
|
"step": 587
|
|
},
|
|
{
|
|
"epoch": 4.388059701492537,
|
|
"grad_norm": 0.2506974248310357,
|
|
"learning_rate": 1.0667857821170282e-06,
|
|
"loss": 0.3470362424850464,
|
|
"num_tokens": 530728896.0,
|
|
"step": 588
|
|
},
|
|
{
|
|
"epoch": 4.395522388059701,
|
|
"grad_norm": 0.24506418459436066,
|
|
"learning_rate": 1.0628087865233737e-06,
|
|
"loss": 0.35882338881492615,
|
|
"num_tokens": 531620091.0,
|
|
"step": 589
|
|
},
|
|
{
|
|
"epoch": 4.402985074626866,
|
|
"grad_norm": 0.24329483114740325,
|
|
"learning_rate": 1.058953034287877e-06,
|
|
"loss": 0.37174564599990845,
|
|
"num_tokens": 532460579.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 4.41044776119403,
|
|
"grad_norm": 0.23831984993388738,
|
|
"learning_rate": 1.0552186307664567e-06,
|
|
"loss": 0.363148033618927,
|
|
"num_tokens": 533351390.0,
|
|
"step": 591
|
|
},
|
|
{
|
|
"epoch": 4.417910447761194,
|
|
"grad_norm": 0.26162136426743393,
|
|
"learning_rate": 1.0516056779992543e-06,
|
|
"loss": 0.38013726472854614,
|
|
"num_tokens": 534195605.0,
|
|
"step": 592
|
|
},
|
|
{
|
|
"epoch": 4.425373134328359,
|
|
"grad_norm": 0.2635745464481523,
|
|
"learning_rate": 1.0481142747078494e-06,
|
|
"loss": 0.3700369596481323,
|
|
"num_tokens": 535033541.0,
|
|
"step": 593
|
|
},
|
|
{
|
|
"epoch": 4.432835820895522,
|
|
"grad_norm": 0.25007207032778783,
|
|
"learning_rate": 1.0447445162925614e-06,
|
|
"loss": 0.3790166974067688,
|
|
"num_tokens": 535964895.0,
|
|
"step": 594
|
|
},
|
|
{
|
|
"epoch": 4.440298507462686,
|
|
"grad_norm": 0.22799545701890034,
|
|
"learning_rate": 1.0414964948298436e-06,
|
|
"loss": 0.36508986353874207,
|
|
"num_tokens": 536941184.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 4.447761194029851,
|
|
"grad_norm": 0.23265306394886567,
|
|
"learning_rate": 1.0383702990697657e-06,
|
|
"loss": 0.3546326160430908,
|
|
"num_tokens": 537896596.0,
|
|
"step": 596
|
|
},
|
|
{
|
|
"epoch": 4.455223880597015,
|
|
"grad_norm": 0.2452826212608677,
|
|
"learning_rate": 1.0353660144335892e-06,
|
|
"loss": 0.3647281229496002,
|
|
"num_tokens": 538748931.0,
|
|
"step": 597
|
|
},
|
|
{
|
|
"epoch": 4.462686567164179,
|
|
"grad_norm": 0.24623855227956742,
|
|
"learning_rate": 1.0324837230114332e-06,
|
|
"loss": 0.3664322793483734,
|
|
"num_tokens": 539622406.0,
|
|
"step": 598
|
|
},
|
|
{
|
|
"epoch": 4.470149253731344,
|
|
"grad_norm": 0.24476867667376634,
|
|
"learning_rate": 1.0297235035600337e-06,
|
|
"loss": 0.35626494884490967,
|
|
"num_tokens": 540561688.0,
|
|
"step": 599
|
|
},
|
|
{
|
|
"epoch": 4.477611940298507,
|
|
"grad_norm": 0.22411638197357536,
|
|
"learning_rate": 1.0270854315005874e-06,
|
|
"loss": 0.3493247628211975,
|
|
"num_tokens": 541498885.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 4.485074626865671,
|
|
"grad_norm": 0.23854702147816884,
|
|
"learning_rate": 1.024569578916695e-06,
|
|
"loss": 0.36460673809051514,
|
|
"num_tokens": 542468798.0,
|
|
"step": 601
|
|
},
|
|
{
|
|
"epoch": 4.492537313432836,
|
|
"grad_norm": 0.24473776240066009,
|
|
"learning_rate": 1.0221760145523876e-06,
|
|
"loss": 0.3664558529853821,
|
|
"num_tokens": 543354992.0,
|
|
"step": 602
|
|
},
|
|
{
|
|
"epoch": 4.5,
|
|
"grad_norm": 0.3484100772975978,
|
|
"learning_rate": 1.0199048038102528e-06,
|
|
"loss": 0.3781493902206421,
|
|
"num_tokens": 544264190.0,
|
|
"step": 603
|
|
},
|
|
{
|
|
"epoch": 4.507462686567164,
|
|
"grad_norm": 0.23041088788536823,
|
|
"learning_rate": 1.0177560087496425e-06,
|
|
"loss": 0.36557939648628235,
|
|
"num_tokens": 545199765.0,
|
|
"step": 604
|
|
},
|
|
{
|
|
"epoch": 4.514925373134329,
|
|
"grad_norm": 0.26397201636028744,
|
|
"learning_rate": 1.0157296880849826e-06,
|
|
"loss": 0.39719897508621216,
|
|
"num_tokens": 546061065.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 4.522388059701493,
|
|
"grad_norm": 0.2510378043077616,
|
|
"learning_rate": 1.0138258971841642e-06,
|
|
"loss": 0.3602595925331116,
|
|
"num_tokens": 546928816.0,
|
|
"step": 606
|
|
},
|
|
{
|
|
"epoch": 4.529850746268656,
|
|
"grad_norm": 0.25217406420558186,
|
|
"learning_rate": 1.0120446880670326e-06,
|
|
"loss": 0.3766353130340576,
|
|
"num_tokens": 547847934.0,
|
|
"step": 607
|
|
},
|
|
{
|
|
"epoch": 4.537313432835821,
|
|
"grad_norm": 0.23959568238841403,
|
|
"learning_rate": 1.010386109403967e-06,
|
|
"loss": 0.3650025725364685,
|
|
"num_tokens": 548766636.0,
|
|
"step": 608
|
|
},
|
|
{
|
|
"epoch": 4.544776119402985,
|
|
"grad_norm": 0.2377901920772251,
|
|
"learning_rate": 1.008850206514547e-06,
|
|
"loss": 0.3625343143939972,
|
|
"num_tokens": 549661389.0,
|
|
"step": 609
|
|
},
|
|
{
|
|
"epoch": 4.552238805970149,
|
|
"grad_norm": 0.26122470845807755,
|
|
"learning_rate": 1.0074370213663202e-06,
|
|
"loss": 0.3682940602302551,
|
|
"num_tokens": 550430887.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 4.559701492537314,
|
|
"grad_norm": 0.2481365703161649,
|
|
"learning_rate": 1.0061465925736478e-06,
|
|
"loss": 0.36531317234039307,
|
|
"num_tokens": 551293916.0,
|
|
"step": 611
|
|
},
|
|
{
|
|
"epoch": 4.567164179104478,
|
|
"grad_norm": 0.23719670021949013,
|
|
"learning_rate": 1.004978955396657e-06,
|
|
"loss": 0.3669975996017456,
|
|
"num_tokens": 552281926.0,
|
|
"step": 612
|
|
},
|
|
{
|
|
"epoch": 4.574626865671641,
|
|
"grad_norm": 0.25803252973725255,
|
|
"learning_rate": 1.0039341417402715e-06,
|
|
"loss": 0.37066352367401123,
|
|
"num_tokens": 553148975.0,
|
|
"step": 613
|
|
},
|
|
{
|
|
"epoch": 4.582089552238806,
|
|
"grad_norm": 0.2476936983459798,
|
|
"learning_rate": 1.0030121801533442e-06,
|
|
"loss": 0.3824441134929657,
|
|
"num_tokens": 554068576.0,
|
|
"step": 614
|
|
},
|
|
{
|
|
"epoch": 4.58955223880597,
|
|
"grad_norm": 0.2489594826146839,
|
|
"learning_rate": 1.002213095827875e-06,
|
|
"loss": 0.3596557378768921,
|
|
"num_tokens": 554855138.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 4.597014925373134,
|
|
"grad_norm": 0.2550266059020853,
|
|
"learning_rate": 1.0015369105983218e-06,
|
|
"loss": 0.34850555658340454,
|
|
"num_tokens": 555783649.0,
|
|
"step": 616
|
|
},
|
|
{
|
|
"epoch": 4.604477611940299,
|
|
"grad_norm": 0.28933444541800885,
|
|
"learning_rate": 1.0009836429410053e-06,
|
|
"loss": 0.3593859076499939,
|
|
"num_tokens": 556756059.0,
|
|
"step": 617
|
|
},
|
|
{
|
|
"epoch": 4.611940298507463,
|
|
"grad_norm": 0.24100103005251267,
|
|
"learning_rate": 1.0005533079736037e-06,
|
|
"loss": 0.34157663583755493,
|
|
"num_tokens": 557624997.0,
|
|
"step": 618
|
|
},
|
|
{
|
|
"epoch": 4.619402985074627,
|
|
"grad_norm": 0.2434497947580223,
|
|
"learning_rate": 1.00024591745474e-06,
|
|
"loss": 0.35940393805503845,
|
|
"num_tokens": 558551462.0,
|
|
"step": 619
|
|
},
|
|
{
|
|
"epoch": 4.6268656716417915,
|
|
"grad_norm": 0.2334659825308566,
|
|
"learning_rate": 1.0000614797836587e-06,
|
|
"loss": 0.3954239785671234,
|
|
"num_tokens": 559571713.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 4.6268656716417915,
|
|
"step": 620,
|
|
"total_flos": 829937030004736.0,
|
|
"train_loss": 0.4202386662844689,
|
|
"train_runtime": 18585.0074,
|
|
"train_samples_per_second": 1.068,
|
|
"train_steps_per_second": 0.033
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 620,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 5,
|
|
"save_steps": 62,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 829937030004736.0,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|