Model: ali-elganzory/1.7b-Comma0.1-300BT-longsft_16k-SFT-Tulu3-decontaminated Source: Original Platform
14677 lines
409 KiB
JSON
14677 lines
409 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 2.0,
|
|
"eval_steps": 500,
|
|
"global_step": 14634,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 1.3484375,
|
|
"epoch": 0.0013666803334700013,
|
|
"grad_norm": 0.3553511020485123,
|
|
"learning_rate": 1.0227272727272728e-07,
|
|
"loss": 1.4813,
|
|
"mean_token_accuracy": 0.6670866370201111,
|
|
"num_tokens": 938571.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 1.334375,
|
|
"epoch": 0.0027333606669400026,
|
|
"grad_norm": 0.41676215147451645,
|
|
"learning_rate": 2.1590909090909094e-07,
|
|
"loss": 1.4271,
|
|
"mean_token_accuracy": 0.6821090459823609,
|
|
"num_tokens": 1829597.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 1.34609375,
|
|
"epoch": 0.004100041000410004,
|
|
"grad_norm": 0.38827723763105315,
|
|
"learning_rate": 3.2954545454545455e-07,
|
|
"loss": 1.4619,
|
|
"mean_token_accuracy": 0.6730497539043426,
|
|
"num_tokens": 2756097.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 1.38203125,
|
|
"epoch": 0.005466721333880005,
|
|
"grad_norm": 0.3676289466858475,
|
|
"learning_rate": 4.431818181818182e-07,
|
|
"loss": 1.4878,
|
|
"mean_token_accuracy": 0.6686018645763397,
|
|
"num_tokens": 3667157.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 1.31796875,
|
|
"epoch": 0.006833401667350007,
|
|
"grad_norm": 0.3706903769061633,
|
|
"learning_rate": 5.568181818181818e-07,
|
|
"loss": 1.4241,
|
|
"mean_token_accuracy": 0.6795761287212372,
|
|
"num_tokens": 4593212.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 1.36796875,
|
|
"epoch": 0.008200082000820008,
|
|
"grad_norm": 0.4277543541801501,
|
|
"learning_rate": 6.704545454545456e-07,
|
|
"loss": 1.4534,
|
|
"mean_token_accuracy": 0.6741535246372223,
|
|
"num_tokens": 5559539.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 1.3890625,
|
|
"epoch": 0.00956676233429001,
|
|
"grad_norm": 0.3520092976225818,
|
|
"learning_rate": 7.840909090909092e-07,
|
|
"loss": 1.4957,
|
|
"mean_token_accuracy": 0.6672939181327819,
|
|
"num_tokens": 6518318.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 1.42890625,
|
|
"epoch": 0.01093344266776001,
|
|
"grad_norm": 0.37206243858008203,
|
|
"learning_rate": 8.977272727272728e-07,
|
|
"loss": 1.5335,
|
|
"mean_token_accuracy": 0.6601896584033966,
|
|
"num_tokens": 7462414.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 1.3640625,
|
|
"epoch": 0.012300123001230012,
|
|
"grad_norm": 0.3211041865231172,
|
|
"learning_rate": 1.0113636363636365e-06,
|
|
"loss": 1.4587,
|
|
"mean_token_accuracy": 0.6714106917381286,
|
|
"num_tokens": 8328147.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 1.365625,
|
|
"epoch": 0.013666803334700014,
|
|
"grad_norm": 1.1454432905598246,
|
|
"learning_rate": 1.125e-06,
|
|
"loss": 1.4654,
|
|
"mean_token_accuracy": 0.6742521762847901,
|
|
"num_tokens": 9256532.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 1.340625,
|
|
"epoch": 0.015033483668170014,
|
|
"grad_norm": 0.28665169784933275,
|
|
"learning_rate": 1.2386363636363638e-06,
|
|
"loss": 1.4276,
|
|
"mean_token_accuracy": 0.6786331474781037,
|
|
"num_tokens": 10191350.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 1.34765625,
|
|
"epoch": 0.016400164001640016,
|
|
"grad_norm": 0.23402987773124723,
|
|
"learning_rate": 1.3522727272727273e-06,
|
|
"loss": 1.4523,
|
|
"mean_token_accuracy": 0.6754368424415589,
|
|
"num_tokens": 11096376.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 1.34453125,
|
|
"epoch": 0.017766844335110017,
|
|
"grad_norm": 0.4169285995612805,
|
|
"learning_rate": 1.465909090909091e-06,
|
|
"loss": 1.4274,
|
|
"mean_token_accuracy": 0.6787726759910584,
|
|
"num_tokens": 12025075.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 1.36796875,
|
|
"epoch": 0.01913352466858002,
|
|
"grad_norm": 0.2283354112165046,
|
|
"learning_rate": 1.5795454545454547e-06,
|
|
"loss": 1.4523,
|
|
"mean_token_accuracy": 0.6749060332775116,
|
|
"num_tokens": 12961701.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 1.38046875,
|
|
"epoch": 0.02050020500205002,
|
|
"grad_norm": 0.2334218135222127,
|
|
"learning_rate": 1.6931818181818182e-06,
|
|
"loss": 1.4685,
|
|
"mean_token_accuracy": 0.6732150495052338,
|
|
"num_tokens": 13905979.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 1.3546875,
|
|
"epoch": 0.02186688533552002,
|
|
"grad_norm": 0.23083400404682422,
|
|
"learning_rate": 1.8068181818181822e-06,
|
|
"loss": 1.4372,
|
|
"mean_token_accuracy": 0.6770615637302398,
|
|
"num_tokens": 14850397.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 1.38359375,
|
|
"epoch": 0.023233565668990024,
|
|
"grad_norm": 0.2475771048363106,
|
|
"learning_rate": 1.9204545454545457e-06,
|
|
"loss": 1.4649,
|
|
"mean_token_accuracy": 0.6742673635482788,
|
|
"num_tokens": 15703108.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 1.32578125,
|
|
"epoch": 0.024600246002460024,
|
|
"grad_norm": 0.18343771068820733,
|
|
"learning_rate": 2.034090909090909e-06,
|
|
"loss": 1.3826,
|
|
"mean_token_accuracy": 0.6823219656944275,
|
|
"num_tokens": 16615464.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 1.36796875,
|
|
"epoch": 0.025966926335930025,
|
|
"grad_norm": 0.19571774774713813,
|
|
"learning_rate": 2.147727272727273e-06,
|
|
"loss": 1.4208,
|
|
"mean_token_accuracy": 0.6731665015220643,
|
|
"num_tokens": 17581557.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 1.3875,
|
|
"epoch": 0.02733360666940003,
|
|
"grad_norm": 0.1788238120236529,
|
|
"learning_rate": 2.2613636363636366e-06,
|
|
"loss": 1.4525,
|
|
"mean_token_accuracy": 0.6719917476177215,
|
|
"num_tokens": 18451335.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 1.36953125,
|
|
"epoch": 0.02870028700287003,
|
|
"grad_norm": 0.18923236010224287,
|
|
"learning_rate": 2.375e-06,
|
|
"loss": 1.4302,
|
|
"mean_token_accuracy": 0.6714851379394531,
|
|
"num_tokens": 19392109.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 1.38671875,
|
|
"epoch": 0.03006696733634003,
|
|
"grad_norm": 0.1927601660619569,
|
|
"learning_rate": 2.488636363636364e-06,
|
|
"loss": 1.4304,
|
|
"mean_token_accuracy": 0.6752189695835114,
|
|
"num_tokens": 20323700.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 1.32109375,
|
|
"epoch": 0.03143364766981003,
|
|
"grad_norm": 0.16504608204715393,
|
|
"learning_rate": 2.6022727272727276e-06,
|
|
"loss": 1.362,
|
|
"mean_token_accuracy": 0.6865715384483337,
|
|
"num_tokens": 21263745.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 1.365625,
|
|
"epoch": 0.03280032800328003,
|
|
"grad_norm": 0.1873355115098459,
|
|
"learning_rate": 2.715909090909091e-06,
|
|
"loss": 1.398,
|
|
"mean_token_accuracy": 0.679420781135559,
|
|
"num_tokens": 22195458.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 1.36171875,
|
|
"epoch": 0.034167008336750036,
|
|
"grad_norm": 0.16098138186782374,
|
|
"learning_rate": 2.829545454545455e-06,
|
|
"loss": 1.3859,
|
|
"mean_token_accuracy": 0.6847734808921814,
|
|
"num_tokens": 23115090.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 1.3421875,
|
|
"epoch": 0.03553368867022003,
|
|
"grad_norm": 0.14911510394601135,
|
|
"learning_rate": 2.9431818181818185e-06,
|
|
"loss": 1.3754,
|
|
"mean_token_accuracy": 0.684462022781372,
|
|
"num_tokens": 24043510.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 1.3484375,
|
|
"epoch": 0.03690036900369004,
|
|
"grad_norm": 0.15811352000315643,
|
|
"learning_rate": 3.056818181818182e-06,
|
|
"loss": 1.3801,
|
|
"mean_token_accuracy": 0.6831223785877227,
|
|
"num_tokens": 24975455.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 1.34140625,
|
|
"epoch": 0.03826704933716004,
|
|
"grad_norm": 0.13293503805485782,
|
|
"learning_rate": 3.1704545454545456e-06,
|
|
"loss": 1.3494,
|
|
"mean_token_accuracy": 0.6882503688335418,
|
|
"num_tokens": 25915880.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 1.35703125,
|
|
"epoch": 0.03963372967063004,
|
|
"grad_norm": 0.1352443519572186,
|
|
"learning_rate": 3.2840909090909095e-06,
|
|
"loss": 1.3707,
|
|
"mean_token_accuracy": 0.6840597748756408,
|
|
"num_tokens": 26836649.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 1.3328125,
|
|
"epoch": 0.04100041000410004,
|
|
"grad_norm": 0.12407312578198194,
|
|
"learning_rate": 3.397727272727273e-06,
|
|
"loss": 1.3593,
|
|
"mean_token_accuracy": 0.6846068978309632,
|
|
"num_tokens": 27758649.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 1.37265625,
|
|
"epoch": 0.042367090337570044,
|
|
"grad_norm": 0.15015443199462075,
|
|
"learning_rate": 3.5113636363636365e-06,
|
|
"loss": 1.384,
|
|
"mean_token_accuracy": 0.6834948122501373,
|
|
"num_tokens": 28704182.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 1.33828125,
|
|
"epoch": 0.04373377067104004,
|
|
"grad_norm": 0.131675851616143,
|
|
"learning_rate": 3.625e-06,
|
|
"loss": 1.3443,
|
|
"mean_token_accuracy": 0.6862433016300201,
|
|
"num_tokens": 29605445.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 1.43984375,
|
|
"epoch": 0.045100451004510045,
|
|
"grad_norm": 0.13336649259966477,
|
|
"learning_rate": 3.7386363636363635e-06,
|
|
"loss": 1.4414,
|
|
"mean_token_accuracy": 0.6734611630439759,
|
|
"num_tokens": 30515940.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 1.3078125,
|
|
"epoch": 0.04646713133798005,
|
|
"grad_norm": 0.15319642986788068,
|
|
"learning_rate": 3.852272727272728e-06,
|
|
"loss": 1.3086,
|
|
"mean_token_accuracy": 0.6947275221347808,
|
|
"num_tokens": 31408267.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 1.2828125,
|
|
"epoch": 0.047833811671450045,
|
|
"grad_norm": 0.13765286915346087,
|
|
"learning_rate": 3.965909090909091e-06,
|
|
"loss": 1.2859,
|
|
"mean_token_accuracy": 0.6990822076797485,
|
|
"num_tokens": 32349688.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 1.35859375,
|
|
"epoch": 0.04920049200492005,
|
|
"grad_norm": 0.11858761557418658,
|
|
"learning_rate": 4.079545454545455e-06,
|
|
"loss": 1.3735,
|
|
"mean_token_accuracy": 0.6806787550449371,
|
|
"num_tokens": 33292692.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 1.3734375,
|
|
"epoch": 0.05056717233839005,
|
|
"grad_norm": 0.14384689161839317,
|
|
"learning_rate": 4.193181818181819e-06,
|
|
"loss": 1.3796,
|
|
"mean_token_accuracy": 0.6825670182704926,
|
|
"num_tokens": 34223187.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 1.38828125,
|
|
"epoch": 0.05193385267186005,
|
|
"grad_norm": 0.1460231877683502,
|
|
"learning_rate": 4.306818181818182e-06,
|
|
"loss": 1.4013,
|
|
"mean_token_accuracy": 0.6782120883464813,
|
|
"num_tokens": 35129123.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 1.3109375,
|
|
"epoch": 0.05330053300533005,
|
|
"grad_norm": 0.12115863288975269,
|
|
"learning_rate": 4.420454545454546e-06,
|
|
"loss": 1.3233,
|
|
"mean_token_accuracy": 0.6924657225608826,
|
|
"num_tokens": 36084818.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 1.31484375,
|
|
"epoch": 0.05466721333880006,
|
|
"grad_norm": 0.13446456858745237,
|
|
"learning_rate": 4.53409090909091e-06,
|
|
"loss": 1.3066,
|
|
"mean_token_accuracy": 0.6961567819118499,
|
|
"num_tokens": 36969815.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 1.3875,
|
|
"epoch": 0.05603389367227005,
|
|
"grad_norm": 0.1549229688590677,
|
|
"learning_rate": 4.647727272727273e-06,
|
|
"loss": 1.3966,
|
|
"mean_token_accuracy": 0.6786888539791107,
|
|
"num_tokens": 37874200.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 1.31953125,
|
|
"epoch": 0.05740057400574006,
|
|
"grad_norm": 0.1263890792464901,
|
|
"learning_rate": 4.761363636363637e-06,
|
|
"loss": 1.3357,
|
|
"mean_token_accuracy": 0.69099280834198,
|
|
"num_tokens": 38814930.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 1.30625,
|
|
"epoch": 0.05876725433921006,
|
|
"grad_norm": 0.14044630286786208,
|
|
"learning_rate": 4.875e-06,
|
|
"loss": 1.3189,
|
|
"mean_token_accuracy": 0.692230498790741,
|
|
"num_tokens": 39757293.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 1.34453125,
|
|
"epoch": 0.06013393467268006,
|
|
"grad_norm": 0.13170160638780798,
|
|
"learning_rate": 4.988636363636364e-06,
|
|
"loss": 1.3546,
|
|
"mean_token_accuracy": 0.6853543221950531,
|
|
"num_tokens": 40713143.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 1.3609375,
|
|
"epoch": 0.06150061500615006,
|
|
"grad_norm": 0.12176289062599722,
|
|
"learning_rate": 4.996829646329435e-06,
|
|
"loss": 1.3707,
|
|
"mean_token_accuracy": 0.6822915613651276,
|
|
"num_tokens": 41638884.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 1.39921875,
|
|
"epoch": 0.06286729533962006,
|
|
"grad_norm": 0.11941016686304397,
|
|
"learning_rate": 4.993307031139919e-06,
|
|
"loss": 1.4051,
|
|
"mean_token_accuracy": 0.6793888628482818,
|
|
"num_tokens": 42583453.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 1.3484375,
|
|
"epoch": 0.06423397567309007,
|
|
"grad_norm": 0.12195170299979932,
|
|
"learning_rate": 4.989784415950402e-06,
|
|
"loss": 1.3589,
|
|
"mean_token_accuracy": 0.6847981810569763,
|
|
"num_tokens": 43522757.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 1.41015625,
|
|
"epoch": 0.06560065600656007,
|
|
"grad_norm": 0.1321957922757234,
|
|
"learning_rate": 4.986261800760885e-06,
|
|
"loss": 1.4288,
|
|
"mean_token_accuracy": 0.6740886211395264,
|
|
"num_tokens": 44444147.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 1.3328125,
|
|
"epoch": 0.06696733634003006,
|
|
"grad_norm": 0.12379289938783294,
|
|
"learning_rate": 4.9827391855713685e-06,
|
|
"loss": 1.3393,
|
|
"mean_token_accuracy": 0.6921751618385314,
|
|
"num_tokens": 45400305.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 1.34375,
|
|
"epoch": 0.06833401667350007,
|
|
"grad_norm": 0.12971039337503745,
|
|
"learning_rate": 4.979216570381852e-06,
|
|
"loss": 1.3353,
|
|
"mean_token_accuracy": 0.6869662821292877,
|
|
"num_tokens": 46325614.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 1.340625,
|
|
"epoch": 0.06970069700697007,
|
|
"grad_norm": 0.12071527331391309,
|
|
"learning_rate": 4.975693955192335e-06,
|
|
"loss": 1.3532,
|
|
"mean_token_accuracy": 0.6832802414894104,
|
|
"num_tokens": 47238170.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 1.365625,
|
|
"epoch": 0.07106737734044007,
|
|
"grad_norm": 0.12944203380757524,
|
|
"learning_rate": 4.972171340002819e-06,
|
|
"loss": 1.3736,
|
|
"mean_token_accuracy": 0.6836429715156556,
|
|
"num_tokens": 48194911.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 1.3734375,
|
|
"epoch": 0.07243405767391008,
|
|
"grad_norm": 0.13651735508497456,
|
|
"learning_rate": 4.968648724813302e-06,
|
|
"loss": 1.3795,
|
|
"mean_token_accuracy": 0.6829518437385559,
|
|
"num_tokens": 49112005.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 1.32578125,
|
|
"epoch": 0.07380073800738007,
|
|
"grad_norm": 0.13093967026509584,
|
|
"learning_rate": 4.965126109623785e-06,
|
|
"loss": 1.3384,
|
|
"mean_token_accuracy": 0.6877105534076691,
|
|
"num_tokens": 50015461.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 1.3484375,
|
|
"epoch": 0.07516741834085007,
|
|
"grad_norm": 0.13045175940037865,
|
|
"learning_rate": 4.961603494434268e-06,
|
|
"loss": 1.3583,
|
|
"mean_token_accuracy": 0.6861937403678894,
|
|
"num_tokens": 50935643.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 1.31015625,
|
|
"epoch": 0.07653409867432008,
|
|
"grad_norm": 0.14108184250682235,
|
|
"learning_rate": 4.958080879244752e-06,
|
|
"loss": 1.3162,
|
|
"mean_token_accuracy": 0.6916009187698364,
|
|
"num_tokens": 51878002.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 1.31015625,
|
|
"epoch": 0.07790077900779008,
|
|
"grad_norm": 0.13859153497962068,
|
|
"learning_rate": 4.954558264055234e-06,
|
|
"loss": 1.321,
|
|
"mean_token_accuracy": 0.689879196882248,
|
|
"num_tokens": 52758343.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 1.29453125,
|
|
"epoch": 0.07926745934126007,
|
|
"grad_norm": 0.14096188125155884,
|
|
"learning_rate": 4.951035648865719e-06,
|
|
"loss": 1.2857,
|
|
"mean_token_accuracy": 0.6996366500854492,
|
|
"num_tokens": 53650442.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 1.30546875,
|
|
"epoch": 0.08063413967473008,
|
|
"grad_norm": 0.204190344724878,
|
|
"learning_rate": 4.9475130336762015e-06,
|
|
"loss": 1.3131,
|
|
"mean_token_accuracy": 0.6913063168525696,
|
|
"num_tokens": 54555977.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 1.35,
|
|
"epoch": 0.08200082000820008,
|
|
"grad_norm": 0.14013384944296453,
|
|
"learning_rate": 4.943990418486685e-06,
|
|
"loss": 1.3531,
|
|
"mean_token_accuracy": 0.6833755731582641,
|
|
"num_tokens": 55522236.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 1.33671875,
|
|
"epoch": 0.08336750034167008,
|
|
"grad_norm": 0.12931684169206636,
|
|
"learning_rate": 4.9404678032971685e-06,
|
|
"loss": 1.342,
|
|
"mean_token_accuracy": 0.689514946937561,
|
|
"num_tokens": 56427197.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 1.28984375,
|
|
"epoch": 0.08473418067514009,
|
|
"grad_norm": 0.1272204131256775,
|
|
"learning_rate": 4.936945188107651e-06,
|
|
"loss": 1.3099,
|
|
"mean_token_accuracy": 0.6964854538440705,
|
|
"num_tokens": 57362921.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 1.2703125,
|
|
"epoch": 0.08610086100861009,
|
|
"grad_norm": 0.11740301191337443,
|
|
"learning_rate": 4.933422572918135e-06,
|
|
"loss": 1.2662,
|
|
"mean_token_accuracy": 0.7020695030689239,
|
|
"num_tokens": 58256680.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 1.30546875,
|
|
"epoch": 0.08746754134208008,
|
|
"grad_norm": 0.16338639851433615,
|
|
"learning_rate": 4.929899957728618e-06,
|
|
"loss": 1.301,
|
|
"mean_token_accuracy": 0.6962743699550629,
|
|
"num_tokens": 59193508.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 1.3125,
|
|
"epoch": 0.08883422167555009,
|
|
"grad_norm": 0.14196587321639417,
|
|
"learning_rate": 4.926377342539102e-06,
|
|
"loss": 1.3045,
|
|
"mean_token_accuracy": 0.6936445236206055,
|
|
"num_tokens": 60108420.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 1.28125,
|
|
"epoch": 0.09020090200902009,
|
|
"grad_norm": 0.16033955782017392,
|
|
"learning_rate": 4.922854727349585e-06,
|
|
"loss": 1.2734,
|
|
"mean_token_accuracy": 0.6999214291572571,
|
|
"num_tokens": 61034298.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 1.32578125,
|
|
"epoch": 0.09156758234249009,
|
|
"grad_norm": 0.11805863195379233,
|
|
"learning_rate": 4.919332112160068e-06,
|
|
"loss": 1.3253,
|
|
"mean_token_accuracy": 0.6931415498256683,
|
|
"num_tokens": 61983537.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 1.3,
|
|
"epoch": 0.0929342626759601,
|
|
"grad_norm": 0.12276586556431372,
|
|
"learning_rate": 4.915809496970551e-06,
|
|
"loss": 1.3093,
|
|
"mean_token_accuracy": 0.6931610465049743,
|
|
"num_tokens": 62935993.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 1.40078125,
|
|
"epoch": 0.0943009430094301,
|
|
"grad_norm": 0.12716474862618415,
|
|
"learning_rate": 4.912286881781035e-06,
|
|
"loss": 1.4189,
|
|
"mean_token_accuracy": 0.6733922719955444,
|
|
"num_tokens": 63853924.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 1.30390625,
|
|
"epoch": 0.09566762334290009,
|
|
"grad_norm": 0.1271507194379322,
|
|
"learning_rate": 4.908764266591518e-06,
|
|
"loss": 1.3013,
|
|
"mean_token_accuracy": 0.6941017091274262,
|
|
"num_tokens": 64763931.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 1.3203125,
|
|
"epoch": 0.0970343036763701,
|
|
"grad_norm": 0.1327580078864058,
|
|
"learning_rate": 4.9052416514020015e-06,
|
|
"loss": 1.3185,
|
|
"mean_token_accuracy": 0.6924223661422729,
|
|
"num_tokens": 65673140.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 1.27734375,
|
|
"epoch": 0.0984009840098401,
|
|
"grad_norm": 0.14705901203212612,
|
|
"learning_rate": 4.901719036212484e-06,
|
|
"loss": 1.2851,
|
|
"mean_token_accuracy": 0.6969168603420257,
|
|
"num_tokens": 66640391.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 1.27578125,
|
|
"epoch": 0.0997676643433101,
|
|
"grad_norm": 0.1194207197951316,
|
|
"learning_rate": 4.898196421022968e-06,
|
|
"loss": 1.2872,
|
|
"mean_token_accuracy": 0.6998118221759796,
|
|
"num_tokens": 67563282.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 1.27109375,
|
|
"epoch": 0.1011343446767801,
|
|
"grad_norm": 0.14367212749072983,
|
|
"learning_rate": 4.894673805833451e-06,
|
|
"loss": 1.2567,
|
|
"mean_token_accuracy": 0.7024235129356384,
|
|
"num_tokens": 68416165.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 1.3046875,
|
|
"epoch": 0.1025010250102501,
|
|
"grad_norm": 0.11884771534095487,
|
|
"learning_rate": 4.891151190643935e-06,
|
|
"loss": 1.3076,
|
|
"mean_token_accuracy": 0.695323783159256,
|
|
"num_tokens": 69336094.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 1.30234375,
|
|
"epoch": 0.1038677053437201,
|
|
"grad_norm": 0.11228796055390164,
|
|
"learning_rate": 4.8876285754544175e-06,
|
|
"loss": 1.3043,
|
|
"mean_token_accuracy": 0.6938041388988495,
|
|
"num_tokens": 70292744.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 1.3546875,
|
|
"epoch": 0.10523438567719011,
|
|
"grad_norm": 0.1302954208542776,
|
|
"learning_rate": 4.884105960264901e-06,
|
|
"loss": 1.3668,
|
|
"mean_token_accuracy": 0.6848212838172912,
|
|
"num_tokens": 71192370.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 1.29765625,
|
|
"epoch": 0.1066010660106601,
|
|
"grad_norm": 0.1188469139995681,
|
|
"learning_rate": 4.880583345075385e-06,
|
|
"loss": 1.2903,
|
|
"mean_token_accuracy": 0.6975840568542481,
|
|
"num_tokens": 72109962.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 1.25625,
|
|
"epoch": 0.1079677463441301,
|
|
"grad_norm": 0.11755504459225453,
|
|
"learning_rate": 4.877060729885867e-06,
|
|
"loss": 1.2585,
|
|
"mean_token_accuracy": 0.7007296323776245,
|
|
"num_tokens": 73022537.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 1.2828125,
|
|
"epoch": 0.10933442667760011,
|
|
"grad_norm": 0.1247791894554745,
|
|
"learning_rate": 4.873538114696351e-06,
|
|
"loss": 1.2849,
|
|
"mean_token_accuracy": 0.6980208337306977,
|
|
"num_tokens": 73932084.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 1.29921875,
|
|
"epoch": 0.11070110701107011,
|
|
"grad_norm": 0.12047063050592452,
|
|
"learning_rate": 4.870015499506834e-06,
|
|
"loss": 1.3034,
|
|
"mean_token_accuracy": 0.6971640527248383,
|
|
"num_tokens": 74843608.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 1.30390625,
|
|
"epoch": 0.1120677873445401,
|
|
"grad_norm": 0.10350331565870703,
|
|
"learning_rate": 4.866492884317318e-06,
|
|
"loss": 1.3023,
|
|
"mean_token_accuracy": 0.6938735663890838,
|
|
"num_tokens": 75769415.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 1.26171875,
|
|
"epoch": 0.11343446767801012,
|
|
"grad_norm": 0.11501872297069834,
|
|
"learning_rate": 4.862970269127801e-06,
|
|
"loss": 1.2691,
|
|
"mean_token_accuracy": 0.7039656102657318,
|
|
"num_tokens": 76676625.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 1.31796875,
|
|
"epoch": 0.11480114801148011,
|
|
"grad_norm": 0.13303630393670696,
|
|
"learning_rate": 4.859447653938284e-06,
|
|
"loss": 1.3109,
|
|
"mean_token_accuracy": 0.6923641681671142,
|
|
"num_tokens": 77575710.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 1.303125,
|
|
"epoch": 0.11616782834495011,
|
|
"grad_norm": 0.11134732499782132,
|
|
"learning_rate": 4.855925038748768e-06,
|
|
"loss": 1.3211,
|
|
"mean_token_accuracy": 0.6928994297981262,
|
|
"num_tokens": 78573992.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 1.31015625,
|
|
"epoch": 0.11753450867842012,
|
|
"grad_norm": 0.11473962705778819,
|
|
"learning_rate": 4.852402423559251e-06,
|
|
"loss": 1.3177,
|
|
"mean_token_accuracy": 0.6933590352535248,
|
|
"num_tokens": 79532329.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 1.32578125,
|
|
"epoch": 0.11890118901189012,
|
|
"grad_norm": 0.12078576012109726,
|
|
"learning_rate": 4.848879808369734e-06,
|
|
"loss": 1.3394,
|
|
"mean_token_accuracy": 0.6904009819030762,
|
|
"num_tokens": 80438221.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 1.27890625,
|
|
"epoch": 0.12026786934536011,
|
|
"grad_norm": 0.11105898888784467,
|
|
"learning_rate": 4.8453571931802175e-06,
|
|
"loss": 1.2609,
|
|
"mean_token_accuracy": 0.7035066068172455,
|
|
"num_tokens": 81376762.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 1.24453125,
|
|
"epoch": 0.12163454967883013,
|
|
"grad_norm": 0.1087772368483006,
|
|
"learning_rate": 4.8418345779907e-06,
|
|
"loss": 1.246,
|
|
"mean_token_accuracy": 0.7066019833087921,
|
|
"num_tokens": 82272468.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 1.33828125,
|
|
"epoch": 0.12300123001230012,
|
|
"grad_norm": 0.15051539253738397,
|
|
"learning_rate": 4.838311962801184e-06,
|
|
"loss": 1.349,
|
|
"mean_token_accuracy": 0.6886163592338562,
|
|
"num_tokens": 83215846.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 1.29296875,
|
|
"epoch": 0.12436791034577012,
|
|
"grad_norm": 0.11886626305325909,
|
|
"learning_rate": 4.834789347611667e-06,
|
|
"loss": 1.2985,
|
|
"mean_token_accuracy": 0.6966093182563782,
|
|
"num_tokens": 84093524.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 1.3140625,
|
|
"epoch": 0.12573459067924012,
|
|
"grad_norm": 0.12070029593997376,
|
|
"learning_rate": 4.831266732422151e-06,
|
|
"loss": 1.3076,
|
|
"mean_token_accuracy": 0.6948084533214569,
|
|
"num_tokens": 84978921.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 0.12710127101271013,
|
|
"grad_norm": 0.1248477679440931,
|
|
"learning_rate": 4.827744117232634e-06,
|
|
"loss": 1.212,
|
|
"mean_token_accuracy": 0.7120832681655884,
|
|
"num_tokens": 85868292.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 1.2609375,
|
|
"epoch": 0.12846795134618014,
|
|
"grad_norm": 0.12906423948653598,
|
|
"learning_rate": 4.824221502043117e-06,
|
|
"loss": 1.2539,
|
|
"mean_token_accuracy": 0.7032524287700653,
|
|
"num_tokens": 86759570.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 1.31171875,
|
|
"epoch": 0.12983463167965012,
|
|
"grad_norm": 0.13716779878502058,
|
|
"learning_rate": 4.820698886853601e-06,
|
|
"loss": 1.3127,
|
|
"mean_token_accuracy": 0.6902301728725433,
|
|
"num_tokens": 87742990.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 1.3109375,
|
|
"epoch": 0.13120131201312013,
|
|
"grad_norm": 0.11416191383169672,
|
|
"learning_rate": 4.817176271664084e-06,
|
|
"loss": 1.3114,
|
|
"mean_token_accuracy": 0.6916352033615112,
|
|
"num_tokens": 88685843.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 1.24921875,
|
|
"epoch": 0.13256799234659014,
|
|
"grad_norm": 0.11099603709307587,
|
|
"learning_rate": 4.813653656474567e-06,
|
|
"loss": 1.2505,
|
|
"mean_token_accuracy": 0.7030920267105103,
|
|
"num_tokens": 89560923.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 1.29765625,
|
|
"epoch": 0.13393467268006012,
|
|
"grad_norm": 0.13007455212061048,
|
|
"learning_rate": 4.8101310412850505e-06,
|
|
"loss": 1.3112,
|
|
"mean_token_accuracy": 0.6919164538383484,
|
|
"num_tokens": 90477434.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 1.25703125,
|
|
"epoch": 0.13530135301353013,
|
|
"grad_norm": 0.11632252733821648,
|
|
"learning_rate": 4.806608426095534e-06,
|
|
"loss": 1.27,
|
|
"mean_token_accuracy": 0.7020042061805725,
|
|
"num_tokens": 91445593.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 1.2625,
|
|
"epoch": 0.13666803334700015,
|
|
"grad_norm": 0.1263951215895295,
|
|
"learning_rate": 4.803085810906017e-06,
|
|
"loss": 1.2607,
|
|
"mean_token_accuracy": 0.7023445963859558,
|
|
"num_tokens": 92372246.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 1.334375,
|
|
"epoch": 0.13803471368047013,
|
|
"grad_norm": 0.14190019680418903,
|
|
"learning_rate": 4.7995631957165e-06,
|
|
"loss": 1.344,
|
|
"mean_token_accuracy": 0.6860853850841522,
|
|
"num_tokens": 93322975.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 1.2328125,
|
|
"epoch": 0.13940139401394014,
|
|
"grad_norm": 0.11280175988668857,
|
|
"learning_rate": 4.796040580526984e-06,
|
|
"loss": 1.2374,
|
|
"mean_token_accuracy": 0.7067894995212555,
|
|
"num_tokens": 94243005.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 1.27265625,
|
|
"epoch": 0.14076807434741015,
|
|
"grad_norm": 0.13919755415787416,
|
|
"learning_rate": 4.792517965337467e-06,
|
|
"loss": 1.2766,
|
|
"mean_token_accuracy": 0.7000946283340455,
|
|
"num_tokens": 95128367.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 1.25546875,
|
|
"epoch": 0.14213475468088013,
|
|
"grad_norm": 0.12992520043978573,
|
|
"learning_rate": 4.78899535014795e-06,
|
|
"loss": 1.2638,
|
|
"mean_token_accuracy": 0.7026342749595642,
|
|
"num_tokens": 96052977.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 1.2625,
|
|
"epoch": 0.14350143501435014,
|
|
"grad_norm": 0.12524312047327435,
|
|
"learning_rate": 4.785472734958434e-06,
|
|
"loss": 1.2639,
|
|
"mean_token_accuracy": 0.7012313544750214,
|
|
"num_tokens": 96997179.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 1.2890625,
|
|
"epoch": 0.14486811534782015,
|
|
"grad_norm": 0.11277201461606566,
|
|
"learning_rate": 4.781950119768916e-06,
|
|
"loss": 1.2957,
|
|
"mean_token_accuracy": 0.6959935128688812,
|
|
"num_tokens": 97934565.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 1.24453125,
|
|
"epoch": 0.14623479568129014,
|
|
"grad_norm": 0.1087881262038136,
|
|
"learning_rate": 4.778427504579401e-06,
|
|
"loss": 1.2252,
|
|
"mean_token_accuracy": 0.7089367628097534,
|
|
"num_tokens": 98864499.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 1.23125,
|
|
"epoch": 0.14760147601476015,
|
|
"grad_norm": 0.11048000163644099,
|
|
"learning_rate": 4.774904889389883e-06,
|
|
"loss": 1.2327,
|
|
"mean_token_accuracy": 0.7064364135265351,
|
|
"num_tokens": 99822130.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 1.2484375,
|
|
"epoch": 0.14896815634823016,
|
|
"grad_norm": 0.12760126749792125,
|
|
"learning_rate": 4.771382274200367e-06,
|
|
"loss": 1.2582,
|
|
"mean_token_accuracy": 0.7050937056541443,
|
|
"num_tokens": 100744706.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 1.3015625,
|
|
"epoch": 0.15033483668170014,
|
|
"grad_norm": 0.11081065704496072,
|
|
"learning_rate": 4.76785965901085e-06,
|
|
"loss": 1.3056,
|
|
"mean_token_accuracy": 0.6932923972606659,
|
|
"num_tokens": 101705020.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 1.32578125,
|
|
"epoch": 0.15170151701517015,
|
|
"grad_norm": 0.12868107161414996,
|
|
"learning_rate": 4.764337043821333e-06,
|
|
"loss": 1.3346,
|
|
"mean_token_accuracy": 0.6890910029411316,
|
|
"num_tokens": 102616456.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 1.29453125,
|
|
"epoch": 0.15306819734864016,
|
|
"grad_norm": 0.11150333177991506,
|
|
"learning_rate": 4.760814428631817e-06,
|
|
"loss": 1.294,
|
|
"mean_token_accuracy": 0.6964092493057251,
|
|
"num_tokens": 103534288.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 1.27109375,
|
|
"epoch": 0.15443487768211014,
|
|
"grad_norm": 0.11395182418147555,
|
|
"learning_rate": 4.7572918134423e-06,
|
|
"loss": 1.2661,
|
|
"mean_token_accuracy": 0.7006071031093597,
|
|
"num_tokens": 104412112.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 0.15580155801558015,
|
|
"grad_norm": 0.10206897883030548,
|
|
"learning_rate": 4.753769198252783e-06,
|
|
"loss": 1.23,
|
|
"mean_token_accuracy": 0.707980090379715,
|
|
"num_tokens": 105339912.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 1.3296875,
|
|
"epoch": 0.15716823834905017,
|
|
"grad_norm": 0.12013694549404241,
|
|
"learning_rate": 4.7502465830632665e-06,
|
|
"loss": 1.3339,
|
|
"mean_token_accuracy": 0.6892430305480957,
|
|
"num_tokens": 106300822.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 0.15853491868252015,
|
|
"grad_norm": 0.10629855991551682,
|
|
"learning_rate": 4.74672396787375e-06,
|
|
"loss": 1.2165,
|
|
"mean_token_accuracy": 0.7117327511310577,
|
|
"num_tokens": 107234943.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 1.24453125,
|
|
"epoch": 0.15990159901599016,
|
|
"grad_norm": 0.13391164696644373,
|
|
"learning_rate": 4.743201352684233e-06,
|
|
"loss": 1.241,
|
|
"mean_token_accuracy": 0.7068231999874115,
|
|
"num_tokens": 108169288.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 1.26484375,
|
|
"epoch": 0.16126827934946017,
|
|
"grad_norm": 0.11851445139651404,
|
|
"learning_rate": 4.739678737494716e-06,
|
|
"loss": 1.2702,
|
|
"mean_token_accuracy": 0.704141891002655,
|
|
"num_tokens": 109066252.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 1.28359375,
|
|
"epoch": 0.16263495968293015,
|
|
"grad_norm": 0.1156615819492856,
|
|
"learning_rate": 4.7361561223052e-06,
|
|
"loss": 1.2834,
|
|
"mean_token_accuracy": 0.6970308601856232,
|
|
"num_tokens": 109981241.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 1.23984375,
|
|
"epoch": 0.16400164001640016,
|
|
"grad_norm": 0.13492370660596423,
|
|
"learning_rate": 4.7326335071156834e-06,
|
|
"loss": 1.2436,
|
|
"mean_token_accuracy": 0.7074066817760467,
|
|
"num_tokens": 110887025.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 1.2734375,
|
|
"epoch": 0.16536832034987017,
|
|
"grad_norm": 0.1116127229322019,
|
|
"learning_rate": 4.729110891926166e-06,
|
|
"loss": 1.286,
|
|
"mean_token_accuracy": 0.6975305855274201,
|
|
"num_tokens": 111835136.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 1.2515625,
|
|
"epoch": 0.16673500068334016,
|
|
"grad_norm": 0.14866799896078606,
|
|
"learning_rate": 4.72558827673665e-06,
|
|
"loss": 1.2615,
|
|
"mean_token_accuracy": 0.7023941457271576,
|
|
"num_tokens": 112715049.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 1.2625,
|
|
"epoch": 0.16810168101681017,
|
|
"grad_norm": 0.1160258008286987,
|
|
"learning_rate": 4.722065661547132e-06,
|
|
"loss": 1.2783,
|
|
"mean_token_accuracy": 0.7016432762145997,
|
|
"num_tokens": 113645212.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 1.28671875,
|
|
"epoch": 0.16946836135028018,
|
|
"grad_norm": 0.11569753921609605,
|
|
"learning_rate": 4.718543046357617e-06,
|
|
"loss": 1.285,
|
|
"mean_token_accuracy": 0.6979293763637543,
|
|
"num_tokens": 114607757.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 1.2203125,
|
|
"epoch": 0.17083504168375016,
|
|
"grad_norm": 0.1172963889262354,
|
|
"learning_rate": 4.7150204311680995e-06,
|
|
"loss": 1.2285,
|
|
"mean_token_accuracy": 0.7068657755851746,
|
|
"num_tokens": 115481536.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 1.27421875,
|
|
"epoch": 0.17220172201722017,
|
|
"grad_norm": 0.11342748865648057,
|
|
"learning_rate": 4.711497815978583e-06,
|
|
"loss": 1.2766,
|
|
"mean_token_accuracy": 0.7005185186862946,
|
|
"num_tokens": 116398355.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 1.278125,
|
|
"epoch": 0.17356840235069018,
|
|
"grad_norm": 0.11947925448919298,
|
|
"learning_rate": 4.707975200789066e-06,
|
|
"loss": 1.2957,
|
|
"mean_token_accuracy": 0.6954469442367553,
|
|
"num_tokens": 117363895.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 1.27578125,
|
|
"epoch": 0.17493508268416016,
|
|
"grad_norm": 0.1282637345548867,
|
|
"learning_rate": 4.704452585599549e-06,
|
|
"loss": 1.2836,
|
|
"mean_token_accuracy": 0.6973258078098297,
|
|
"num_tokens": 118281157.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 1.24609375,
|
|
"epoch": 0.17630176301763018,
|
|
"grad_norm": 0.11197805928601094,
|
|
"learning_rate": 4.700929970410033e-06,
|
|
"loss": 1.2509,
|
|
"mean_token_accuracy": 0.7049233853816986,
|
|
"num_tokens": 119199903.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 1.24375,
|
|
"epoch": 0.17766844335110019,
|
|
"grad_norm": 0.12328082379637305,
|
|
"learning_rate": 4.697407355220516e-06,
|
|
"loss": 1.2394,
|
|
"mean_token_accuracy": 0.7076999723911286,
|
|
"num_tokens": 120153384.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 1.24609375,
|
|
"epoch": 0.17903512368457017,
|
|
"grad_norm": 0.12143629493704232,
|
|
"learning_rate": 4.693884740030999e-06,
|
|
"loss": 1.2351,
|
|
"mean_token_accuracy": 0.7040312051773071,
|
|
"num_tokens": 121028690.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 1.28671875,
|
|
"epoch": 0.18040180401804018,
|
|
"grad_norm": 0.1232323997568802,
|
|
"learning_rate": 4.690362124841483e-06,
|
|
"loss": 1.286,
|
|
"mean_token_accuracy": 0.699395877122879,
|
|
"num_tokens": 121950665.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 1.26875,
|
|
"epoch": 0.1817684843515102,
|
|
"grad_norm": 0.10852095143498845,
|
|
"learning_rate": 4.686839509651966e-06,
|
|
"loss": 1.2781,
|
|
"mean_token_accuracy": 0.6978210031986236,
|
|
"num_tokens": 122875209.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 1.27265625,
|
|
"epoch": 0.18313516468498017,
|
|
"grad_norm": 0.12109552170766562,
|
|
"learning_rate": 4.683316894462449e-06,
|
|
"loss": 1.2679,
|
|
"mean_token_accuracy": 0.6996523320674897,
|
|
"num_tokens": 123817600.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 1.2515625,
|
|
"epoch": 0.18450184501845018,
|
|
"grad_norm": 0.11392837205490931,
|
|
"learning_rate": 4.679794279272933e-06,
|
|
"loss": 1.2745,
|
|
"mean_token_accuracy": 0.6993427574634552,
|
|
"num_tokens": 124741536.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 1.2609375,
|
|
"epoch": 0.1858685253519202,
|
|
"grad_norm": 0.11752854332197356,
|
|
"learning_rate": 4.676271664083416e-06,
|
|
"loss": 1.275,
|
|
"mean_token_accuracy": 0.7004583597183227,
|
|
"num_tokens": 125689703.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 1.3234375,
|
|
"epoch": 0.18723520568539018,
|
|
"grad_norm": 0.21829814825312172,
|
|
"learning_rate": 4.6727490488938995e-06,
|
|
"loss": 1.3286,
|
|
"mean_token_accuracy": 0.6911531984806061,
|
|
"num_tokens": 126565228.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 1.2609375,
|
|
"epoch": 0.1886018860188602,
|
|
"grad_norm": 0.1054342887227756,
|
|
"learning_rate": 4.669226433704382e-06,
|
|
"loss": 1.2601,
|
|
"mean_token_accuracy": 0.7024249792098999,
|
|
"num_tokens": 127484119.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.1899685663523302,
|
|
"grad_norm": 0.10946707654451954,
|
|
"learning_rate": 4.665703818514866e-06,
|
|
"loss": 1.2368,
|
|
"mean_token_accuracy": 0.708287650346756,
|
|
"num_tokens": 128382056.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 1.2421875,
|
|
"epoch": 0.19133524668580018,
|
|
"grad_norm": 0.10255426398748964,
|
|
"learning_rate": 4.6621812033253484e-06,
|
|
"loss": 1.2449,
|
|
"mean_token_accuracy": 0.7052888512611389,
|
|
"num_tokens": 129302334.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 1.155859375,
|
|
"epoch": 0.1927019270192702,
|
|
"grad_norm": 0.11258920300478263,
|
|
"learning_rate": 4.658658588135833e-06,
|
|
"loss": 1.1561,
|
|
"mean_token_accuracy": 0.7213032662868499,
|
|
"num_tokens": 130215453.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 1.21875,
|
|
"epoch": 0.1940686073527402,
|
|
"grad_norm": 0.13135842711084292,
|
|
"learning_rate": 4.6551359729463155e-06,
|
|
"loss": 1.2083,
|
|
"mean_token_accuracy": 0.7126948714256287,
|
|
"num_tokens": 131191399.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 1.2921875,
|
|
"epoch": 0.19543528768621019,
|
|
"grad_norm": 0.12239083692115234,
|
|
"learning_rate": 4.651613357756799e-06,
|
|
"loss": 1.3078,
|
|
"mean_token_accuracy": 0.6917528867721557,
|
|
"num_tokens": 132122523.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 1.284375,
|
|
"epoch": 0.1968019680196802,
|
|
"grad_norm": 0.13545360772696502,
|
|
"learning_rate": 4.648090742567283e-06,
|
|
"loss": 1.3059,
|
|
"mean_token_accuracy": 0.6952866017818451,
|
|
"num_tokens": 133055902.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 1.26328125,
|
|
"epoch": 0.1981686483531502,
|
|
"grad_norm": 0.11919457406354442,
|
|
"learning_rate": 4.644568127377765e-06,
|
|
"loss": 1.2726,
|
|
"mean_token_accuracy": 0.6994832754135132,
|
|
"num_tokens": 134007365.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 1.259375,
|
|
"epoch": 0.1995353286866202,
|
|
"grad_norm": 0.11207349116600387,
|
|
"learning_rate": 4.641045512188249e-06,
|
|
"loss": 1.2674,
|
|
"mean_token_accuracy": 0.7021160900592804,
|
|
"num_tokens": 134939937.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.2009020090200902,
|
|
"grad_norm": 0.14587384587008098,
|
|
"learning_rate": 4.6375228969987324e-06,
|
|
"loss": 1.2316,
|
|
"mean_token_accuracy": 0.7047403156757355,
|
|
"num_tokens": 135883157.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 1.25234375,
|
|
"epoch": 0.2022686893535602,
|
|
"grad_norm": 0.12356593863883758,
|
|
"learning_rate": 4.634000281809216e-06,
|
|
"loss": 1.2481,
|
|
"mean_token_accuracy": 0.7056151628494263,
|
|
"num_tokens": 136826502.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 1.24375,
|
|
"epoch": 0.2036353696870302,
|
|
"grad_norm": 0.12211924269097764,
|
|
"learning_rate": 4.630477666619699e-06,
|
|
"loss": 1.2367,
|
|
"mean_token_accuracy": 0.7062997400760651,
|
|
"num_tokens": 137729172.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 0.2050020500205002,
|
|
"grad_norm": 0.11464412079923114,
|
|
"learning_rate": 4.626955051430182e-06,
|
|
"loss": 1.2084,
|
|
"mean_token_accuracy": 0.7143740177154541,
|
|
"num_tokens": 138673916.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 1.275,
|
|
"epoch": 0.20636873035397021,
|
|
"grad_norm": 0.14305712211049257,
|
|
"learning_rate": 4.623432436240665e-06,
|
|
"loss": 1.2762,
|
|
"mean_token_accuracy": 0.7005420029163361,
|
|
"num_tokens": 139533756.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.2077354106874402,
|
|
"grad_norm": 0.20297751552005372,
|
|
"learning_rate": 4.619909821051149e-06,
|
|
"loss": 1.2265,
|
|
"mean_token_accuracy": 0.7075737714767456,
|
|
"num_tokens": 140388113.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 1.24140625,
|
|
"epoch": 0.2091020910209102,
|
|
"grad_norm": 0.11586691061198916,
|
|
"learning_rate": 4.616387205861632e-06,
|
|
"loss": 1.2569,
|
|
"mean_token_accuracy": 0.7015835523605347,
|
|
"num_tokens": 141300644.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.21046877135438022,
|
|
"grad_norm": 0.1344673679633541,
|
|
"learning_rate": 4.6128645906721156e-06,
|
|
"loss": 1.2353,
|
|
"mean_token_accuracy": 0.7071058988571167,
|
|
"num_tokens": 142178254.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 1.29375,
|
|
"epoch": 0.2118354516878502,
|
|
"grad_norm": 0.11720033399392564,
|
|
"learning_rate": 4.609341975482598e-06,
|
|
"loss": 1.3038,
|
|
"mean_token_accuracy": 0.6963693797588348,
|
|
"num_tokens": 143088982.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 1.25234375,
|
|
"epoch": 0.2132021320213202,
|
|
"grad_norm": 0.12694285193037882,
|
|
"learning_rate": 4.605819360293082e-06,
|
|
"loss": 1.2561,
|
|
"mean_token_accuracy": 0.7031785786151886,
|
|
"num_tokens": 144044472.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 0.21456881235479022,
|
|
"grad_norm": 0.1029570749193216,
|
|
"learning_rate": 4.602296745103565e-06,
|
|
"loss": 1.2058,
|
|
"mean_token_accuracy": 0.7131929576396943,
|
|
"num_tokens": 144956757.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 1.26796875,
|
|
"epoch": 0.2159354926882602,
|
|
"grad_norm": 0.1275030313293898,
|
|
"learning_rate": 4.598774129914049e-06,
|
|
"loss": 1.2968,
|
|
"mean_token_accuracy": 0.6970438599586487,
|
|
"num_tokens": 145879897.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 1.27734375,
|
|
"epoch": 0.21730217302173022,
|
|
"grad_norm": 0.12089795780364844,
|
|
"learning_rate": 4.595251514724532e-06,
|
|
"loss": 1.2696,
|
|
"mean_token_accuracy": 0.6978880345821381,
|
|
"num_tokens": 146783735.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 1.23828125,
|
|
"epoch": 0.21866885335520023,
|
|
"grad_norm": 0.11274623094420715,
|
|
"learning_rate": 4.591728899535015e-06,
|
|
"loss": 1.2278,
|
|
"mean_token_accuracy": 0.7075154900550842,
|
|
"num_tokens": 147702553.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 1.1953125,
|
|
"epoch": 0.2200355336886702,
|
|
"grad_norm": 0.12038984877895303,
|
|
"learning_rate": 4.588206284345499e-06,
|
|
"loss": 1.1993,
|
|
"mean_token_accuracy": 0.7121262907981872,
|
|
"num_tokens": 148628418.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 1.32109375,
|
|
"epoch": 0.22140221402214022,
|
|
"grad_norm": 0.12097813276166236,
|
|
"learning_rate": 4.584683669155981e-06,
|
|
"loss": 1.3322,
|
|
"mean_token_accuracy": 0.6892448484897613,
|
|
"num_tokens": 149568000.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 1.2921875,
|
|
"epoch": 0.22276889435561023,
|
|
"grad_norm": 0.12136240710153218,
|
|
"learning_rate": 4.581161053966465e-06,
|
|
"loss": 1.2932,
|
|
"mean_token_accuracy": 0.6988578021526337,
|
|
"num_tokens": 150546517.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 1.290625,
|
|
"epoch": 0.2241355746890802,
|
|
"grad_norm": 0.12119196652809988,
|
|
"learning_rate": 4.5776384387769485e-06,
|
|
"loss": 1.3133,
|
|
"mean_token_accuracy": 0.6948598921298981,
|
|
"num_tokens": 151479506.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 0.22550225502255022,
|
|
"grad_norm": 0.1264824650069849,
|
|
"learning_rate": 4.574115823587432e-06,
|
|
"loss": 1.232,
|
|
"mean_token_accuracy": 0.7089293897151947,
|
|
"num_tokens": 152405951.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 1.2375,
|
|
"epoch": 0.22686893535602023,
|
|
"grad_norm": 0.11712305646442983,
|
|
"learning_rate": 4.570593208397915e-06,
|
|
"loss": 1.2526,
|
|
"mean_token_accuracy": 0.7049263775348663,
|
|
"num_tokens": 153325504.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 1.25,
|
|
"epoch": 0.22823561568949022,
|
|
"grad_norm": 0.111837174069338,
|
|
"learning_rate": 4.567070593208398e-06,
|
|
"loss": 1.2469,
|
|
"mean_token_accuracy": 0.7047263503074646,
|
|
"num_tokens": 154226518.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 1.24921875,
|
|
"epoch": 0.22960229602296023,
|
|
"grad_norm": 0.11441091850849872,
|
|
"learning_rate": 4.563547978018882e-06,
|
|
"loss": 1.2538,
|
|
"mean_token_accuracy": 0.7022512376308441,
|
|
"num_tokens": 155166706.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 1.225,
|
|
"epoch": 0.23096897635643024,
|
|
"grad_norm": 0.10861278589372325,
|
|
"learning_rate": 4.560025362829365e-06,
|
|
"loss": 1.2305,
|
|
"mean_token_accuracy": 0.7053006887435913,
|
|
"num_tokens": 156080529.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 1.24375,
|
|
"epoch": 0.23233565668990022,
|
|
"grad_norm": 0.1215795321741291,
|
|
"learning_rate": 4.556502747639848e-06,
|
|
"loss": 1.2617,
|
|
"mean_token_accuracy": 0.7021601319313049,
|
|
"num_tokens": 157016731.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 1.26171875,
|
|
"epoch": 0.23370233702337023,
|
|
"grad_norm": 0.11238558054270573,
|
|
"learning_rate": 4.552980132450332e-06,
|
|
"loss": 1.2644,
|
|
"mean_token_accuracy": 0.6998307526111602,
|
|
"num_tokens": 157986519.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 1.22734375,
|
|
"epoch": 0.23506901735684024,
|
|
"grad_norm": 0.11453654229159906,
|
|
"learning_rate": 4.549457517260814e-06,
|
|
"loss": 1.2345,
|
|
"mean_token_accuracy": 0.7073581635951995,
|
|
"num_tokens": 158934641.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.23643569769031023,
|
|
"grad_norm": 0.11655753111935402,
|
|
"learning_rate": 4.545934902071298e-06,
|
|
"loss": 1.2406,
|
|
"mean_token_accuracy": 0.7061099112033844,
|
|
"num_tokens": 159912974.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 1.27734375,
|
|
"epoch": 0.23780237802378024,
|
|
"grad_norm": 0.13583728368485223,
|
|
"learning_rate": 4.5424122868817814e-06,
|
|
"loss": 1.2863,
|
|
"mean_token_accuracy": 0.6969979107379913,
|
|
"num_tokens": 160803757.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 1.21015625,
|
|
"epoch": 0.23916905835725025,
|
|
"grad_norm": 0.1181756374812874,
|
|
"learning_rate": 4.538889671692265e-06,
|
|
"loss": 1.2121,
|
|
"mean_token_accuracy": 0.713445633649826,
|
|
"num_tokens": 161701172.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 1.2484375,
|
|
"epoch": 0.24053573869072023,
|
|
"grad_norm": 0.12928047639072826,
|
|
"learning_rate": 4.535367056502748e-06,
|
|
"loss": 1.2508,
|
|
"mean_token_accuracy": 0.7035690426826477,
|
|
"num_tokens": 162650956.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 1.259375,
|
|
"epoch": 0.24190241902419024,
|
|
"grad_norm": 0.11583750597571059,
|
|
"learning_rate": 4.531844441313231e-06,
|
|
"loss": 1.2607,
|
|
"mean_token_accuracy": 0.7025727510452271,
|
|
"num_tokens": 163571702.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 0.24326909935766025,
|
|
"grad_norm": 0.11574184890023204,
|
|
"learning_rate": 4.528321826123715e-06,
|
|
"loss": 1.2068,
|
|
"mean_token_accuracy": 0.7122830092906952,
|
|
"num_tokens": 164471607.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 1.24921875,
|
|
"epoch": 0.24463577969113023,
|
|
"grad_norm": 0.10372922155409074,
|
|
"learning_rate": 4.524799210934198e-06,
|
|
"loss": 1.2483,
|
|
"mean_token_accuracy": 0.7045480966567993,
|
|
"num_tokens": 165399554.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 1.2484375,
|
|
"epoch": 0.24600246002460024,
|
|
"grad_norm": 0.11075426578073735,
|
|
"learning_rate": 4.521276595744681e-06,
|
|
"loss": 1.247,
|
|
"mean_token_accuracy": 0.7057838618755341,
|
|
"num_tokens": 166291849.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 1.259375,
|
|
"epoch": 0.24736914035807026,
|
|
"grad_norm": 0.18973187202003752,
|
|
"learning_rate": 4.5177539805551646e-06,
|
|
"loss": 1.2797,
|
|
"mean_token_accuracy": 0.6995975732803345,
|
|
"num_tokens": 167206313.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 1.2234375,
|
|
"epoch": 0.24873582069154024,
|
|
"grad_norm": 0.11712640844857243,
|
|
"learning_rate": 4.514231365365648e-06,
|
|
"loss": 1.234,
|
|
"mean_token_accuracy": 0.7071960091590881,
|
|
"num_tokens": 168088688.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 1.2328125,
|
|
"epoch": 0.25010250102501025,
|
|
"grad_norm": 0.12051854332375843,
|
|
"learning_rate": 4.510708750176131e-06,
|
|
"loss": 1.224,
|
|
"mean_token_accuracy": 0.708029282093048,
|
|
"num_tokens": 168991287.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 1.225,
|
|
"epoch": 0.25146918135848023,
|
|
"grad_norm": 0.11626500068050175,
|
|
"learning_rate": 4.507186134986614e-06,
|
|
"loss": 1.2202,
|
|
"mean_token_accuracy": 0.7097592830657959,
|
|
"num_tokens": 169961051.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 0.25283586169195027,
|
|
"grad_norm": 0.11397480137267096,
|
|
"learning_rate": 4.503663519797098e-06,
|
|
"loss": 1.2456,
|
|
"mean_token_accuracy": 0.703343003988266,
|
|
"num_tokens": 170889299.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 1.2203125,
|
|
"epoch": 0.25420254202542025,
|
|
"grad_norm": 0.1519230561342558,
|
|
"learning_rate": 4.5001409046075814e-06,
|
|
"loss": 1.2277,
|
|
"mean_token_accuracy": 0.7103578746318817,
|
|
"num_tokens": 171801507.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 0.25556922235889024,
|
|
"grad_norm": 0.10656497023375607,
|
|
"learning_rate": 4.496618289418064e-06,
|
|
"loss": 1.2133,
|
|
"mean_token_accuracy": 0.7096076488494873,
|
|
"num_tokens": 172756808.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 1.21875,
|
|
"epoch": 0.2569359026923603,
|
|
"grad_norm": 0.12441822304740713,
|
|
"learning_rate": 4.493095674228548e-06,
|
|
"loss": 1.2218,
|
|
"mean_token_accuracy": 0.7108271956443787,
|
|
"num_tokens": 173650984.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 1.275,
|
|
"epoch": 0.25830258302583026,
|
|
"grad_norm": 0.11334976370867855,
|
|
"learning_rate": 4.48957305903903e-06,
|
|
"loss": 1.2758,
|
|
"mean_token_accuracy": 0.6993201673030853,
|
|
"num_tokens": 174590986.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 1.246875,
|
|
"epoch": 0.25966926335930024,
|
|
"grad_norm": 0.11868841589377474,
|
|
"learning_rate": 4.486050443849515e-06,
|
|
"loss": 1.2548,
|
|
"mean_token_accuracy": 0.7053836345672607,
|
|
"num_tokens": 175502905.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 1.21875,
|
|
"epoch": 0.2610359436927703,
|
|
"grad_norm": 0.13194541336786514,
|
|
"learning_rate": 4.4825278286599975e-06,
|
|
"loss": 1.2303,
|
|
"mean_token_accuracy": 0.7070636808872223,
|
|
"num_tokens": 176434918.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 0.26240262402624026,
|
|
"grad_norm": 0.11886921564434713,
|
|
"learning_rate": 4.479005213470481e-06,
|
|
"loss": 1.2058,
|
|
"mean_token_accuracy": 0.7139345347881317,
|
|
"num_tokens": 177351387.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 1.24296875,
|
|
"epoch": 0.26376930435971024,
|
|
"grad_norm": 0.1260382241098783,
|
|
"learning_rate": 4.475482598280964e-06,
|
|
"loss": 1.2375,
|
|
"mean_token_accuracy": 0.7038226902484894,
|
|
"num_tokens": 178273072.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 0.2651359846931803,
|
|
"grad_norm": 0.10945196510393462,
|
|
"learning_rate": 4.471959983091447e-06,
|
|
"loss": 1.2101,
|
|
"mean_token_accuracy": 0.7117155194282532,
|
|
"num_tokens": 179201344.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 1.2328125,
|
|
"epoch": 0.26650266502665027,
|
|
"grad_norm": 0.11560791102260529,
|
|
"learning_rate": 4.468437367901931e-06,
|
|
"loss": 1.2431,
|
|
"mean_token_accuracy": 0.7073447406291962,
|
|
"num_tokens": 180128488.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 1.27734375,
|
|
"epoch": 0.26786934536012025,
|
|
"grad_norm": 0.13477704120988218,
|
|
"learning_rate": 4.464914752712414e-06,
|
|
"loss": 1.2729,
|
|
"mean_token_accuracy": 0.7016937911510468,
|
|
"num_tokens": 181044049.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 0.2692360256935903,
|
|
"grad_norm": 0.1205197525908966,
|
|
"learning_rate": 4.461392137522897e-06,
|
|
"loss": 1.2103,
|
|
"mean_token_accuracy": 0.7103444397449493,
|
|
"num_tokens": 181988674.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 0.27060270602706027,
|
|
"grad_norm": 0.11274915839630165,
|
|
"learning_rate": 4.457869522333381e-06,
|
|
"loss": 1.196,
|
|
"mean_token_accuracy": 0.7139041125774384,
|
|
"num_tokens": 182885151.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 0.27196938636053025,
|
|
"grad_norm": 0.14115034006865093,
|
|
"learning_rate": 4.454346907143864e-06,
|
|
"loss": 1.2277,
|
|
"mean_token_accuracy": 0.7105869233608246,
|
|
"num_tokens": 183799094.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 1.26796875,
|
|
"epoch": 0.2733360666940003,
|
|
"grad_norm": 0.12514237632279177,
|
|
"learning_rate": 4.450824291954347e-06,
|
|
"loss": 1.2712,
|
|
"mean_token_accuracy": 0.7012050211429596,
|
|
"num_tokens": 184737383.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 0.2747027470274703,
|
|
"grad_norm": 0.09677814079568298,
|
|
"learning_rate": 4.44730167676483e-06,
|
|
"loss": 1.2019,
|
|
"mean_token_accuracy": 0.7142653167247772,
|
|
"num_tokens": 185666325.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"entropy": 1.25390625,
|
|
"epoch": 0.27606942736094026,
|
|
"grad_norm": 0.12685234634198916,
|
|
"learning_rate": 4.443779061575314e-06,
|
|
"loss": 1.2637,
|
|
"mean_token_accuracy": 0.7005473852157593,
|
|
"num_tokens": 186596675.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 0.2774361076944103,
|
|
"grad_norm": 0.13342166451322687,
|
|
"learning_rate": 4.4402564463857975e-06,
|
|
"loss": 1.1932,
|
|
"mean_token_accuracy": 0.712081640958786,
|
|
"num_tokens": 187482907.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"entropy": 1.2296875,
|
|
"epoch": 0.2788027880278803,
|
|
"grad_norm": 0.1202325338802687,
|
|
"learning_rate": 4.43673383119628e-06,
|
|
"loss": 1.2535,
|
|
"mean_token_accuracy": 0.7029890775680542,
|
|
"num_tokens": 188385774.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"entropy": 1.225,
|
|
"epoch": 0.28016946836135026,
|
|
"grad_norm": 0.13205668949042706,
|
|
"learning_rate": 4.433211216006764e-06,
|
|
"loss": 1.2363,
|
|
"mean_token_accuracy": 0.7062061607837677,
|
|
"num_tokens": 189277151.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 0.2815361486948203,
|
|
"grad_norm": 0.11231419740846599,
|
|
"learning_rate": 4.4296886008172465e-06,
|
|
"loss": 1.2231,
|
|
"mean_token_accuracy": 0.7078758299350738,
|
|
"num_tokens": 190184956.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"entropy": 1.21484375,
|
|
"epoch": 0.2829028290282903,
|
|
"grad_norm": 0.11824215697067271,
|
|
"learning_rate": 4.426165985627731e-06,
|
|
"loss": 1.2244,
|
|
"mean_token_accuracy": 0.7103768825531006,
|
|
"num_tokens": 191146194.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 0.28426950936176026,
|
|
"grad_norm": 0.11780984115929422,
|
|
"learning_rate": 4.4226433704382136e-06,
|
|
"loss": 1.2024,
|
|
"mean_token_accuracy": 0.7145664989948273,
|
|
"num_tokens": 192033840.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 0.2856361896952303,
|
|
"grad_norm": 0.11411160362667996,
|
|
"learning_rate": 4.419120755248697e-06,
|
|
"loss": 1.2143,
|
|
"mean_token_accuracy": 0.7104967594146728,
|
|
"num_tokens": 192955176.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"entropy": 1.3015625,
|
|
"epoch": 0.2870028700287003,
|
|
"grad_norm": 0.1098865935244753,
|
|
"learning_rate": 4.41559814005918e-06,
|
|
"loss": 1.3199,
|
|
"mean_token_accuracy": 0.6928641438484192,
|
|
"num_tokens": 193892359.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 0.28836955036217027,
|
|
"grad_norm": 0.10882516472344989,
|
|
"learning_rate": 4.412075524869663e-06,
|
|
"loss": 1.2142,
|
|
"mean_token_accuracy": 0.7125491738319397,
|
|
"num_tokens": 194764643.0,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 0.2897362306956403,
|
|
"grad_norm": 0.10866670982873326,
|
|
"learning_rate": 4.408552909680147e-06,
|
|
"loss": 1.2374,
|
|
"mean_token_accuracy": 0.7093129515647888,
|
|
"num_tokens": 195672996.0,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"entropy": 1.23359375,
|
|
"epoch": 0.2911029110291103,
|
|
"grad_norm": 0.10479200134743032,
|
|
"learning_rate": 4.4050302944906304e-06,
|
|
"loss": 1.2439,
|
|
"mean_token_accuracy": 0.7049891710281372,
|
|
"num_tokens": 196608934.0,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"entropy": 1.221875,
|
|
"epoch": 0.2924695913625803,
|
|
"grad_norm": 0.2463812765217771,
|
|
"learning_rate": 4.401507679301113e-06,
|
|
"loss": 1.2166,
|
|
"mean_token_accuracy": 0.7089015126228333,
|
|
"num_tokens": 197498404.0,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"entropy": 1.296875,
|
|
"epoch": 0.2938362716960503,
|
|
"grad_norm": 0.12447272896619188,
|
|
"learning_rate": 4.397985064111597e-06,
|
|
"loss": 1.311,
|
|
"mean_token_accuracy": 0.6941071271896362,
|
|
"num_tokens": 198454218.0,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"entropy": 1.2328125,
|
|
"epoch": 0.2952029520295203,
|
|
"grad_norm": 0.1446477237455551,
|
|
"learning_rate": 4.39446244892208e-06,
|
|
"loss": 1.2336,
|
|
"mean_token_accuracy": 0.7065993070602417,
|
|
"num_tokens": 199374348.0,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"entropy": 1.23125,
|
|
"epoch": 0.2965696323629903,
|
|
"grad_norm": 0.1366402368806956,
|
|
"learning_rate": 4.390939833732563e-06,
|
|
"loss": 1.2388,
|
|
"mean_token_accuracy": 0.7058007836341857,
|
|
"num_tokens": 200221659.0,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.2979363126964603,
|
|
"grad_norm": 0.12790273228362115,
|
|
"learning_rate": 4.387417218543047e-06,
|
|
"loss": 1.2366,
|
|
"mean_token_accuracy": 0.7064303874969482,
|
|
"num_tokens": 201155187.0,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 0.2993029930299303,
|
|
"grad_norm": 0.11901216602819274,
|
|
"learning_rate": 4.38389460335353e-06,
|
|
"loss": 1.1765,
|
|
"mean_token_accuracy": 0.7179707825183869,
|
|
"num_tokens": 202067534.0,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"entropy": 1.234375,
|
|
"epoch": 0.3006696733634003,
|
|
"grad_norm": 0.10686910613181509,
|
|
"learning_rate": 4.380371988164014e-06,
|
|
"loss": 1.2431,
|
|
"mean_token_accuracy": 0.7037003576755524,
|
|
"num_tokens": 203044126.0,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 0.3020363536968703,
|
|
"grad_norm": 0.11027956143712456,
|
|
"learning_rate": 4.376849372974496e-06,
|
|
"loss": 1.223,
|
|
"mean_token_accuracy": 0.7111189782619476,
|
|
"num_tokens": 203996905.0,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 0.3034030340303403,
|
|
"grad_norm": 0.12978608162353986,
|
|
"learning_rate": 4.37332675778498e-06,
|
|
"loss": 1.201,
|
|
"mean_token_accuracy": 0.7120745122432709,
|
|
"num_tokens": 204929875.0,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 0.3047697143638103,
|
|
"grad_norm": 0.11546520868568194,
|
|
"learning_rate": 4.369804142595463e-06,
|
|
"loss": 1.2218,
|
|
"mean_token_accuracy": 0.7113225281238555,
|
|
"num_tokens": 205839908.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"entropy": 1.22890625,
|
|
"epoch": 0.3061363946972803,
|
|
"grad_norm": 0.11575292759732109,
|
|
"learning_rate": 4.366281527405947e-06,
|
|
"loss": 1.2417,
|
|
"mean_token_accuracy": 0.706443864107132,
|
|
"num_tokens": 206800232.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 0.3075030750307503,
|
|
"grad_norm": 0.13344276333039418,
|
|
"learning_rate": 4.36275891221643e-06,
|
|
"loss": 1.1714,
|
|
"mean_token_accuracy": 0.7159866094589233,
|
|
"num_tokens": 207736757.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"entropy": 1.184375,
|
|
"epoch": 0.3088697553642203,
|
|
"grad_norm": 0.12553595247017077,
|
|
"learning_rate": 4.359236297026913e-06,
|
|
"loss": 1.1931,
|
|
"mean_token_accuracy": 0.7139945566654206,
|
|
"num_tokens": 208650386.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 0.3102364356976903,
|
|
"grad_norm": 0.13741122740741044,
|
|
"learning_rate": 4.355713681837396e-06,
|
|
"loss": 1.1833,
|
|
"mean_token_accuracy": 0.7163913369178772,
|
|
"num_tokens": 209561366.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 0.3116031160311603,
|
|
"grad_norm": 0.11207335530286498,
|
|
"learning_rate": 4.352191066647879e-06,
|
|
"loss": 1.21,
|
|
"mean_token_accuracy": 0.7117487967014313,
|
|
"num_tokens": 210471900.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 0.3129697963646303,
|
|
"grad_norm": 0.13255210284043265,
|
|
"learning_rate": 4.348668451458363e-06,
|
|
"loss": 1.214,
|
|
"mean_token_accuracy": 0.7127776205539703,
|
|
"num_tokens": 211400549.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"entropy": 1.25703125,
|
|
"epoch": 0.31433647669810033,
|
|
"grad_norm": 0.11161323765865865,
|
|
"learning_rate": 4.3451458362688465e-06,
|
|
"loss": 1.2558,
|
|
"mean_token_accuracy": 0.7033869504928589,
|
|
"num_tokens": 212322958.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"entropy": 1.22890625,
|
|
"epoch": 0.3157031570315703,
|
|
"grad_norm": 0.10833588272240326,
|
|
"learning_rate": 4.34162322107933e-06,
|
|
"loss": 1.2244,
|
|
"mean_token_accuracy": 0.711439573764801,
|
|
"num_tokens": 213218481.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"entropy": 1.2296875,
|
|
"epoch": 0.3170698373650403,
|
|
"grad_norm": 0.11364277803801169,
|
|
"learning_rate": 4.338100605889813e-06,
|
|
"loss": 1.2279,
|
|
"mean_token_accuracy": 0.7098181307315826,
|
|
"num_tokens": 214104458.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 0.31843651769851034,
|
|
"grad_norm": 0.12007416608833255,
|
|
"learning_rate": 4.334577990700296e-06,
|
|
"loss": 1.1647,
|
|
"mean_token_accuracy": 0.7219733655452728,
|
|
"num_tokens": 214997474.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"entropy": 1.2515625,
|
|
"epoch": 0.3198031980319803,
|
|
"grad_norm": 0.2269245943352355,
|
|
"learning_rate": 4.33105537551078e-06,
|
|
"loss": 1.259,
|
|
"mean_token_accuracy": 0.7017202854156495,
|
|
"num_tokens": 215898945.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 0.3211698783654503,
|
|
"grad_norm": 0.11183231634909453,
|
|
"learning_rate": 4.327532760321263e-06,
|
|
"loss": 1.1803,
|
|
"mean_token_accuracy": 0.7193388164043426,
|
|
"num_tokens": 216780044.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"entropy": 1.209375,
|
|
"epoch": 0.32253655869892034,
|
|
"grad_norm": 0.12084723475645805,
|
|
"learning_rate": 4.324010145131746e-06,
|
|
"loss": 1.2123,
|
|
"mean_token_accuracy": 0.7096632719039917,
|
|
"num_tokens": 217686514.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.3239032390323903,
|
|
"grad_norm": 0.11142643725482235,
|
|
"learning_rate": 4.32048752994223e-06,
|
|
"loss": 1.2306,
|
|
"mean_token_accuracy": 0.7080573976039887,
|
|
"num_tokens": 218646157.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"entropy": 1.23125,
|
|
"epoch": 0.3252699193658603,
|
|
"grad_norm": 0.10528294781606598,
|
|
"learning_rate": 4.316964914752712e-06,
|
|
"loss": 1.2334,
|
|
"mean_token_accuracy": 0.7093044757843018,
|
|
"num_tokens": 219565771.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 0.32663659969933034,
|
|
"grad_norm": 0.12843247982389333,
|
|
"learning_rate": 4.313442299563196e-06,
|
|
"loss": 1.2418,
|
|
"mean_token_accuracy": 0.7083779156208039,
|
|
"num_tokens": 220446709.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 0.3280032800328003,
|
|
"grad_norm": 0.12812790822227052,
|
|
"learning_rate": 4.3099196843736794e-06,
|
|
"loss": 1.2136,
|
|
"mean_token_accuracy": 0.7105535268783569,
|
|
"num_tokens": 221335909.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"entropy": 1.2234375,
|
|
"epoch": 0.3293699603662703,
|
|
"grad_norm": 0.11108766415921036,
|
|
"learning_rate": 4.306397069184163e-06,
|
|
"loss": 1.22,
|
|
"mean_token_accuracy": 0.707669323682785,
|
|
"num_tokens": 222251385.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"entropy": 1.237109375,
|
|
"epoch": 0.33073664069974035,
|
|
"grad_norm": 0.13240602749217906,
|
|
"learning_rate": 4.302874453994646e-06,
|
|
"loss": 1.246,
|
|
"mean_token_accuracy": 0.7040450394153595,
|
|
"num_tokens": 223160354.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"entropy": 1.2015625,
|
|
"epoch": 0.33210332103321033,
|
|
"grad_norm": 0.11253030245534361,
|
|
"learning_rate": 4.299351838805129e-06,
|
|
"loss": 1.2111,
|
|
"mean_token_accuracy": 0.7120585322380066,
|
|
"num_tokens": 224079742.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"entropy": 1.23125,
|
|
"epoch": 0.3334700013666803,
|
|
"grad_norm": 0.12067290521300841,
|
|
"learning_rate": 4.295829223615613e-06,
|
|
"loss": 1.239,
|
|
"mean_token_accuracy": 0.707594096660614,
|
|
"num_tokens": 225006893.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"entropy": 1.2390625,
|
|
"epoch": 0.33483668170015035,
|
|
"grad_norm": 0.11486008610530264,
|
|
"learning_rate": 4.292306608426096e-06,
|
|
"loss": 1.2312,
|
|
"mean_token_accuracy": 0.7086809694766998,
|
|
"num_tokens": 225912700.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 0.33620336203362033,
|
|
"grad_norm": 0.12195001844261555,
|
|
"learning_rate": 4.288783993236579e-06,
|
|
"loss": 1.1851,
|
|
"mean_token_accuracy": 0.7163573622703552,
|
|
"num_tokens": 226846501.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 0.3375700423670903,
|
|
"grad_norm": 0.12005245889075568,
|
|
"learning_rate": 4.285261378047063e-06,
|
|
"loss": 1.2261,
|
|
"mean_token_accuracy": 0.7102294027805328,
|
|
"num_tokens": 227775064.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 1.2234375,
|
|
"epoch": 0.33893672270056036,
|
|
"grad_norm": 0.12383690679452089,
|
|
"learning_rate": 4.281738762857546e-06,
|
|
"loss": 1.2351,
|
|
"mean_token_accuracy": 0.7045800745487213,
|
|
"num_tokens": 228705788.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 1.275,
|
|
"epoch": 0.34030340303403034,
|
|
"grad_norm": 0.11654133716901861,
|
|
"learning_rate": 4.278216147668029e-06,
|
|
"loss": 1.2764,
|
|
"mean_token_accuracy": 0.6989609181880951,
|
|
"num_tokens": 229637091.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 1.2203125,
|
|
"epoch": 0.3416700833675003,
|
|
"grad_norm": 0.12141160564553213,
|
|
"learning_rate": 4.274693532478512e-06,
|
|
"loss": 1.2271,
|
|
"mean_token_accuracy": 0.7076741099357605,
|
|
"num_tokens": 230557197.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 0.34303676370097036,
|
|
"grad_norm": 0.1053563718610391,
|
|
"learning_rate": 4.271170917288996e-06,
|
|
"loss": 1.1741,
|
|
"mean_token_accuracy": 0.7200338065624237,
|
|
"num_tokens": 231469955.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 0.34440344403444034,
|
|
"grad_norm": 0.11227460620650129,
|
|
"learning_rate": 4.2676483020994795e-06,
|
|
"loss": 1.1995,
|
|
"mean_token_accuracy": 0.714777284860611,
|
|
"num_tokens": 232397901.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 1.23984375,
|
|
"epoch": 0.3457701243679103,
|
|
"grad_norm": 0.11552577057591451,
|
|
"learning_rate": 4.264125686909962e-06,
|
|
"loss": 1.2555,
|
|
"mean_token_accuracy": 0.7049373269081116,
|
|
"num_tokens": 233361773.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 0.34713680470138036,
|
|
"grad_norm": 0.11828892613353097,
|
|
"learning_rate": 4.260603071720446e-06,
|
|
"loss": 1.2218,
|
|
"mean_token_accuracy": 0.7103797733783722,
|
|
"num_tokens": 234268527.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 1.24296875,
|
|
"epoch": 0.34850348503485035,
|
|
"grad_norm": 0.14272668487569073,
|
|
"learning_rate": 4.257080456530928e-06,
|
|
"loss": 1.2491,
|
|
"mean_token_accuracy": 0.7055578172206879,
|
|
"num_tokens": 235186978.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 0.34987016536832033,
|
|
"grad_norm": 0.11862282093605896,
|
|
"learning_rate": 4.253557841341413e-06,
|
|
"loss": 1.1789,
|
|
"mean_token_accuracy": 0.7168623864650726,
|
|
"num_tokens": 236119762.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 1.209375,
|
|
"epoch": 0.35123684570179037,
|
|
"grad_norm": 0.12485443969166463,
|
|
"learning_rate": 4.2500352261518955e-06,
|
|
"loss": 1.2134,
|
|
"mean_token_accuracy": 0.7092878937721252,
|
|
"num_tokens": 237045069.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 0.35260352603526035,
|
|
"grad_norm": 0.11953568199889339,
|
|
"learning_rate": 4.246512610962379e-06,
|
|
"loss": 1.1534,
|
|
"mean_token_accuracy": 0.7228009521961212,
|
|
"num_tokens": 237966003.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.35397020636873033,
|
|
"grad_norm": 0.12084189725520857,
|
|
"learning_rate": 4.242989995772862e-06,
|
|
"loss": 1.2291,
|
|
"mean_token_accuracy": 0.709770941734314,
|
|
"num_tokens": 238847792.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 0.35533688670220037,
|
|
"grad_norm": 0.11935594814398096,
|
|
"learning_rate": 4.239467380583345e-06,
|
|
"loss": 1.2386,
|
|
"mean_token_accuracy": 0.7061507999897003,
|
|
"num_tokens": 239817460.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 0.35670356703567035,
|
|
"grad_norm": 0.11715003746147501,
|
|
"learning_rate": 4.235944765393829e-06,
|
|
"loss": 1.1877,
|
|
"mean_token_accuracy": 0.7155265510082245,
|
|
"num_tokens": 240730875.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 0.35807024736914034,
|
|
"grad_norm": 0.11917742363461516,
|
|
"learning_rate": 4.232422150204312e-06,
|
|
"loss": 1.2314,
|
|
"mean_token_accuracy": 0.7052329897880554,
|
|
"num_tokens": 241618927.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 1.27890625,
|
|
"epoch": 0.3594369277026104,
|
|
"grad_norm": 0.11878306107182499,
|
|
"learning_rate": 4.228899535014795e-06,
|
|
"loss": 1.2832,
|
|
"mean_token_accuracy": 0.6976205408573151,
|
|
"num_tokens": 242506323.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 1.20859375,
|
|
"epoch": 0.36080360803608036,
|
|
"grad_norm": 0.11747890583910714,
|
|
"learning_rate": 4.225376919825279e-06,
|
|
"loss": 1.2176,
|
|
"mean_token_accuracy": 0.7101323366165161,
|
|
"num_tokens": 243419790.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 1.21015625,
|
|
"epoch": 0.36217028836955034,
|
|
"grad_norm": 0.11121326458236319,
|
|
"learning_rate": 4.221854304635762e-06,
|
|
"loss": 1.2084,
|
|
"mean_token_accuracy": 0.709800523519516,
|
|
"num_tokens": 244337605.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 1.26953125,
|
|
"epoch": 0.3635369687030204,
|
|
"grad_norm": 0.1245208390718137,
|
|
"learning_rate": 4.218331689446245e-06,
|
|
"loss": 1.2887,
|
|
"mean_token_accuracy": 0.6973680019378662,
|
|
"num_tokens": 245269588.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 0.36490364903649036,
|
|
"grad_norm": 0.11518069975595381,
|
|
"learning_rate": 4.2148090742567284e-06,
|
|
"loss": 1.1881,
|
|
"mean_token_accuracy": 0.7169245898723602,
|
|
"num_tokens": 246183522.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 0.36627032936996035,
|
|
"grad_norm": 0.11429037769033079,
|
|
"learning_rate": 4.211286459067212e-06,
|
|
"loss": 1.2286,
|
|
"mean_token_accuracy": 0.7041479349136353,
|
|
"num_tokens": 247159185.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 0.3676370097034304,
|
|
"grad_norm": 0.12335155663354345,
|
|
"learning_rate": 4.2077638438776955e-06,
|
|
"loss": 1.1529,
|
|
"mean_token_accuracy": 0.7237085461616516,
|
|
"num_tokens": 248044696.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 0.36900369003690037,
|
|
"grad_norm": 0.11189889094970047,
|
|
"learning_rate": 4.204241228688178e-06,
|
|
"loss": 1.1898,
|
|
"mean_token_accuracy": 0.7170444011688233,
|
|
"num_tokens": 249045044.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 0.37037037037037035,
|
|
"grad_norm": 0.11382112110069235,
|
|
"learning_rate": 4.200718613498662e-06,
|
|
"loss": 1.1965,
|
|
"mean_token_accuracy": 0.7144470691680909,
|
|
"num_tokens": 249974276.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 0.3717370507038404,
|
|
"grad_norm": 0.11853008668983502,
|
|
"learning_rate": 4.1971959983091445e-06,
|
|
"loss": 1.1955,
|
|
"mean_token_accuracy": 0.7156452000141144,
|
|
"num_tokens": 250873544.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 0.37310373103731037,
|
|
"grad_norm": 0.11917532554480921,
|
|
"learning_rate": 4.193673383119629e-06,
|
|
"loss": 1.1675,
|
|
"mean_token_accuracy": 0.7191675782203675,
|
|
"num_tokens": 251819879.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 1.25703125,
|
|
"epoch": 0.37447041137078035,
|
|
"grad_norm": 0.11386756960276433,
|
|
"learning_rate": 4.190150767930112e-06,
|
|
"loss": 1.2696,
|
|
"mean_token_accuracy": 0.7022183835506439,
|
|
"num_tokens": 252807631.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 1.271875,
|
|
"epoch": 0.3758370917042504,
|
|
"grad_norm": 0.11676557052482973,
|
|
"learning_rate": 4.186628152740595e-06,
|
|
"loss": 1.279,
|
|
"mean_token_accuracy": 0.6998183012008667,
|
|
"num_tokens": 253747908.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 0.3772037720377204,
|
|
"grad_norm": 0.1203453282692222,
|
|
"learning_rate": 4.183105537551078e-06,
|
|
"loss": 1.1884,
|
|
"mean_token_accuracy": 0.7147728085517884,
|
|
"num_tokens": 254680625.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 1.246875,
|
|
"epoch": 0.37857045237119036,
|
|
"grad_norm": 0.12668139378971868,
|
|
"learning_rate": 4.179582922361561e-06,
|
|
"loss": 1.2606,
|
|
"mean_token_accuracy": 0.700765335559845,
|
|
"num_tokens": 255573141.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 1.23203125,
|
|
"epoch": 0.3799371327046604,
|
|
"grad_norm": 0.11833151993804719,
|
|
"learning_rate": 4.176060307172045e-06,
|
|
"loss": 1.2355,
|
|
"mean_token_accuracy": 0.706069964170456,
|
|
"num_tokens": 256517622.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 1.20625,
|
|
"epoch": 0.3813038130381304,
|
|
"grad_norm": 0.11503083766204669,
|
|
"learning_rate": 4.1725376919825285e-06,
|
|
"loss": 1.2065,
|
|
"mean_token_accuracy": 0.7133639216423034,
|
|
"num_tokens": 257406301.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 0.38267049337160036,
|
|
"grad_norm": 0.11575321341355244,
|
|
"learning_rate": 4.169015076793011e-06,
|
|
"loss": 1.2163,
|
|
"mean_token_accuracy": 0.7110467731952668,
|
|
"num_tokens": 258296600.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 0.3840371737050704,
|
|
"grad_norm": 0.11796419411398186,
|
|
"learning_rate": 4.165492461603495e-06,
|
|
"loss": 1.1832,
|
|
"mean_token_accuracy": 0.7158100306987762,
|
|
"num_tokens": 259223724.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 1.1734375,
|
|
"epoch": 0.3854038540385404,
|
|
"grad_norm": 0.14971311816805744,
|
|
"learning_rate": 4.161969846413978e-06,
|
|
"loss": 1.181,
|
|
"mean_token_accuracy": 0.7173164546489715,
|
|
"num_tokens": 260114709.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 1.141015625,
|
|
"epoch": 0.38677053437201037,
|
|
"grad_norm": 0.10434771262484813,
|
|
"learning_rate": 4.158447231224461e-06,
|
|
"loss": 1.1463,
|
|
"mean_token_accuracy": 0.7233933866024017,
|
|
"num_tokens": 261095644.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 0.3881372147054804,
|
|
"grad_norm": 0.22953133616197297,
|
|
"learning_rate": 4.1549246160349445e-06,
|
|
"loss": 1.2192,
|
|
"mean_token_accuracy": 0.7088740646839142,
|
|
"num_tokens": 262062947.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 0.3895038950389504,
|
|
"grad_norm": 0.11885981760225418,
|
|
"learning_rate": 4.151402000845428e-06,
|
|
"loss": 1.2112,
|
|
"mean_token_accuracy": 0.7109782636165619,
|
|
"num_tokens": 263008190.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 0.39087057537242037,
|
|
"grad_norm": 0.10194101188338277,
|
|
"learning_rate": 4.147879385655912e-06,
|
|
"loss": 1.1719,
|
|
"mean_token_accuracy": 0.7197973072528839,
|
|
"num_tokens": 263918362.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 0.3922372557058904,
|
|
"grad_norm": 0.12345415130565106,
|
|
"learning_rate": 4.144356770466394e-06,
|
|
"loss": 1.1897,
|
|
"mean_token_accuracy": 0.7126540601253509,
|
|
"num_tokens": 264804628.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 1.27734375,
|
|
"epoch": 0.3936039360393604,
|
|
"grad_norm": 0.11591416511856474,
|
|
"learning_rate": 4.140834155276878e-06,
|
|
"loss": 1.2763,
|
|
"mean_token_accuracy": 0.6998351395130158,
|
|
"num_tokens": 265741733.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 0.3949706163728304,
|
|
"grad_norm": 0.10817228662088077,
|
|
"learning_rate": 4.137311540087361e-06,
|
|
"loss": 1.234,
|
|
"mean_token_accuracy": 0.7093201756477356,
|
|
"num_tokens": 266715918.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 1.25546875,
|
|
"epoch": 0.3963372967063004,
|
|
"grad_norm": 0.47234869716593475,
|
|
"learning_rate": 4.133788924897845e-06,
|
|
"loss": 1.2476,
|
|
"mean_token_accuracy": 0.7055683612823487,
|
|
"num_tokens": 267644209.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 1.15546875,
|
|
"epoch": 0.3977039770397704,
|
|
"grad_norm": 0.10988257144539172,
|
|
"learning_rate": 4.130266309708328e-06,
|
|
"loss": 1.1584,
|
|
"mean_token_accuracy": 0.7235127747058868,
|
|
"num_tokens": 268556439.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 1.2515625,
|
|
"epoch": 0.3990706573732404,
|
|
"grad_norm": 0.1333696061680535,
|
|
"learning_rate": 4.126743694518811e-06,
|
|
"loss": 1.2505,
|
|
"mean_token_accuracy": 0.704085236787796,
|
|
"num_tokens": 269504566.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 0.4004373377067104,
|
|
"grad_norm": 0.12154038639678431,
|
|
"learning_rate": 4.123221079329294e-06,
|
|
"loss": 1.1895,
|
|
"mean_token_accuracy": 0.7149413347244262,
|
|
"num_tokens": 270367741.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 0.4018040180401804,
|
|
"grad_norm": 0.1268580275941906,
|
|
"learning_rate": 4.1196984641397774e-06,
|
|
"loss": 1.2296,
|
|
"mean_token_accuracy": 0.7079175114631653,
|
|
"num_tokens": 271271744.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 1.169140625,
|
|
"epoch": 0.4031706983736504,
|
|
"grad_norm": 0.12292168374204256,
|
|
"learning_rate": 4.116175848950261e-06,
|
|
"loss": 1.1724,
|
|
"mean_token_accuracy": 0.722224086523056,
|
|
"num_tokens": 272171745.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 0.4045373787071204,
|
|
"grad_norm": 0.1258877286489614,
|
|
"learning_rate": 4.1126532337607445e-06,
|
|
"loss": 1.1797,
|
|
"mean_token_accuracy": 0.7176162540912628,
|
|
"num_tokens": 273016295.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.4059040590405904,
|
|
"grad_norm": 0.14022058252869884,
|
|
"learning_rate": 4.109130618571227e-06,
|
|
"loss": 1.2256,
|
|
"mean_token_accuracy": 0.7095972955226898,
|
|
"num_tokens": 273956028.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 0.4072707393740604,
|
|
"grad_norm": 0.12905623218032924,
|
|
"learning_rate": 4.105608003381711e-06,
|
|
"loss": 1.2172,
|
|
"mean_token_accuracy": 0.7056478261947632,
|
|
"num_tokens": 274864033.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 1.1625,
|
|
"epoch": 0.4086374197075304,
|
|
"grad_norm": 0.12818451697275637,
|
|
"learning_rate": 4.102085388192194e-06,
|
|
"loss": 1.1592,
|
|
"mean_token_accuracy": 0.7219943702220917,
|
|
"num_tokens": 275771714.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 1.20625,
|
|
"epoch": 0.4100041000410004,
|
|
"grad_norm": 0.13113954180739548,
|
|
"learning_rate": 4.098562773002678e-06,
|
|
"loss": 1.2128,
|
|
"mean_token_accuracy": 0.7069997549057007,
|
|
"num_tokens": 276671468.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 0.4113707803744704,
|
|
"grad_norm": 0.11849935741616247,
|
|
"learning_rate": 4.095040157813161e-06,
|
|
"loss": 1.1783,
|
|
"mean_token_accuracy": 0.7181065678596497,
|
|
"num_tokens": 277629191.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.41273746070794043,
|
|
"grad_norm": 0.11866154485799126,
|
|
"learning_rate": 4.091517542623644e-06,
|
|
"loss": 1.2422,
|
|
"mean_token_accuracy": 0.7059893846511841,
|
|
"num_tokens": 278530380.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"entropy": 1.25625,
|
|
"epoch": 0.4141041410414104,
|
|
"grad_norm": 0.1294452839900264,
|
|
"learning_rate": 4.087994927434128e-06,
|
|
"loss": 1.2754,
|
|
"mean_token_accuracy": 0.6988065004348755,
|
|
"num_tokens": 279444562.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 0.4154708213748804,
|
|
"grad_norm": 0.11638858306057102,
|
|
"learning_rate": 4.08447231224461e-06,
|
|
"loss": 1.2276,
|
|
"mean_token_accuracy": 0.7070161581039429,
|
|
"num_tokens": 280334745.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 0.41683750170835043,
|
|
"grad_norm": 0.12824815954937224,
|
|
"learning_rate": 4.080949697055094e-06,
|
|
"loss": 1.1597,
|
|
"mean_token_accuracy": 0.7196870088577271,
|
|
"num_tokens": 281213602.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 0.4182041820418204,
|
|
"grad_norm": 0.10612786285280862,
|
|
"learning_rate": 4.0774270818655775e-06,
|
|
"loss": 1.1745,
|
|
"mean_token_accuracy": 0.7197153091430664,
|
|
"num_tokens": 282155704.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"entropy": 1.2453125,
|
|
"epoch": 0.4195708623752904,
|
|
"grad_norm": 0.12696302258174705,
|
|
"learning_rate": 4.073904466676061e-06,
|
|
"loss": 1.2494,
|
|
"mean_token_accuracy": 0.7032497525215149,
|
|
"num_tokens": 283060115.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 0.42093754270876044,
|
|
"grad_norm": 0.13445075590942424,
|
|
"learning_rate": 4.070381851486544e-06,
|
|
"loss": 1.2277,
|
|
"mean_token_accuracy": 0.7112857103347778,
|
|
"num_tokens": 284015383.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 0.4223042230422304,
|
|
"grad_norm": 0.11530590305423223,
|
|
"learning_rate": 4.066859236297027e-06,
|
|
"loss": 1.1952,
|
|
"mean_token_accuracy": 0.7153125584125519,
|
|
"num_tokens": 284957286.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"entropy": 1.21328125,
|
|
"epoch": 0.4236709033757004,
|
|
"grad_norm": 0.13445483719806908,
|
|
"learning_rate": 4.06333662110751e-06,
|
|
"loss": 1.2133,
|
|
"mean_token_accuracy": 0.7126824796199799,
|
|
"num_tokens": 285849739.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"entropy": 1.2515625,
|
|
"epoch": 0.42503758370917044,
|
|
"grad_norm": 0.12316885361110538,
|
|
"learning_rate": 4.059814005917994e-06,
|
|
"loss": 1.2693,
|
|
"mean_token_accuracy": 0.7024375021457672,
|
|
"num_tokens": 286816194.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 0.4264042640426404,
|
|
"grad_norm": 0.11422229962413366,
|
|
"learning_rate": 4.056291390728477e-06,
|
|
"loss": 1.1954,
|
|
"mean_token_accuracy": 0.7162624418735504,
|
|
"num_tokens": 287733305.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"entropy": 1.25234375,
|
|
"epoch": 0.4277709443761104,
|
|
"grad_norm": 0.12464642742672141,
|
|
"learning_rate": 4.052768775538961e-06,
|
|
"loss": 1.2682,
|
|
"mean_token_accuracy": 0.7052128136157989,
|
|
"num_tokens": 288676265.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"entropy": 1.21953125,
|
|
"epoch": 0.42913762470958045,
|
|
"grad_norm": 0.1301322904806882,
|
|
"learning_rate": 4.049246160349444e-06,
|
|
"loss": 1.219,
|
|
"mean_token_accuracy": 0.7120357573032379,
|
|
"num_tokens": 289628921.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"entropy": 1.184375,
|
|
"epoch": 0.43050430504305043,
|
|
"grad_norm": 0.10351735392792755,
|
|
"learning_rate": 4.045723545159927e-06,
|
|
"loss": 1.187,
|
|
"mean_token_accuracy": 0.7169369280338287,
|
|
"num_tokens": 290508054.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 0.4318709853765204,
|
|
"grad_norm": 0.10555269010759614,
|
|
"learning_rate": 4.04220092997041e-06,
|
|
"loss": 1.1524,
|
|
"mean_token_accuracy": 0.7243097364902497,
|
|
"num_tokens": 291445061.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"entropy": 1.18984375,
|
|
"epoch": 0.43323766570999045,
|
|
"grad_norm": 0.17583776411189217,
|
|
"learning_rate": 4.038678314780894e-06,
|
|
"loss": 1.1847,
|
|
"mean_token_accuracy": 0.7165256798267364,
|
|
"num_tokens": 292415043.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 0.43460434604346043,
|
|
"grad_norm": 0.11380231712914891,
|
|
"learning_rate": 4.0351556995913775e-06,
|
|
"loss": 1.2045,
|
|
"mean_token_accuracy": 0.7117973506450653,
|
|
"num_tokens": 293378677.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 0.4359710263769304,
|
|
"grad_norm": 0.11847267976409274,
|
|
"learning_rate": 4.03163308440186e-06,
|
|
"loss": 1.2131,
|
|
"mean_token_accuracy": 0.7111904859542847,
|
|
"num_tokens": 294289221.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 0.43733770671040045,
|
|
"grad_norm": 0.12368698936193516,
|
|
"learning_rate": 4.028110469212344e-06,
|
|
"loss": 1.2154,
|
|
"mean_token_accuracy": 0.7140913784503937,
|
|
"num_tokens": 295166106.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 0.43870438704387044,
|
|
"grad_norm": 0.11584148289955366,
|
|
"learning_rate": 4.0245878540228264e-06,
|
|
"loss": 1.1937,
|
|
"mean_token_accuracy": 0.7149182975292205,
|
|
"num_tokens": 296076562.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.4400710673773404,
|
|
"grad_norm": 0.11793743138084245,
|
|
"learning_rate": 4.02106523883331e-06,
|
|
"loss": 1.2325,
|
|
"mean_token_accuracy": 0.7091451168060303,
|
|
"num_tokens": 296967830.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 0.44143774771081046,
|
|
"grad_norm": 0.12324423356627937,
|
|
"learning_rate": 4.0175426236437935e-06,
|
|
"loss": 1.171,
|
|
"mean_token_accuracy": 0.7213197708129883,
|
|
"num_tokens": 297854033.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"entropy": 1.184375,
|
|
"epoch": 0.44280442804428044,
|
|
"grad_norm": 0.1380641746449623,
|
|
"learning_rate": 4.014020008454277e-06,
|
|
"loss": 1.1899,
|
|
"mean_token_accuracy": 0.715823358297348,
|
|
"num_tokens": 298775899.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 0.4441711083777504,
|
|
"grad_norm": 0.11293027117684723,
|
|
"learning_rate": 4.01049739326476e-06,
|
|
"loss": 1.1669,
|
|
"mean_token_accuracy": 0.7174931943416596,
|
|
"num_tokens": 299644998.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 0.44553778871122046,
|
|
"grad_norm": 0.11811021742515998,
|
|
"learning_rate": 4.006974778075243e-06,
|
|
"loss": 1.2005,
|
|
"mean_token_accuracy": 0.7147043943405151,
|
|
"num_tokens": 300579588.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 0.44690446904469044,
|
|
"grad_norm": 0.10304747891382542,
|
|
"learning_rate": 4.003452162885727e-06,
|
|
"loss": 1.2246,
|
|
"mean_token_accuracy": 0.705631148815155,
|
|
"num_tokens": 301539377.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 0.4482711493781604,
|
|
"grad_norm": 0.12077027618236748,
|
|
"learning_rate": 3.99992954769621e-06,
|
|
"loss": 1.1622,
|
|
"mean_token_accuracy": 0.7202878415584564,
|
|
"num_tokens": 302459676.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"entropy": 1.16328125,
|
|
"epoch": 0.44963782971163047,
|
|
"grad_norm": 0.11275012098029857,
|
|
"learning_rate": 3.996406932506693e-06,
|
|
"loss": 1.1676,
|
|
"mean_token_accuracy": 0.7190170645713806,
|
|
"num_tokens": 303424014.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 0.45100451004510045,
|
|
"grad_norm": 0.15718823402418142,
|
|
"learning_rate": 3.992884317317177e-06,
|
|
"loss": 1.155,
|
|
"mean_token_accuracy": 0.7214746952056885,
|
|
"num_tokens": 304396334.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"entropy": 1.2375,
|
|
"epoch": 0.45237119037857043,
|
|
"grad_norm": 0.13019822786766888,
|
|
"learning_rate": 3.98936170212766e-06,
|
|
"loss": 1.2485,
|
|
"mean_token_accuracy": 0.7061928153038025,
|
|
"num_tokens": 305340507.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 0.45373787071204047,
|
|
"grad_norm": 0.10698063414528024,
|
|
"learning_rate": 3.985839086938143e-06,
|
|
"loss": 1.1548,
|
|
"mean_token_accuracy": 0.7210987627506256,
|
|
"num_tokens": 306253804.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"entropy": 1.2109375,
|
|
"epoch": 0.45510455104551045,
|
|
"grad_norm": 0.12275187714264506,
|
|
"learning_rate": 3.9823164717486265e-06,
|
|
"loss": 1.2185,
|
|
"mean_token_accuracy": 0.710013210773468,
|
|
"num_tokens": 307219655.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 0.45647123137898044,
|
|
"grad_norm": 0.11368693476404895,
|
|
"learning_rate": 3.97879385655911e-06,
|
|
"loss": 1.2074,
|
|
"mean_token_accuracy": 0.7123259842395783,
|
|
"num_tokens": 308120248.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"entropy": 1.22265625,
|
|
"epoch": 0.4578379117124505,
|
|
"grad_norm": 0.12601176455803906,
|
|
"learning_rate": 3.9752712413695936e-06,
|
|
"loss": 1.228,
|
|
"mean_token_accuracy": 0.7091707348823547,
|
|
"num_tokens": 309033220.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"entropy": 1.2828125,
|
|
"epoch": 0.45920459204592046,
|
|
"grad_norm": 0.12091415673723657,
|
|
"learning_rate": 3.971748626180076e-06,
|
|
"loss": 1.2922,
|
|
"mean_token_accuracy": 0.6955231726169586,
|
|
"num_tokens": 309966122.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 0.46057127237939044,
|
|
"grad_norm": 0.11483298547122973,
|
|
"learning_rate": 3.96822601099056e-06,
|
|
"loss": 1.1945,
|
|
"mean_token_accuracy": 0.71350856423378,
|
|
"num_tokens": 310927570.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 0.4619379527128605,
|
|
"grad_norm": 0.11312866552885843,
|
|
"learning_rate": 3.9647033958010425e-06,
|
|
"loss": 1.2185,
|
|
"mean_token_accuracy": 0.7099091589450837,
|
|
"num_tokens": 311832124.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 0.46330463304633046,
|
|
"grad_norm": 0.1245275984400666,
|
|
"learning_rate": 3.961180780611527e-06,
|
|
"loss": 1.2133,
|
|
"mean_token_accuracy": 0.7086798250675201,
|
|
"num_tokens": 312752210.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.46467131337980044,
|
|
"grad_norm": 0.10994771125642451,
|
|
"learning_rate": 3.95765816542201e-06,
|
|
"loss": 1.2262,
|
|
"mean_token_accuracy": 0.7110893368721009,
|
|
"num_tokens": 313721040.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"entropy": 1.160546875,
|
|
"epoch": 0.4660379937132705,
|
|
"grad_norm": 0.11078132491530011,
|
|
"learning_rate": 3.954135550232493e-06,
|
|
"loss": 1.1578,
|
|
"mean_token_accuracy": 0.7199752151966095,
|
|
"num_tokens": 314623635.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"entropy": 1.21328125,
|
|
"epoch": 0.46740467404674046,
|
|
"grad_norm": 0.12107915996127346,
|
|
"learning_rate": 3.950612935042976e-06,
|
|
"loss": 1.2103,
|
|
"mean_token_accuracy": 0.7128358960151673,
|
|
"num_tokens": 315549717.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 0.46877135438021045,
|
|
"grad_norm": 0.12003839069305765,
|
|
"learning_rate": 3.947090319853459e-06,
|
|
"loss": 1.1568,
|
|
"mean_token_accuracy": 0.7218001902103424,
|
|
"num_tokens": 316459223.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"entropy": 1.25390625,
|
|
"epoch": 0.4701380347136805,
|
|
"grad_norm": 0.1273636089630686,
|
|
"learning_rate": 3.943567704663943e-06,
|
|
"loss": 1.2503,
|
|
"mean_token_accuracy": 0.706339418888092,
|
|
"num_tokens": 317359360.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 0.47150471504715047,
|
|
"grad_norm": 0.13429926366031464,
|
|
"learning_rate": 3.9400450894744265e-06,
|
|
"loss": 1.1961,
|
|
"mean_token_accuracy": 0.7135069906711579,
|
|
"num_tokens": 318239868.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"entropy": 1.228125,
|
|
"epoch": 0.47287139538062045,
|
|
"grad_norm": 0.12176879054778855,
|
|
"learning_rate": 3.936522474284909e-06,
|
|
"loss": 1.2472,
|
|
"mean_token_accuracy": 0.705737566947937,
|
|
"num_tokens": 319176882.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 0.4742380757140905,
|
|
"grad_norm": 0.12521352569904004,
|
|
"learning_rate": 3.932999859095393e-06,
|
|
"loss": 1.204,
|
|
"mean_token_accuracy": 0.713945335149765,
|
|
"num_tokens": 320165510.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 0.4756047560475605,
|
|
"grad_norm": 0.12396383417796053,
|
|
"learning_rate": 3.929477243905876e-06,
|
|
"loss": 1.1711,
|
|
"mean_token_accuracy": 0.7211913406848908,
|
|
"num_tokens": 321088771.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"entropy": 1.21875,
|
|
"epoch": 0.47697143638103046,
|
|
"grad_norm": 0.1312237774517091,
|
|
"learning_rate": 3.925954628716359e-06,
|
|
"loss": 1.2177,
|
|
"mean_token_accuracy": 0.7131180226802826,
|
|
"num_tokens": 322002114.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"entropy": 1.21015625,
|
|
"epoch": 0.4783381167145005,
|
|
"grad_norm": 0.11588487313887344,
|
|
"learning_rate": 3.9224320135268425e-06,
|
|
"loss": 1.2051,
|
|
"mean_token_accuracy": 0.7130066335201264,
|
|
"num_tokens": 322898306.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"entropy": 1.21328125,
|
|
"epoch": 0.4797047970479705,
|
|
"grad_norm": 0.1249978279629876,
|
|
"learning_rate": 3.918909398337326e-06,
|
|
"loss": 1.2185,
|
|
"mean_token_accuracy": 0.7103014886379242,
|
|
"num_tokens": 323816623.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 0.48107147738144046,
|
|
"grad_norm": 0.1170297966763624,
|
|
"learning_rate": 3.91538678314781e-06,
|
|
"loss": 1.182,
|
|
"mean_token_accuracy": 0.7187474191188812,
|
|
"num_tokens": 324757771.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"entropy": 1.2109375,
|
|
"epoch": 0.4824381577149105,
|
|
"grad_norm": 0.1153041918728577,
|
|
"learning_rate": 3.911864167958292e-06,
|
|
"loss": 1.202,
|
|
"mean_token_accuracy": 0.7148920953273773,
|
|
"num_tokens": 325695539.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 0.4838048380483805,
|
|
"grad_norm": 0.12097479461756408,
|
|
"learning_rate": 3.908341552768776e-06,
|
|
"loss": 1.1753,
|
|
"mean_token_accuracy": 0.718091470003128,
|
|
"num_tokens": 326619627.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 0.48517151838185046,
|
|
"grad_norm": 0.10353696149519344,
|
|
"learning_rate": 3.9048189375792586e-06,
|
|
"loss": 1.1751,
|
|
"mean_token_accuracy": 0.7153058230876923,
|
|
"num_tokens": 327522409.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 0.4865381987153205,
|
|
"grad_norm": 0.11904560399983277,
|
|
"learning_rate": 3.901296322389743e-06,
|
|
"loss": 1.2235,
|
|
"mean_token_accuracy": 0.7076343357563019,
|
|
"num_tokens": 328418711.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"entropy": 1.22265625,
|
|
"epoch": 0.4879048790487905,
|
|
"grad_norm": 0.12064230728449035,
|
|
"learning_rate": 3.897773707200226e-06,
|
|
"loss": 1.2152,
|
|
"mean_token_accuracy": 0.7103489995002746,
|
|
"num_tokens": 329322871.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 0.48927155938226047,
|
|
"grad_norm": 0.1223012461342541,
|
|
"learning_rate": 3.894251092010709e-06,
|
|
"loss": 1.1865,
|
|
"mean_token_accuracy": 0.7153882086277008,
|
|
"num_tokens": 330272623.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 0.4906382397157305,
|
|
"grad_norm": 0.126454843708084,
|
|
"learning_rate": 3.890728476821192e-06,
|
|
"loss": 1.1522,
|
|
"mean_token_accuracy": 0.7235879361629486,
|
|
"num_tokens": 331160163.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"entropy": 1.118359375,
|
|
"epoch": 0.4920049200492005,
|
|
"grad_norm": 0.11605134986235713,
|
|
"learning_rate": 3.8872058616316755e-06,
|
|
"loss": 1.1197,
|
|
"mean_token_accuracy": 0.7311794638633728,
|
|
"num_tokens": 332041350.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 0.49337160038267047,
|
|
"grad_norm": 0.11833815791151074,
|
|
"learning_rate": 3.883683246442159e-06,
|
|
"loss": 1.183,
|
|
"mean_token_accuracy": 0.7163702607154846,
|
|
"num_tokens": 332958062.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"entropy": 1.2015625,
|
|
"epoch": 0.4947382807161405,
|
|
"grad_norm": 0.11599248792095146,
|
|
"learning_rate": 3.8801606312526426e-06,
|
|
"loss": 1.2021,
|
|
"mean_token_accuracy": 0.7148635566234589,
|
|
"num_tokens": 333926373.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"entropy": 1.119140625,
|
|
"epoch": 0.4961049610496105,
|
|
"grad_norm": 0.1197372952764069,
|
|
"learning_rate": 3.876638016063125e-06,
|
|
"loss": 1.1098,
|
|
"mean_token_accuracy": 0.7318840801715851,
|
|
"num_tokens": 334867149.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 0.4974716413830805,
|
|
"grad_norm": 0.12115007223236485,
|
|
"learning_rate": 3.873115400873609e-06,
|
|
"loss": 1.1693,
|
|
"mean_token_accuracy": 0.7206172227859498,
|
|
"num_tokens": 335748006.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"entropy": 1.20546875,
|
|
"epoch": 0.4988383217165505,
|
|
"grad_norm": 0.12842729607822834,
|
|
"learning_rate": 3.869592785684092e-06,
|
|
"loss": 1.2112,
|
|
"mean_token_accuracy": 0.7137919485569,
|
|
"num_tokens": 336676648.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 0.5002050020500205,
|
|
"grad_norm": 0.10541860499862447,
|
|
"learning_rate": 3.866070170494575e-06,
|
|
"loss": 1.1881,
|
|
"mean_token_accuracy": 0.7180696606636048,
|
|
"num_tokens": 337574385.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 0.5015716823834905,
|
|
"grad_norm": 0.12337001669887439,
|
|
"learning_rate": 3.862547555305059e-06,
|
|
"loss": 1.2043,
|
|
"mean_token_accuracy": 0.7130369186401367,
|
|
"num_tokens": 338532100.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 0.5029383627169605,
|
|
"grad_norm": 0.133591218099603,
|
|
"learning_rate": 3.859024940115542e-06,
|
|
"loss": 1.2116,
|
|
"mean_token_accuracy": 0.7109639823436738,
|
|
"num_tokens": 339464473.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 0.5043050430504306,
|
|
"grad_norm": 0.11710419402705904,
|
|
"learning_rate": 3.855502324926026e-06,
|
|
"loss": 1.1363,
|
|
"mean_token_accuracy": 0.7278536021709442,
|
|
"num_tokens": 340432340.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 0.5056717233839005,
|
|
"grad_norm": 0.1167456718167425,
|
|
"learning_rate": 3.851979709736508e-06,
|
|
"loss": 1.1916,
|
|
"mean_token_accuracy": 0.7178699910640717,
|
|
"num_tokens": 341335395.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"entropy": 1.209375,
|
|
"epoch": 0.5070384037173705,
|
|
"grad_norm": 0.11208653867839308,
|
|
"learning_rate": 3.848457094546992e-06,
|
|
"loss": 1.2041,
|
|
"mean_token_accuracy": 0.7142843544483185,
|
|
"num_tokens": 342247450.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 0.5084050840508405,
|
|
"grad_norm": 0.11370429249981165,
|
|
"learning_rate": 3.8449344793574755e-06,
|
|
"loss": 1.1698,
|
|
"mean_token_accuracy": 0.7203823447227478,
|
|
"num_tokens": 343198674.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 0.5097717643843105,
|
|
"grad_norm": 0.10862145203660946,
|
|
"learning_rate": 3.841411864167959e-06,
|
|
"loss": 1.2051,
|
|
"mean_token_accuracy": 0.7124501585960388,
|
|
"num_tokens": 344106894.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 0.5111384447177805,
|
|
"grad_norm": 0.11672701444343671,
|
|
"learning_rate": 3.837889248978442e-06,
|
|
"loss": 1.1978,
|
|
"mean_token_accuracy": 0.7136788785457611,
|
|
"num_tokens": 345010494.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 0.5125051250512506,
|
|
"grad_norm": 0.12068459725437725,
|
|
"learning_rate": 3.834366633788925e-06,
|
|
"loss": 1.1905,
|
|
"mean_token_accuracy": 0.7165962815284729,
|
|
"num_tokens": 345934020.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 0.5138718053847205,
|
|
"grad_norm": 0.12681532421204417,
|
|
"learning_rate": 3.830844018599408e-06,
|
|
"loss": 1.2077,
|
|
"mean_token_accuracy": 0.7119507491588593,
|
|
"num_tokens": 346799355.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 0.5152384857181905,
|
|
"grad_norm": 0.11642313293634986,
|
|
"learning_rate": 3.8273214034098915e-06,
|
|
"loss": 1.2035,
|
|
"mean_token_accuracy": 0.7143694043159485,
|
|
"num_tokens": 347721206.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 0.5166051660516605,
|
|
"grad_norm": 0.13681486805320622,
|
|
"learning_rate": 3.823798788220375e-06,
|
|
"loss": 1.2123,
|
|
"mean_token_accuracy": 0.7108630180358887,
|
|
"num_tokens": 348616795.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 0.5179718463851305,
|
|
"grad_norm": 0.1344447634882914,
|
|
"learning_rate": 3.820276173030859e-06,
|
|
"loss": 1.1621,
|
|
"mean_token_accuracy": 0.7220582902431488,
|
|
"num_tokens": 349508928.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 0.5193385267186005,
|
|
"grad_norm": 0.1153021844017235,
|
|
"learning_rate": 3.816753557841341e-06,
|
|
"loss": 1.1892,
|
|
"mean_token_accuracy": 0.7152324378490448,
|
|
"num_tokens": 350404276.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 0.5207052070520706,
|
|
"grad_norm": 0.13017570295748743,
|
|
"learning_rate": 3.813230942651825e-06,
|
|
"loss": 1.1892,
|
|
"mean_token_accuracy": 0.7166761159896851,
|
|
"num_tokens": 351273919.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 0.5220718873855406,
|
|
"grad_norm": 0.10857780322011869,
|
|
"learning_rate": 3.809708327462308e-06,
|
|
"loss": 1.2027,
|
|
"mean_token_accuracy": 0.7120778381824493,
|
|
"num_tokens": 352170178.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"entropy": 1.20859375,
|
|
"epoch": 0.5234385677190105,
|
|
"grad_norm": 0.12054401755085783,
|
|
"learning_rate": 3.806185712272792e-06,
|
|
"loss": 1.218,
|
|
"mean_token_accuracy": 0.7123370110988617,
|
|
"num_tokens": 353108363.0,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"entropy": 1.26015625,
|
|
"epoch": 0.5248052480524805,
|
|
"grad_norm": 0.1256009018660128,
|
|
"learning_rate": 3.802663097083275e-06,
|
|
"loss": 1.2773,
|
|
"mean_token_accuracy": 0.6994435846805572,
|
|
"num_tokens": 353972875.0,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"entropy": 1.19375,
|
|
"epoch": 0.5261719283859505,
|
|
"grad_norm": 0.15189018085821035,
|
|
"learning_rate": 3.799140481893758e-06,
|
|
"loss": 1.1942,
|
|
"mean_token_accuracy": 0.715214216709137,
|
|
"num_tokens": 354864325.0,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 0.5275386087194205,
|
|
"grad_norm": 0.15567432850605153,
|
|
"learning_rate": 3.7956178667042413e-06,
|
|
"loss": 1.1721,
|
|
"mean_token_accuracy": 0.7210634291172028,
|
|
"num_tokens": 355743210.0,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"entropy": 1.150390625,
|
|
"epoch": 0.5289052890528906,
|
|
"grad_norm": 0.12304898827828249,
|
|
"learning_rate": 3.792095251514725e-06,
|
|
"loss": 1.148,
|
|
"mean_token_accuracy": 0.7217421233654022,
|
|
"num_tokens": 356680442.0,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 0.5302719693863606,
|
|
"grad_norm": 0.10996396338005872,
|
|
"learning_rate": 3.788572636325208e-06,
|
|
"loss": 1.1797,
|
|
"mean_token_accuracy": 0.7195286214351654,
|
|
"num_tokens": 357633513.0,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 0.5316386497198305,
|
|
"grad_norm": 0.1279906216274161,
|
|
"learning_rate": 3.7850500211356916e-06,
|
|
"loss": 1.1543,
|
|
"mean_token_accuracy": 0.7220454096794129,
|
|
"num_tokens": 358560575.0,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 0.5330053300533005,
|
|
"grad_norm": 0.11441091365927547,
|
|
"learning_rate": 3.7815274059461747e-06,
|
|
"loss": 1.2041,
|
|
"mean_token_accuracy": 0.7121405601501465,
|
|
"num_tokens": 359474308.0,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"entropy": 1.16484375,
|
|
"epoch": 0.5343720103867705,
|
|
"grad_norm": 0.10912040610982676,
|
|
"learning_rate": 3.7780047907566582e-06,
|
|
"loss": 1.1812,
|
|
"mean_token_accuracy": 0.7162030458450317,
|
|
"num_tokens": 360411784.0,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 0.5357386907202405,
|
|
"grad_norm": 0.15042228680126626,
|
|
"learning_rate": 3.7744821755671413e-06,
|
|
"loss": 1.183,
|
|
"mean_token_accuracy": 0.7194423079490662,
|
|
"num_tokens": 361350524.0,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 0.5371053710537106,
|
|
"grad_norm": 0.11541356764933358,
|
|
"learning_rate": 3.7709595603776245e-06,
|
|
"loss": 1.2263,
|
|
"mean_token_accuracy": 0.7103711426258087,
|
|
"num_tokens": 362283058.0,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"entropy": 1.20546875,
|
|
"epoch": 0.5384720513871806,
|
|
"grad_norm": 0.12957454728150575,
|
|
"learning_rate": 3.767436945188108e-06,
|
|
"loss": 1.2194,
|
|
"mean_token_accuracy": 0.7098959743976593,
|
|
"num_tokens": 363194527.0,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 0.5398387317206506,
|
|
"grad_norm": 0.1197051679783492,
|
|
"learning_rate": 3.7639143299985916e-06,
|
|
"loss": 1.1687,
|
|
"mean_token_accuracy": 0.7213878095149994,
|
|
"num_tokens": 364108504.0,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"entropy": 1.215625,
|
|
"epoch": 0.5412054120541205,
|
|
"grad_norm": 0.12006683096266546,
|
|
"learning_rate": 3.7603917148090747e-06,
|
|
"loss": 1.2302,
|
|
"mean_token_accuracy": 0.7099004983901978,
|
|
"num_tokens": 365029878.0,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 0.5425720923875905,
|
|
"grad_norm": 0.11020632606997828,
|
|
"learning_rate": 3.756869099619558e-06,
|
|
"loss": 1.1521,
|
|
"mean_token_accuracy": 0.724221932888031,
|
|
"num_tokens": 365964480.0,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 0.5439387727210605,
|
|
"grad_norm": 0.11935064419406534,
|
|
"learning_rate": 3.753346484430041e-06,
|
|
"loss": 1.1571,
|
|
"mean_token_accuracy": 0.7218895614147186,
|
|
"num_tokens": 366872780.0,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 0.5453054530545306,
|
|
"grad_norm": 0.11932813694985797,
|
|
"learning_rate": 3.749823869240524e-06,
|
|
"loss": 1.1623,
|
|
"mean_token_accuracy": 0.7223969042301178,
|
|
"num_tokens": 367776522.0,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"entropy": 1.141796875,
|
|
"epoch": 0.5466721333880006,
|
|
"grad_norm": 0.12269791267678166,
|
|
"learning_rate": 3.746301254051008e-06,
|
|
"loss": 1.1416,
|
|
"mean_token_accuracy": 0.7246180772781372,
|
|
"num_tokens": 368687371.0,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 0.5480388137214706,
|
|
"grad_norm": 0.11708242105420581,
|
|
"learning_rate": 3.742778638861491e-06,
|
|
"loss": 1.1571,
|
|
"mean_token_accuracy": 0.7212917745113373,
|
|
"num_tokens": 369596596.0,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 0.5494054940549405,
|
|
"grad_norm": 0.1188566539717013,
|
|
"learning_rate": 3.7392560236719743e-06,
|
|
"loss": 1.1896,
|
|
"mean_token_accuracy": 0.7149365246295929,
|
|
"num_tokens": 370474928.0,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"entropy": 1.1234375,
|
|
"epoch": 0.5507721743884105,
|
|
"grad_norm": 0.1319706225299887,
|
|
"learning_rate": 3.7357334084824574e-06,
|
|
"loss": 1.129,
|
|
"mean_token_accuracy": 0.7278317272663116,
|
|
"num_tokens": 371384674.0,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"entropy": 1.15078125,
|
|
"epoch": 0.5521388547218805,
|
|
"grad_norm": 0.12628008634304239,
|
|
"learning_rate": 3.732210793292941e-06,
|
|
"loss": 1.1415,
|
|
"mean_token_accuracy": 0.7246921241283417,
|
|
"num_tokens": 372309541.0,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"entropy": 1.2109375,
|
|
"epoch": 0.5535055350553506,
|
|
"grad_norm": 0.12815985866201018,
|
|
"learning_rate": 3.7286881781034245e-06,
|
|
"loss": 1.2168,
|
|
"mean_token_accuracy": 0.7103616178035737,
|
|
"num_tokens": 373247679.0,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 0.5548722153888206,
|
|
"grad_norm": 0.1172976074148134,
|
|
"learning_rate": 3.7251655629139076e-06,
|
|
"loss": 1.1692,
|
|
"mean_token_accuracy": 0.7199905931949615,
|
|
"num_tokens": 374199142.0,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 0.5562388957222906,
|
|
"grad_norm": 0.107186929094466,
|
|
"learning_rate": 3.7216429477243907e-06,
|
|
"loss": 1.2095,
|
|
"mean_token_accuracy": 0.7114291369915009,
|
|
"num_tokens": 375123594.0,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 0.5576055760557606,
|
|
"grad_norm": 0.11285351502755174,
|
|
"learning_rate": 3.7181203325348743e-06,
|
|
"loss": 1.1706,
|
|
"mean_token_accuracy": 0.7195236504077911,
|
|
"num_tokens": 376085352.0,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"entropy": 1.225,
|
|
"epoch": 0.5589722563892305,
|
|
"grad_norm": 0.12972153739628206,
|
|
"learning_rate": 3.7145977173453574e-06,
|
|
"loss": 1.2257,
|
|
"mean_token_accuracy": 0.708150464296341,
|
|
"num_tokens": 376981305.0,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 0.5603389367227005,
|
|
"grad_norm": 0.10948133378907594,
|
|
"learning_rate": 3.7110751021558405e-06,
|
|
"loss": 1.1705,
|
|
"mean_token_accuracy": 0.7182758450508118,
|
|
"num_tokens": 377879583.0,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 0.5617056170561706,
|
|
"grad_norm": 0.11757967330124698,
|
|
"learning_rate": 3.7075524869663245e-06,
|
|
"loss": 1.1673,
|
|
"mean_token_accuracy": 0.7183128833770752,
|
|
"num_tokens": 378832645.0,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 0.5630722973896406,
|
|
"grad_norm": 0.12711871204647285,
|
|
"learning_rate": 3.7040298717768076e-06,
|
|
"loss": 1.1674,
|
|
"mean_token_accuracy": 0.7209965407848358,
|
|
"num_tokens": 379779952.0,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"entropy": 1.221875,
|
|
"epoch": 0.5644389777231106,
|
|
"grad_norm": 0.12166521747134118,
|
|
"learning_rate": 3.7005072565872908e-06,
|
|
"loss": 1.2115,
|
|
"mean_token_accuracy": 0.7115922331809997,
|
|
"num_tokens": 380737380.0,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 0.5658056580565806,
|
|
"grad_norm": 0.12284109645925986,
|
|
"learning_rate": 3.696984641397774e-06,
|
|
"loss": 1.2058,
|
|
"mean_token_accuracy": 0.7132474601268768,
|
|
"num_tokens": 381656151.0,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 0.5671723383900505,
|
|
"grad_norm": 0.12843929476981603,
|
|
"learning_rate": 3.693462026208257e-06,
|
|
"loss": 1.1363,
|
|
"mean_token_accuracy": 0.725660753250122,
|
|
"num_tokens": 382571650.0,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 0.5685390187235205,
|
|
"grad_norm": 0.129942502126031,
|
|
"learning_rate": 3.689939411018741e-06,
|
|
"loss": 1.1978,
|
|
"mean_token_accuracy": 0.7172651946544647,
|
|
"num_tokens": 383530667.0,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"entropy": 1.11953125,
|
|
"epoch": 0.5699056990569906,
|
|
"grad_norm": 0.12198601370681103,
|
|
"learning_rate": 3.686416795829224e-06,
|
|
"loss": 1.1105,
|
|
"mean_token_accuracy": 0.729958313703537,
|
|
"num_tokens": 384438746.0,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"entropy": 1.1625,
|
|
"epoch": 0.5712723793904606,
|
|
"grad_norm": 0.3538583651341181,
|
|
"learning_rate": 3.6828941806397072e-06,
|
|
"loss": 1.171,
|
|
"mean_token_accuracy": 0.7203901708126068,
|
|
"num_tokens": 385325961.0,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 0.5726390597239306,
|
|
"grad_norm": 0.11805613374941669,
|
|
"learning_rate": 3.6793715654501903e-06,
|
|
"loss": 1.1762,
|
|
"mean_token_accuracy": 0.7177635073661804,
|
|
"num_tokens": 386264764.0,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 0.5740057400574006,
|
|
"grad_norm": 0.13020614213659218,
|
|
"learning_rate": 3.6758489502606735e-06,
|
|
"loss": 1.1941,
|
|
"mean_token_accuracy": 0.716736352443695,
|
|
"num_tokens": 387231562.0,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 0.5753724203908706,
|
|
"grad_norm": 0.1310602125757875,
|
|
"learning_rate": 3.672326335071157e-06,
|
|
"loss": 1.1877,
|
|
"mean_token_accuracy": 0.7167354226112366,
|
|
"num_tokens": 388124752.0,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 0.5767391007243405,
|
|
"grad_norm": 0.14593020699895246,
|
|
"learning_rate": 3.6688037198816406e-06,
|
|
"loss": 1.1841,
|
|
"mean_token_accuracy": 0.718797481060028,
|
|
"num_tokens": 389034093.0,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 0.5781057810578106,
|
|
"grad_norm": 0.12912864611985472,
|
|
"learning_rate": 3.6652811046921237e-06,
|
|
"loss": 1.1671,
|
|
"mean_token_accuracy": 0.7215290069580078,
|
|
"num_tokens": 389920605.0,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"entropy": 1.15703125,
|
|
"epoch": 0.5794724613912806,
|
|
"grad_norm": 0.10697315353771132,
|
|
"learning_rate": 3.6617584895026072e-06,
|
|
"loss": 1.1581,
|
|
"mean_token_accuracy": 0.721472418308258,
|
|
"num_tokens": 390802728.0,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"entropy": 1.21171875,
|
|
"epoch": 0.5808391417247506,
|
|
"grad_norm": 0.11190284477134636,
|
|
"learning_rate": 3.6582358743130904e-06,
|
|
"loss": 1.214,
|
|
"mean_token_accuracy": 0.7080538690090179,
|
|
"num_tokens": 391735027.0,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 0.5822058220582206,
|
|
"grad_norm": 0.12653869507478807,
|
|
"learning_rate": 3.6547132591235735e-06,
|
|
"loss": 1.1927,
|
|
"mean_token_accuracy": 0.7147288143634796,
|
|
"num_tokens": 392621164.0,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 0.5835725023916906,
|
|
"grad_norm": 0.13838471208270214,
|
|
"learning_rate": 3.651190643934057e-06,
|
|
"loss": 1.2114,
|
|
"mean_token_accuracy": 0.7139334261417389,
|
|
"num_tokens": 393474315.0,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 0.5849391827251605,
|
|
"grad_norm": 0.11290297003650952,
|
|
"learning_rate": 3.6476680287445406e-06,
|
|
"loss": 1.1834,
|
|
"mean_token_accuracy": 0.7179689705371857,
|
|
"num_tokens": 394416453.0,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"entropy": 1.153125,
|
|
"epoch": 0.5863058630586306,
|
|
"grad_norm": 0.11731920676633982,
|
|
"learning_rate": 3.6441454135550237e-06,
|
|
"loss": 1.1479,
|
|
"mean_token_accuracy": 0.7256379663944245,
|
|
"num_tokens": 395353788.0,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"entropy": 1.184375,
|
|
"epoch": 0.5876725433921006,
|
|
"grad_norm": 0.11237900771515738,
|
|
"learning_rate": 3.640622798365507e-06,
|
|
"loss": 1.1919,
|
|
"mean_token_accuracy": 0.7148195803165436,
|
|
"num_tokens": 396277079.0,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"entropy": 1.2234375,
|
|
"epoch": 0.5890392237255706,
|
|
"grad_norm": 0.10991345154209077,
|
|
"learning_rate": 3.63710018317599e-06,
|
|
"loss": 1.2188,
|
|
"mean_token_accuracy": 0.7092743158340454,
|
|
"num_tokens": 397204872.0,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"entropy": 1.2421875,
|
|
"epoch": 0.5904059040590406,
|
|
"grad_norm": 0.12579025250546666,
|
|
"learning_rate": 3.633577567986473e-06,
|
|
"loss": 1.2527,
|
|
"mean_token_accuracy": 0.7055298447608948,
|
|
"num_tokens": 398106709.0,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 0.5917725843925106,
|
|
"grad_norm": 0.13144811060985137,
|
|
"learning_rate": 3.630054952796957e-06,
|
|
"loss": 1.1756,
|
|
"mean_token_accuracy": 0.7195990085601807,
|
|
"num_tokens": 399003959.0,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"entropy": 1.23671875,
|
|
"epoch": 0.5931392647259806,
|
|
"grad_norm": 0.11383258560880076,
|
|
"learning_rate": 3.62653233760744e-06,
|
|
"loss": 1.2314,
|
|
"mean_token_accuracy": 0.708185750246048,
|
|
"num_tokens": 399928261.0,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"entropy": 1.16796875,
|
|
"epoch": 0.5945059450594506,
|
|
"grad_norm": 0.13088651994886763,
|
|
"learning_rate": 3.6230097224179233e-06,
|
|
"loss": 1.165,
|
|
"mean_token_accuracy": 0.7209729313850403,
|
|
"num_tokens": 400734644.0,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 0.5958726253929206,
|
|
"grad_norm": 0.12535489344889394,
|
|
"learning_rate": 3.6194871072284064e-06,
|
|
"loss": 1.1857,
|
|
"mean_token_accuracy": 0.7179886102676392,
|
|
"num_tokens": 401677248.0,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 0.5972393057263906,
|
|
"grad_norm": 0.12096172885055309,
|
|
"learning_rate": 3.61596449203889e-06,
|
|
"loss": 1.1959,
|
|
"mean_token_accuracy": 0.7155452907085419,
|
|
"num_tokens": 402589892.0,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 0.5986059860598606,
|
|
"grad_norm": 0.11276457614827354,
|
|
"learning_rate": 3.6124418768493735e-06,
|
|
"loss": 1.1285,
|
|
"mean_token_accuracy": 0.727908056974411,
|
|
"num_tokens": 403493521.0,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 0.5999726663933306,
|
|
"grad_norm": 0.11073377206348867,
|
|
"learning_rate": 3.6089192616598566e-06,
|
|
"loss": 1.2108,
|
|
"mean_token_accuracy": 0.7134578287601471,
|
|
"num_tokens": 404481917.0,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 0.6013393467268006,
|
|
"grad_norm": 0.12110010381815331,
|
|
"learning_rate": 3.6053966464703398e-06,
|
|
"loss": 1.2112,
|
|
"mean_token_accuracy": 0.7124480366706848,
|
|
"num_tokens": 405381103.0,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 0.6027060270602707,
|
|
"grad_norm": 0.11487872121273031,
|
|
"learning_rate": 3.6018740312808233e-06,
|
|
"loss": 1.2249,
|
|
"mean_token_accuracy": 0.70823575258255,
|
|
"num_tokens": 406360551.0,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 0.6040727073937406,
|
|
"grad_norm": 0.1315131533543646,
|
|
"learning_rate": 3.5983514160913064e-06,
|
|
"loss": 1.1852,
|
|
"mean_token_accuracy": 0.7134269952774048,
|
|
"num_tokens": 407306942.0,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 0.6054393877272106,
|
|
"grad_norm": 0.10961317125141873,
|
|
"learning_rate": 3.5948288009017895e-06,
|
|
"loss": 1.1783,
|
|
"mean_token_accuracy": 0.7171334207057953,
|
|
"num_tokens": 408238455.0,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 0.6068060680606806,
|
|
"grad_norm": 0.11772826376153622,
|
|
"learning_rate": 3.591306185712273e-06,
|
|
"loss": 1.1975,
|
|
"mean_token_accuracy": 0.7145091652870178,
|
|
"num_tokens": 409152396.0,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 0.6081727483941506,
|
|
"grad_norm": 0.10904237823933996,
|
|
"learning_rate": 3.5877835705227566e-06,
|
|
"loss": 1.2152,
|
|
"mean_token_accuracy": 0.7108929097652436,
|
|
"num_tokens": 410053504.0,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 0.6095394287276206,
|
|
"grad_norm": 0.15197614860952868,
|
|
"learning_rate": 3.5842609553332398e-06,
|
|
"loss": 1.142,
|
|
"mean_token_accuracy": 0.7250160336494446,
|
|
"num_tokens": 410972109.0,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 0.6109061090610907,
|
|
"grad_norm": 0.12893200552901582,
|
|
"learning_rate": 3.580738340143723e-06,
|
|
"loss": 1.2017,
|
|
"mean_token_accuracy": 0.7129972040653229,
|
|
"num_tokens": 411889574.0,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 0.6122727893945606,
|
|
"grad_norm": 0.11850560343743193,
|
|
"learning_rate": 3.577215724954206e-06,
|
|
"loss": 1.1749,
|
|
"mean_token_accuracy": 0.7198867440223694,
|
|
"num_tokens": 412842473.0,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 0.6136394697280306,
|
|
"grad_norm": 0.10831588491510251,
|
|
"learning_rate": 3.57369310976469e-06,
|
|
"loss": 1.2221,
|
|
"mean_token_accuracy": 0.7072912275791168,
|
|
"num_tokens": 413720887.0,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"entropy": 1.209375,
|
|
"epoch": 0.6150061500615006,
|
|
"grad_norm": 0.11077016707544395,
|
|
"learning_rate": 3.570170494575173e-06,
|
|
"loss": 1.2094,
|
|
"mean_token_accuracy": 0.7166453123092651,
|
|
"num_tokens": 414663481.0,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"entropy": 1.25234375,
|
|
"epoch": 0.6163728303949706,
|
|
"grad_norm": 0.11449338377922355,
|
|
"learning_rate": 3.5666478793856562e-06,
|
|
"loss": 1.2727,
|
|
"mean_token_accuracy": 0.7011445522308349,
|
|
"num_tokens": 415566128.0,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 0.6177395107284406,
|
|
"grad_norm": 0.1252664073675538,
|
|
"learning_rate": 3.5631252641961394e-06,
|
|
"loss": 1.1681,
|
|
"mean_token_accuracy": 0.7201820850372315,
|
|
"num_tokens": 416455167.0,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"entropy": 1.11171875,
|
|
"epoch": 0.6191061910619107,
|
|
"grad_norm": 0.1221147937119642,
|
|
"learning_rate": 3.5596026490066225e-06,
|
|
"loss": 1.1246,
|
|
"mean_token_accuracy": 0.7267882704734803,
|
|
"num_tokens": 417403231.0,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 0.6204728713953807,
|
|
"grad_norm": 0.1507001143818919,
|
|
"learning_rate": 3.556080033817106e-06,
|
|
"loss": 1.1766,
|
|
"mean_token_accuracy": 0.7198359429836273,
|
|
"num_tokens": 418291523.0,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.6218395517288506,
|
|
"grad_norm": 0.126938403140394,
|
|
"learning_rate": 3.5525574186275896e-06,
|
|
"loss": 1.2176,
|
|
"mean_token_accuracy": 0.7106315612792968,
|
|
"num_tokens": 419177494.0,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"entropy": 1.20859375,
|
|
"epoch": 0.6232062320623206,
|
|
"grad_norm": 0.11678850782085441,
|
|
"learning_rate": 3.5490348034380727e-06,
|
|
"loss": 1.2252,
|
|
"mean_token_accuracy": 0.7094271242618561,
|
|
"num_tokens": 420118197.0,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 0.6245729123957906,
|
|
"grad_norm": 0.11232969423422187,
|
|
"learning_rate": 3.545512188248556e-06,
|
|
"loss": 1.2001,
|
|
"mean_token_accuracy": 0.712465476989746,
|
|
"num_tokens": 421017126.0,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 0.6259395927292606,
|
|
"grad_norm": 0.12335534796380125,
|
|
"learning_rate": 3.5419895730590394e-06,
|
|
"loss": 1.1625,
|
|
"mean_token_accuracy": 0.7195082783699036,
|
|
"num_tokens": 421902299.0,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"entropy": 1.22265625,
|
|
"epoch": 0.6273062730627307,
|
|
"grad_norm": 0.12986930314923173,
|
|
"learning_rate": 3.5384669578695225e-06,
|
|
"loss": 1.226,
|
|
"mean_token_accuracy": 0.7102423429489135,
|
|
"num_tokens": 422810394.0,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 0.6286729533962007,
|
|
"grad_norm": 0.11914512505695363,
|
|
"learning_rate": 3.534944342680006e-06,
|
|
"loss": 1.2089,
|
|
"mean_token_accuracy": 0.7133949041366577,
|
|
"num_tokens": 423762943.0,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 0.6300396337296706,
|
|
"grad_norm": 0.11194490242716217,
|
|
"learning_rate": 3.5314217274904896e-06,
|
|
"loss": 1.1747,
|
|
"mean_token_accuracy": 0.7183404803276062,
|
|
"num_tokens": 424688079.0,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 0.6314063140631406,
|
|
"grad_norm": 0.14954484870312104,
|
|
"learning_rate": 3.5278991123009727e-06,
|
|
"loss": 1.1947,
|
|
"mean_token_accuracy": 0.7162772119045258,
|
|
"num_tokens": 425593816.0,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 0.6327729943966106,
|
|
"grad_norm": 0.13001302197106174,
|
|
"learning_rate": 3.524376497111456e-06,
|
|
"loss": 1.2063,
|
|
"mean_token_accuracy": 0.7120014131069183,
|
|
"num_tokens": 426535659.0,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 0.6341396747300806,
|
|
"grad_norm": 0.12761532725045271,
|
|
"learning_rate": 3.520853881921939e-06,
|
|
"loss": 1.1857,
|
|
"mean_token_accuracy": 0.7166577994823455,
|
|
"num_tokens": 427448468.0,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 0.6355063550635507,
|
|
"grad_norm": 0.11424728969800861,
|
|
"learning_rate": 3.517331266732422e-06,
|
|
"loss": 1.1939,
|
|
"mean_token_accuracy": 0.716823935508728,
|
|
"num_tokens": 428404261.0,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"entropy": 1.2390625,
|
|
"epoch": 0.6368730353970207,
|
|
"grad_norm": 0.11977479056906075,
|
|
"learning_rate": 3.513808651542906e-06,
|
|
"loss": 1.2481,
|
|
"mean_token_accuracy": 0.7076241910457611,
|
|
"num_tokens": 429314923.0,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 0.6382397157304907,
|
|
"grad_norm": 0.1316774979048583,
|
|
"learning_rate": 3.510286036353389e-06,
|
|
"loss": 1.1691,
|
|
"mean_token_accuracy": 0.7197317957878113,
|
|
"num_tokens": 430177549.0,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 0.6396063960639606,
|
|
"grad_norm": 0.10959064759874232,
|
|
"learning_rate": 3.5067634211638723e-06,
|
|
"loss": 1.1878,
|
|
"mean_token_accuracy": 0.7164951682090759,
|
|
"num_tokens": 431135284.0,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"entropy": 1.16484375,
|
|
"epoch": 0.6409730763974306,
|
|
"grad_norm": 0.1133012171855591,
|
|
"learning_rate": 3.5032408059743554e-06,
|
|
"loss": 1.1755,
|
|
"mean_token_accuracy": 0.7190956771373749,
|
|
"num_tokens": 432065066.0,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 0.6423397567309006,
|
|
"grad_norm": 0.12593530486483887,
|
|
"learning_rate": 3.499718190784839e-06,
|
|
"loss": 1.1778,
|
|
"mean_token_accuracy": 0.7173510551452636,
|
|
"num_tokens": 432943463.0,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 0.6437064370643707,
|
|
"grad_norm": 0.11959051920880195,
|
|
"learning_rate": 3.4961955755953225e-06,
|
|
"loss": 1.1844,
|
|
"mean_token_accuracy": 0.7161161959171295,
|
|
"num_tokens": 433869594.0,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 0.6450731173978407,
|
|
"grad_norm": 0.11829816704224903,
|
|
"learning_rate": 3.4926729604058056e-06,
|
|
"loss": 1.2056,
|
|
"mean_token_accuracy": 0.7127971589565277,
|
|
"num_tokens": 434779798.0,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 0.6464397977313107,
|
|
"grad_norm": 0.1271301111565912,
|
|
"learning_rate": 3.4891503452162888e-06,
|
|
"loss": 1.1571,
|
|
"mean_token_accuracy": 0.72311772108078,
|
|
"num_tokens": 435652803.0,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 0.6478064780647806,
|
|
"grad_norm": 0.11553688802959688,
|
|
"learning_rate": 3.4856277300267723e-06,
|
|
"loss": 1.1487,
|
|
"mean_token_accuracy": 0.7232375741004944,
|
|
"num_tokens": 436545226.0,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 0.6491731583982506,
|
|
"grad_norm": 0.12275853471449225,
|
|
"learning_rate": 3.4821051148372554e-06,
|
|
"loss": 1.1817,
|
|
"mean_token_accuracy": 0.7170550644397735,
|
|
"num_tokens": 437438317.0,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"entropy": 1.1234375,
|
|
"epoch": 0.6505398387317206,
|
|
"grad_norm": 0.13786393956296775,
|
|
"learning_rate": 3.4785824996477386e-06,
|
|
"loss": 1.1121,
|
|
"mean_token_accuracy": 0.7304640471935272,
|
|
"num_tokens": 438316880.0,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"entropy": 1.108203125,
|
|
"epoch": 0.6519065190651907,
|
|
"grad_norm": 0.11720058943835181,
|
|
"learning_rate": 3.475059884458222e-06,
|
|
"loss": 1.1194,
|
|
"mean_token_accuracy": 0.7286982297897339,
|
|
"num_tokens": 439224474.0,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 0.6532731993986607,
|
|
"grad_norm": 0.11587702630988944,
|
|
"learning_rate": 3.4715372692687057e-06,
|
|
"loss": 1.156,
|
|
"mean_token_accuracy": 0.7219819188117981,
|
|
"num_tokens": 440100963.0,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 0.6546398797321307,
|
|
"grad_norm": 0.11865786779017762,
|
|
"learning_rate": 3.4680146540791888e-06,
|
|
"loss": 1.2131,
|
|
"mean_token_accuracy": 0.7116888284683227,
|
|
"num_tokens": 441037518.0,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 0.6560065600656007,
|
|
"grad_norm": 0.1274921088143658,
|
|
"learning_rate": 3.464492038889672e-06,
|
|
"loss": 1.1752,
|
|
"mean_token_accuracy": 0.719227546453476,
|
|
"num_tokens": 441908052.0,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 0.6573732403990706,
|
|
"grad_norm": 0.12052369946642744,
|
|
"learning_rate": 3.460969423700155e-06,
|
|
"loss": 1.1895,
|
|
"mean_token_accuracy": 0.7142337381839752,
|
|
"num_tokens": 442877731.0,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 0.6587399207325406,
|
|
"grad_norm": 0.23620544058613,
|
|
"learning_rate": 3.457446808510639e-06,
|
|
"loss": 1.1812,
|
|
"mean_token_accuracy": 0.7184404253959655,
|
|
"num_tokens": 443815286.0,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 0.6601066010660107,
|
|
"grad_norm": 0.1086871261380614,
|
|
"learning_rate": 3.453924193321122e-06,
|
|
"loss": 1.1732,
|
|
"mean_token_accuracy": 0.7177457928657531,
|
|
"num_tokens": 444773427.0,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"entropy": 1.151953125,
|
|
"epoch": 0.6614732813994807,
|
|
"grad_norm": 0.1319099339427452,
|
|
"learning_rate": 3.4504015781316052e-06,
|
|
"loss": 1.1425,
|
|
"mean_token_accuracy": 0.7249931454658508,
|
|
"num_tokens": 445718805.0,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 0.6628399617329507,
|
|
"grad_norm": 0.11388194410152358,
|
|
"learning_rate": 3.4468789629420884e-06,
|
|
"loss": 1.2109,
|
|
"mean_token_accuracy": 0.7110455989837646,
|
|
"num_tokens": 446702383.0,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"entropy": 1.123828125,
|
|
"epoch": 0.6642066420664207,
|
|
"grad_norm": 0.12385194012593102,
|
|
"learning_rate": 3.4433563477525715e-06,
|
|
"loss": 1.1374,
|
|
"mean_token_accuracy": 0.7263393759727478,
|
|
"num_tokens": 447616023.0,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 0.6655733223998906,
|
|
"grad_norm": 0.11610190750195781,
|
|
"learning_rate": 3.439833732563055e-06,
|
|
"loss": 1.1828,
|
|
"mean_token_accuracy": 0.7176982760429382,
|
|
"num_tokens": 448525850.0,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 0.6669400027333606,
|
|
"grad_norm": 0.1234976713016392,
|
|
"learning_rate": 3.4363111173735386e-06,
|
|
"loss": 1.1754,
|
|
"mean_token_accuracy": 0.717546421289444,
|
|
"num_tokens": 449488499.0,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 0.6683066830668307,
|
|
"grad_norm": 0.11331510293862714,
|
|
"learning_rate": 3.4327885021840217e-06,
|
|
"loss": 1.1777,
|
|
"mean_token_accuracy": 0.7186765313148499,
|
|
"num_tokens": 450427887.0,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"entropy": 1.175,
|
|
"epoch": 0.6696733634003007,
|
|
"grad_norm": 0.12744375703178912,
|
|
"learning_rate": 3.429265886994505e-06,
|
|
"loss": 1.1878,
|
|
"mean_token_accuracy": 0.7169139802455902,
|
|
"num_tokens": 451388079.0,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 0.6710400437337707,
|
|
"grad_norm": 0.13493943013865908,
|
|
"learning_rate": 3.4257432718049884e-06,
|
|
"loss": 1.1639,
|
|
"mean_token_accuracy": 0.7204837620258331,
|
|
"num_tokens": 452325548.0,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"entropy": 1.1515625,
|
|
"epoch": 0.6724067240672407,
|
|
"grad_norm": 0.10978403425817124,
|
|
"learning_rate": 3.4222206566154715e-06,
|
|
"loss": 1.1738,
|
|
"mean_token_accuracy": 0.7202117681503296,
|
|
"num_tokens": 453283517.0,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 0.6737734044007107,
|
|
"grad_norm": 0.11568152150741871,
|
|
"learning_rate": 3.418698041425955e-06,
|
|
"loss": 1.1969,
|
|
"mean_token_accuracy": 0.7131158888339997,
|
|
"num_tokens": 454196973.0,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"entropy": 1.133984375,
|
|
"epoch": 0.6751400847341806,
|
|
"grad_norm": 0.11253124885200226,
|
|
"learning_rate": 3.415175426236438e-06,
|
|
"loss": 1.1344,
|
|
"mean_token_accuracy": 0.7262726187705993,
|
|
"num_tokens": 455141874.0,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"entropy": 1.09921875,
|
|
"epoch": 0.6765067650676507,
|
|
"grad_norm": 0.11152998879835962,
|
|
"learning_rate": 3.4116528110469217e-06,
|
|
"loss": 1.0989,
|
|
"mean_token_accuracy": 0.7353122949600219,
|
|
"num_tokens": 456058190.0,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 0.6778734454011207,
|
|
"grad_norm": 0.1268164181111793,
|
|
"learning_rate": 3.408130195857405e-06,
|
|
"loss": 1.1617,
|
|
"mean_token_accuracy": 0.7219428420066833,
|
|
"num_tokens": 456928962.0,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 0.6792401257345907,
|
|
"grad_norm": 0.14558157032386543,
|
|
"learning_rate": 3.404607580667888e-06,
|
|
"loss": 1.1837,
|
|
"mean_token_accuracy": 0.7179809749126435,
|
|
"num_tokens": 457856361.0,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"entropy": 1.134375,
|
|
"epoch": 0.6806068060680607,
|
|
"grad_norm": 0.12852791406755443,
|
|
"learning_rate": 3.401084965478371e-06,
|
|
"loss": 1.1402,
|
|
"mean_token_accuracy": 0.7273676931858063,
|
|
"num_tokens": 458756840.0,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 0.6819734864015307,
|
|
"grad_norm": 0.11737090214239798,
|
|
"learning_rate": 3.397562350288855e-06,
|
|
"loss": 1.19,
|
|
"mean_token_accuracy": 0.7172590494155884,
|
|
"num_tokens": 459682905.0,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 0.6833401667350006,
|
|
"grad_norm": 0.11389270967799213,
|
|
"learning_rate": 3.394039735099338e-06,
|
|
"loss": 1.1555,
|
|
"mean_token_accuracy": 0.7218644022941589,
|
|
"num_tokens": 460604101.0,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"entropy": 1.12109375,
|
|
"epoch": 0.6847068470684707,
|
|
"grad_norm": 0.11725789049433252,
|
|
"learning_rate": 3.3905171199098213e-06,
|
|
"loss": 1.1337,
|
|
"mean_token_accuracy": 0.7245361268520355,
|
|
"num_tokens": 461512991.0,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 0.6860735274019407,
|
|
"grad_norm": 0.10803892646013451,
|
|
"learning_rate": 3.3869945047203044e-06,
|
|
"loss": 1.1581,
|
|
"mean_token_accuracy": 0.7206469655036927,
|
|
"num_tokens": 462454053.0,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 0.6874402077354107,
|
|
"grad_norm": 0.11843937159894127,
|
|
"learning_rate": 3.3834718895307876e-06,
|
|
"loss": 1.1576,
|
|
"mean_token_accuracy": 0.7206592977046966,
|
|
"num_tokens": 463360778.0,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 0.6888068880688807,
|
|
"grad_norm": 0.10865512848667028,
|
|
"learning_rate": 3.3799492743412715e-06,
|
|
"loss": 1.1722,
|
|
"mean_token_accuracy": 0.7199220597743988,
|
|
"num_tokens": 464284769.0,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"entropy": 1.142578125,
|
|
"epoch": 0.6901735684023507,
|
|
"grad_norm": 0.12282195152564211,
|
|
"learning_rate": 3.3764266591517547e-06,
|
|
"loss": 1.1481,
|
|
"mean_token_accuracy": 0.7236648738384247,
|
|
"num_tokens": 465220861.0,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 0.6915402487358207,
|
|
"grad_norm": 0.12132667418238609,
|
|
"learning_rate": 3.3729040439622378e-06,
|
|
"loss": 1.1623,
|
|
"mean_token_accuracy": 0.7194007277488709,
|
|
"num_tokens": 466152597.0,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 0.6929069290692907,
|
|
"grad_norm": 0.1304327927177627,
|
|
"learning_rate": 3.3693814287727213e-06,
|
|
"loss": 1.1847,
|
|
"mean_token_accuracy": 0.7156933426856995,
|
|
"num_tokens": 467052088.0,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 0.6942736094027607,
|
|
"grad_norm": 0.1201767790861157,
|
|
"learning_rate": 3.3658588135832044e-06,
|
|
"loss": 1.2079,
|
|
"mean_token_accuracy": 0.714240676164627,
|
|
"num_tokens": 468034958.0,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 0.6956402897362307,
|
|
"grad_norm": 0.11580548533392808,
|
|
"learning_rate": 3.3623361983936876e-06,
|
|
"loss": 1.1576,
|
|
"mean_token_accuracy": 0.7239193975925445,
|
|
"num_tokens": 469021348.0,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 0.6970069700697007,
|
|
"grad_norm": 0.1129527418818433,
|
|
"learning_rate": 3.358813583204171e-06,
|
|
"loss": 1.1847,
|
|
"mean_token_accuracy": 0.715942257642746,
|
|
"num_tokens": 469996550.0,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 0.6983736504031707,
|
|
"grad_norm": 0.12946128538829293,
|
|
"learning_rate": 3.3552909680146547e-06,
|
|
"loss": 1.2366,
|
|
"mean_token_accuracy": 0.7078143179416656,
|
|
"num_tokens": 470920985.0,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"entropy": 1.1296875,
|
|
"epoch": 0.6997403307366407,
|
|
"grad_norm": 0.11754399475277835,
|
|
"learning_rate": 3.3517683528251378e-06,
|
|
"loss": 1.1381,
|
|
"mean_token_accuracy": 0.7253030836582184,
|
|
"num_tokens": 471845765.0,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 0.7011070110701108,
|
|
"grad_norm": 0.12011917738194014,
|
|
"learning_rate": 3.348245737635621e-06,
|
|
"loss": 1.1386,
|
|
"mean_token_accuracy": 0.7253535091876984,
|
|
"num_tokens": 472791415.0,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 0.7024736914035807,
|
|
"grad_norm": 0.10762018094223616,
|
|
"learning_rate": 3.344723122446104e-06,
|
|
"loss": 1.1717,
|
|
"mean_token_accuracy": 0.7177989840507507,
|
|
"num_tokens": 473685911.0,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 0.7038403717370507,
|
|
"grad_norm": 0.11739060664643343,
|
|
"learning_rate": 3.341200507256587e-06,
|
|
"loss": 1.1994,
|
|
"mean_token_accuracy": 0.7161009728908538,
|
|
"num_tokens": 474598701.0,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"entropy": 1.209375,
|
|
"epoch": 0.7052070520705207,
|
|
"grad_norm": 0.12539044474577143,
|
|
"learning_rate": 3.337677892067071e-06,
|
|
"loss": 1.2063,
|
|
"mean_token_accuracy": 0.714670842885971,
|
|
"num_tokens": 475505552.0,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"entropy": 1.18984375,
|
|
"epoch": 0.7065737324039907,
|
|
"grad_norm": 0.1342516614411594,
|
|
"learning_rate": 3.3341552768775543e-06,
|
|
"loss": 1.1999,
|
|
"mean_token_accuracy": 0.7114529550075531,
|
|
"num_tokens": 476440433.0,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 0.7079404127374607,
|
|
"grad_norm": 0.12657138069678986,
|
|
"learning_rate": 3.3306326616880374e-06,
|
|
"loss": 1.1895,
|
|
"mean_token_accuracy": 0.7154150068759918,
|
|
"num_tokens": 477364926.0,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 0.7093070930709308,
|
|
"grad_norm": 0.11475598372860284,
|
|
"learning_rate": 3.3271100464985205e-06,
|
|
"loss": 1.2024,
|
|
"mean_token_accuracy": 0.714757114648819,
|
|
"num_tokens": 478303729.0,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 0.7106737734044007,
|
|
"grad_norm": 0.12148337357086215,
|
|
"learning_rate": 3.323587431309004e-06,
|
|
"loss": 1.1602,
|
|
"mean_token_accuracy": 0.7240491032600402,
|
|
"num_tokens": 479265206.0,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 0.7120404537378707,
|
|
"grad_norm": 0.12251551145414877,
|
|
"learning_rate": 3.3200648161194876e-06,
|
|
"loss": 1.1935,
|
|
"mean_token_accuracy": 0.714001727104187,
|
|
"num_tokens": 480196185.0,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 0.7134071340713407,
|
|
"grad_norm": 0.11515384484429181,
|
|
"learning_rate": 3.3165422009299707e-06,
|
|
"loss": 1.1738,
|
|
"mean_token_accuracy": 0.7193644046783447,
|
|
"num_tokens": 481119022.0,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 0.7147738144048107,
|
|
"grad_norm": 0.11862047941186664,
|
|
"learning_rate": 3.313019585740454e-06,
|
|
"loss": 1.1913,
|
|
"mean_token_accuracy": 0.7159637153148651,
|
|
"num_tokens": 482034528.0,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 0.7161404947382807,
|
|
"grad_norm": 0.13097605553696035,
|
|
"learning_rate": 3.3094969705509374e-06,
|
|
"loss": 1.2143,
|
|
"mean_token_accuracy": 0.7104229867458344,
|
|
"num_tokens": 482963072.0,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 0.7175071750717508,
|
|
"grad_norm": 0.12074689042864989,
|
|
"learning_rate": 3.3059743553614205e-06,
|
|
"loss": 1.1761,
|
|
"mean_token_accuracy": 0.7190313518047333,
|
|
"num_tokens": 483869466.0,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 0.7188738554052208,
|
|
"grad_norm": 0.11897721626140363,
|
|
"learning_rate": 3.3024517401719036e-06,
|
|
"loss": 1.143,
|
|
"mean_token_accuracy": 0.7222785472869873,
|
|
"num_tokens": 484772219.0,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 0.7202405357386907,
|
|
"grad_norm": 0.12454238424032706,
|
|
"learning_rate": 3.298929124982387e-06,
|
|
"loss": 1.1936,
|
|
"mean_token_accuracy": 0.7199256300926209,
|
|
"num_tokens": 485705049.0,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 0.7216072160721607,
|
|
"grad_norm": 0.12481649255531777,
|
|
"learning_rate": 3.2954065097928707e-06,
|
|
"loss": 1.2181,
|
|
"mean_token_accuracy": 0.7124844133853913,
|
|
"num_tokens": 486647525.0,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 0.7229738964056307,
|
|
"grad_norm": 0.11348577957896035,
|
|
"learning_rate": 3.291883894603354e-06,
|
|
"loss": 1.1881,
|
|
"mean_token_accuracy": 0.7183633089065552,
|
|
"num_tokens": 487571730.0,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"entropy": 1.12578125,
|
|
"epoch": 0.7243405767391007,
|
|
"grad_norm": 0.1418920454374185,
|
|
"learning_rate": 3.288361279413837e-06,
|
|
"loss": 1.1266,
|
|
"mean_token_accuracy": 0.7254550218582153,
|
|
"num_tokens": 488486162.0,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 0.7257072570725708,
|
|
"grad_norm": 0.11489265154709079,
|
|
"learning_rate": 3.28483866422432e-06,
|
|
"loss": 1.1378,
|
|
"mean_token_accuracy": 0.7268625140190125,
|
|
"num_tokens": 489397994.0,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 0.7270739374060408,
|
|
"grad_norm": 0.12678914925288343,
|
|
"learning_rate": 3.281316049034804e-06,
|
|
"loss": 1.2018,
|
|
"mean_token_accuracy": 0.7110317647457123,
|
|
"num_tokens": 490378487.0,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 0.7284406177395107,
|
|
"grad_norm": 0.12242946556787464,
|
|
"learning_rate": 3.277793433845287e-06,
|
|
"loss": 1.1583,
|
|
"mean_token_accuracy": 0.7177072405815125,
|
|
"num_tokens": 491314657.0,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 0.7298072980729807,
|
|
"grad_norm": 0.11480929529512028,
|
|
"learning_rate": 3.2742708186557703e-06,
|
|
"loss": 1.1629,
|
|
"mean_token_accuracy": 0.723511028289795,
|
|
"num_tokens": 492253180.0,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 0.7311739784064507,
|
|
"grad_norm": 0.12601063355975212,
|
|
"learning_rate": 3.2707482034662534e-06,
|
|
"loss": 1.1785,
|
|
"mean_token_accuracy": 0.7161814987659454,
|
|
"num_tokens": 493185858.0,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"entropy": 1.166015625,
|
|
"epoch": 0.7325406587399207,
|
|
"grad_norm": 0.11889751262683779,
|
|
"learning_rate": 3.2672255882767366e-06,
|
|
"loss": 1.1621,
|
|
"mean_token_accuracy": 0.7235997080802917,
|
|
"num_tokens": 494110299.0,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"entropy": 1.1453125,
|
|
"epoch": 0.7339073390733908,
|
|
"grad_norm": 0.10592888105192573,
|
|
"learning_rate": 3.26370297308722e-06,
|
|
"loss": 1.1509,
|
|
"mean_token_accuracy": 0.723431122303009,
|
|
"num_tokens": 495063936.0,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 0.7352740194068608,
|
|
"grad_norm": 0.12186922354177555,
|
|
"learning_rate": 3.2601803578977037e-06,
|
|
"loss": 1.1764,
|
|
"mean_token_accuracy": 0.7168774545192719,
|
|
"num_tokens": 495982933.0,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"entropy": 1.127734375,
|
|
"epoch": 0.7366406997403308,
|
|
"grad_norm": 0.13993550039706193,
|
|
"learning_rate": 3.2566577427081868e-06,
|
|
"loss": 1.1355,
|
|
"mean_token_accuracy": 0.7255950093269348,
|
|
"num_tokens": 496933673.0,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 0.7380073800738007,
|
|
"grad_norm": 0.12289108315068929,
|
|
"learning_rate": 3.25313512751867e-06,
|
|
"loss": 1.1796,
|
|
"mean_token_accuracy": 0.7177545487880707,
|
|
"num_tokens": 497833208.0,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 0.7393740604072707,
|
|
"grad_norm": 0.12588089196410332,
|
|
"learning_rate": 3.2496125123291535e-06,
|
|
"loss": 1.2203,
|
|
"mean_token_accuracy": 0.7102026283740998,
|
|
"num_tokens": 498759050.0,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 0.7407407407407407,
|
|
"grad_norm": 0.12699068478376344,
|
|
"learning_rate": 3.2460898971396366e-06,
|
|
"loss": 1.1491,
|
|
"mean_token_accuracy": 0.7257092118263244,
|
|
"num_tokens": 499683263.0,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 0.7421074210742108,
|
|
"grad_norm": 0.11622733464437315,
|
|
"learning_rate": 3.24256728195012e-06,
|
|
"loss": 1.1829,
|
|
"mean_token_accuracy": 0.7172724425792694,
|
|
"num_tokens": 500587418.0,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 0.7434741014076808,
|
|
"grad_norm": 0.11435091361081975,
|
|
"learning_rate": 3.2390446667606037e-06,
|
|
"loss": 1.1688,
|
|
"mean_token_accuracy": 0.7195350706577301,
|
|
"num_tokens": 501538593.0,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 0.7448407817411508,
|
|
"grad_norm": 0.11892482488803734,
|
|
"learning_rate": 3.235522051571087e-06,
|
|
"loss": 1.1657,
|
|
"mean_token_accuracy": 0.7215801417827606,
|
|
"num_tokens": 502458567.0,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 0.7462074620746207,
|
|
"grad_norm": 0.11457447969457348,
|
|
"learning_rate": 3.23199943638157e-06,
|
|
"loss": 1.197,
|
|
"mean_token_accuracy": 0.7151006758213043,
|
|
"num_tokens": 503343942.0,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 0.7475741424080907,
|
|
"grad_norm": 0.11590670617764837,
|
|
"learning_rate": 3.228476821192053e-06,
|
|
"loss": 1.2167,
|
|
"mean_token_accuracy": 0.7115792334079742,
|
|
"num_tokens": 504273657.0,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 0.7489408227415607,
|
|
"grad_norm": 0.1283282186241754,
|
|
"learning_rate": 3.224954206002536e-06,
|
|
"loss": 1.1972,
|
|
"mean_token_accuracy": 0.7140901446342468,
|
|
"num_tokens": 505223492.0,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 0.7503075030750308,
|
|
"grad_norm": 0.11917969852437499,
|
|
"learning_rate": 3.22143159081302e-06,
|
|
"loss": 1.1429,
|
|
"mean_token_accuracy": 0.7254290640354156,
|
|
"num_tokens": 506149685.0,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 0.7516741834085008,
|
|
"grad_norm": 0.14312726255818503,
|
|
"learning_rate": 3.2179089756235033e-06,
|
|
"loss": 1.199,
|
|
"mean_token_accuracy": 0.7127479195594788,
|
|
"num_tokens": 507071941.0,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 0.7530408637419708,
|
|
"grad_norm": 0.11328668839695345,
|
|
"learning_rate": 3.2143863604339864e-06,
|
|
"loss": 1.2065,
|
|
"mean_token_accuracy": 0.7128283143043518,
|
|
"num_tokens": 508013836.0,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 0.7544075440754408,
|
|
"grad_norm": 0.127626249309043,
|
|
"learning_rate": 3.2108637452444695e-06,
|
|
"loss": 1.1956,
|
|
"mean_token_accuracy": 0.7164035558700561,
|
|
"num_tokens": 508928276.0,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 0.7557742244089107,
|
|
"grad_norm": 0.11471960900120945,
|
|
"learning_rate": 3.2073411300549526e-06,
|
|
"loss": 1.2177,
|
|
"mean_token_accuracy": 0.7127901136875152,
|
|
"num_tokens": 509829624.0,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"entropy": 1.125390625,
|
|
"epoch": 0.7571409047423807,
|
|
"grad_norm": 0.11807747196549914,
|
|
"learning_rate": 3.2038185148654366e-06,
|
|
"loss": 1.1266,
|
|
"mean_token_accuracy": 0.7278662741184234,
|
|
"num_tokens": 510741617.0,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 0.7585075850758508,
|
|
"grad_norm": 0.15656194612660632,
|
|
"learning_rate": 3.2002958996759197e-06,
|
|
"loss": 1.2048,
|
|
"mean_token_accuracy": 0.7128302335739136,
|
|
"num_tokens": 511633875.0,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 0.7598742654093208,
|
|
"grad_norm": 0.116989520697055,
|
|
"learning_rate": 3.196773284486403e-06,
|
|
"loss": 1.1944,
|
|
"mean_token_accuracy": 0.7176302611827851,
|
|
"num_tokens": 512550694.0,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"entropy": 1.19375,
|
|
"epoch": 0.7612409457427908,
|
|
"grad_norm": 0.12464074557578506,
|
|
"learning_rate": 3.1932506692968864e-06,
|
|
"loss": 1.199,
|
|
"mean_token_accuracy": 0.7135713517665863,
|
|
"num_tokens": 513493021.0,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 0.7626076260762608,
|
|
"grad_norm": 0.11893450970939654,
|
|
"learning_rate": 3.1897280541073695e-06,
|
|
"loss": 1.1548,
|
|
"mean_token_accuracy": 0.7219637572765351,
|
|
"num_tokens": 514438780.0,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 0.7639743064097307,
|
|
"grad_norm": 0.11035196521154642,
|
|
"learning_rate": 3.1862054389178526e-06,
|
|
"loss": 1.1753,
|
|
"mean_token_accuracy": 0.7191912531852722,
|
|
"num_tokens": 515416366.0,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 0.7653409867432007,
|
|
"grad_norm": 0.12902148864452434,
|
|
"learning_rate": 3.182682823728336e-06,
|
|
"loss": 1.1506,
|
|
"mean_token_accuracy": 0.720986521244049,
|
|
"num_tokens": 516348299.0,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 0.7667076670766708,
|
|
"grad_norm": 0.11555541532363192,
|
|
"learning_rate": 3.1791602085388197e-06,
|
|
"loss": 1.1467,
|
|
"mean_token_accuracy": 0.724240529537201,
|
|
"num_tokens": 517284732.0,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 0.7680743474101408,
|
|
"grad_norm": 0.15370487995553359,
|
|
"learning_rate": 3.175637593349303e-06,
|
|
"loss": 1.2316,
|
|
"mean_token_accuracy": 0.7098627805709838,
|
|
"num_tokens": 518184206.0,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 0.7694410277436108,
|
|
"grad_norm": 0.11453412013326471,
|
|
"learning_rate": 3.172114978159786e-06,
|
|
"loss": 1.2038,
|
|
"mean_token_accuracy": 0.713918673992157,
|
|
"num_tokens": 519133164.0,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 0.7708077080770808,
|
|
"grad_norm": 0.11642723133543813,
|
|
"learning_rate": 3.168592362970269e-06,
|
|
"loss": 1.1832,
|
|
"mean_token_accuracy": 0.7192805349826813,
|
|
"num_tokens": 520018070.0,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"entropy": 1.14375,
|
|
"epoch": 0.7721743884105507,
|
|
"grad_norm": 0.11747727653386844,
|
|
"learning_rate": 3.165069747780753e-06,
|
|
"loss": 1.1498,
|
|
"mean_token_accuracy": 0.7241020739078522,
|
|
"num_tokens": 520910258.0,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 0.7735410687440207,
|
|
"grad_norm": 0.11504223834683512,
|
|
"learning_rate": 3.161547132591236e-06,
|
|
"loss": 1.2063,
|
|
"mean_token_accuracy": 0.713119512796402,
|
|
"num_tokens": 521858095.0,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"entropy": 1.18203125,
|
|
"epoch": 0.7749077490774908,
|
|
"grad_norm": 0.1129799665833953,
|
|
"learning_rate": 3.1580245174017193e-06,
|
|
"loss": 1.1982,
|
|
"mean_token_accuracy": 0.7144485771656036,
|
|
"num_tokens": 522796118.0,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"entropy": 1.183984375,
|
|
"epoch": 0.7762744294109608,
|
|
"grad_norm": 0.12738510181294915,
|
|
"learning_rate": 3.1545019022122025e-06,
|
|
"loss": 1.1824,
|
|
"mean_token_accuracy": 0.7174862086772918,
|
|
"num_tokens": 523717529.0,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 0.7776411097444308,
|
|
"grad_norm": 0.11812337887840917,
|
|
"learning_rate": 3.1509792870226856e-06,
|
|
"loss": 1.2211,
|
|
"mean_token_accuracy": 0.709436559677124,
|
|
"num_tokens": 524649560.0,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 0.7790077900779008,
|
|
"grad_norm": 0.1407127429547104,
|
|
"learning_rate": 3.147456671833169e-06,
|
|
"loss": 1.1533,
|
|
"mean_token_accuracy": 0.7231734871864319,
|
|
"num_tokens": 525561680.0,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"entropy": 1.166796875,
|
|
"epoch": 0.7803744704113708,
|
|
"grad_norm": 0.11506331606665975,
|
|
"learning_rate": 3.1439340566436527e-06,
|
|
"loss": 1.1601,
|
|
"mean_token_accuracy": 0.72093967795372,
|
|
"num_tokens": 526487767.0,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 0.7817411507448407,
|
|
"grad_norm": 0.11648160817149189,
|
|
"learning_rate": 3.140411441454136e-06,
|
|
"loss": 1.1875,
|
|
"mean_token_accuracy": 0.7174256622791291,
|
|
"num_tokens": 527395760.0,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 0.7831078310783108,
|
|
"grad_norm": 0.11576831677610766,
|
|
"learning_rate": 3.136888826264619e-06,
|
|
"loss": 1.1796,
|
|
"mean_token_accuracy": 0.7161907076835632,
|
|
"num_tokens": 528315987.0,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 0.7844745114117808,
|
|
"grad_norm": 0.12328056023760331,
|
|
"learning_rate": 3.1333662110751025e-06,
|
|
"loss": 1.1517,
|
|
"mean_token_accuracy": 0.7225177168846131,
|
|
"num_tokens": 529222893.0,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"entropy": 1.109765625,
|
|
"epoch": 0.7858411917452508,
|
|
"grad_norm": 0.13323120299298458,
|
|
"learning_rate": 3.1298435958855856e-06,
|
|
"loss": 1.1089,
|
|
"mean_token_accuracy": 0.7313652694225311,
|
|
"num_tokens": 530101850.0,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"entropy": 1.18203125,
|
|
"epoch": 0.7872078720787208,
|
|
"grad_norm": 0.11394762929337816,
|
|
"learning_rate": 3.126320980696069e-06,
|
|
"loss": 1.1886,
|
|
"mean_token_accuracy": 0.716292405128479,
|
|
"num_tokens": 531000655.0,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 0.7885745524121908,
|
|
"grad_norm": 0.12580266174538096,
|
|
"learning_rate": 3.1227983655065523e-06,
|
|
"loss": 1.18,
|
|
"mean_token_accuracy": 0.7170616149902344,
|
|
"num_tokens": 531903053.0,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"entropy": 1.12421875,
|
|
"epoch": 0.7899412327456607,
|
|
"grad_norm": 0.11550904521405149,
|
|
"learning_rate": 3.119275750317036e-06,
|
|
"loss": 1.117,
|
|
"mean_token_accuracy": 0.7284311830997467,
|
|
"num_tokens": 532817171.0,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"entropy": 1.21875,
|
|
"epoch": 0.7913079130791308,
|
|
"grad_norm": 0.11969659670027208,
|
|
"learning_rate": 3.115753135127519e-06,
|
|
"loss": 1.2244,
|
|
"mean_token_accuracy": 0.7079818844795227,
|
|
"num_tokens": 533748500.0,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"entropy": 1.162109375,
|
|
"epoch": 0.7926745934126008,
|
|
"grad_norm": 0.1284644061675522,
|
|
"learning_rate": 3.112230519938002e-06,
|
|
"loss": 1.1611,
|
|
"mean_token_accuracy": 0.721414715051651,
|
|
"num_tokens": 534701836.0,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 0.7940412737460708,
|
|
"grad_norm": 0.12779354056880582,
|
|
"learning_rate": 3.108707904748485e-06,
|
|
"loss": 1.1473,
|
|
"mean_token_accuracy": 0.7246361494064331,
|
|
"num_tokens": 535560186.0,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"entropy": 1.184375,
|
|
"epoch": 0.7954079540795408,
|
|
"grad_norm": 0.11408866310750994,
|
|
"learning_rate": 3.105185289558969e-06,
|
|
"loss": 1.2068,
|
|
"mean_token_accuracy": 0.7144944548606873,
|
|
"num_tokens": 536522995.0,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 0.7967746344130108,
|
|
"grad_norm": 0.11982889601978144,
|
|
"learning_rate": 3.1016626743694523e-06,
|
|
"loss": 1.1485,
|
|
"mean_token_accuracy": 0.7222751438617706,
|
|
"num_tokens": 537446008.0,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"entropy": 1.120703125,
|
|
"epoch": 0.7981413147464808,
|
|
"grad_norm": 0.11632009513650654,
|
|
"learning_rate": 3.0981400591799354e-06,
|
|
"loss": 1.1201,
|
|
"mean_token_accuracy": 0.7278498947620392,
|
|
"num_tokens": 538421022.0,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"entropy": 1.1375,
|
|
"epoch": 0.7995079950799509,
|
|
"grad_norm": 0.11210288755211707,
|
|
"learning_rate": 3.0946174439904185e-06,
|
|
"loss": 1.1404,
|
|
"mean_token_accuracy": 0.7265640556812286,
|
|
"num_tokens": 539358105.0,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 0.8008746754134208,
|
|
"grad_norm": 0.10937259396787694,
|
|
"learning_rate": 3.0910948288009016e-06,
|
|
"loss": 1.1422,
|
|
"mean_token_accuracy": 0.7255623996257782,
|
|
"num_tokens": 540247054.0,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 0.8022413557468908,
|
|
"grad_norm": 0.12071863673762089,
|
|
"learning_rate": 3.0875722136113856e-06,
|
|
"loss": 1.1573,
|
|
"mean_token_accuracy": 0.7205578327178955,
|
|
"num_tokens": 541163430.0,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 0.8036080360803608,
|
|
"grad_norm": 0.12274943973794165,
|
|
"learning_rate": 3.0840495984218687e-06,
|
|
"loss": 1.1865,
|
|
"mean_token_accuracy": 0.7175587713718414,
|
|
"num_tokens": 542107927.0,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"entropy": 1.18515625,
|
|
"epoch": 0.8049747164138308,
|
|
"grad_norm": 0.11738855785422235,
|
|
"learning_rate": 3.080526983232352e-06,
|
|
"loss": 1.1906,
|
|
"mean_token_accuracy": 0.7144558131694794,
|
|
"num_tokens": 543019413.0,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"entropy": 1.175,
|
|
"epoch": 0.8063413967473008,
|
|
"grad_norm": 0.11165942553979372,
|
|
"learning_rate": 3.0770043680428354e-06,
|
|
"loss": 1.1761,
|
|
"mean_token_accuracy": 0.7174758553504944,
|
|
"num_tokens": 543947382.0,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 0.8077080770807709,
|
|
"grad_norm": 0.12449745995197,
|
|
"learning_rate": 3.0734817528533185e-06,
|
|
"loss": 1.1538,
|
|
"mean_token_accuracy": 0.7240786015987396,
|
|
"num_tokens": 544847561.0,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 0.8090747574142408,
|
|
"grad_norm": 0.13514481658184518,
|
|
"learning_rate": 3.0699591376638017e-06,
|
|
"loss": 1.1956,
|
|
"mean_token_accuracy": 0.7151591181755066,
|
|
"num_tokens": 545757015.0,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 0.8104414377477108,
|
|
"grad_norm": 0.11873766429528913,
|
|
"learning_rate": 3.066436522474285e-06,
|
|
"loss": 1.1394,
|
|
"mean_token_accuracy": 0.7282706379890442,
|
|
"num_tokens": 546673248.0,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"entropy": 1.1125,
|
|
"epoch": 0.8118081180811808,
|
|
"grad_norm": 0.11667132774269338,
|
|
"learning_rate": 3.0629139072847688e-06,
|
|
"loss": 1.1213,
|
|
"mean_token_accuracy": 0.7269328534603119,
|
|
"num_tokens": 547651025.0,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 0.8131747984146508,
|
|
"grad_norm": 0.12604455292642977,
|
|
"learning_rate": 3.059391292095252e-06,
|
|
"loss": 1.2005,
|
|
"mean_token_accuracy": 0.7132715225219727,
|
|
"num_tokens": 548553494.0,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 0.8145414787481208,
|
|
"grad_norm": 0.11534644130653307,
|
|
"learning_rate": 3.055868676905735e-06,
|
|
"loss": 1.1783,
|
|
"mean_token_accuracy": 0.7195155143737793,
|
|
"num_tokens": 549523321.0,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"entropy": 1.18515625,
|
|
"epoch": 0.8159081590815909,
|
|
"grad_norm": 0.13630356645362676,
|
|
"learning_rate": 3.052346061716218e-06,
|
|
"loss": 1.1973,
|
|
"mean_token_accuracy": 0.7172083735466004,
|
|
"num_tokens": 550382497.0,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 0.8172748394150608,
|
|
"grad_norm": 0.13552260197121446,
|
|
"learning_rate": 3.048823446526702e-06,
|
|
"loss": 1.173,
|
|
"mean_token_accuracy": 0.719155776500702,
|
|
"num_tokens": 551315192.0,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 0.8186415197485308,
|
|
"grad_norm": 0.1273219104640017,
|
|
"learning_rate": 3.0453008313371852e-06,
|
|
"loss": 1.2046,
|
|
"mean_token_accuracy": 0.7140669107437134,
|
|
"num_tokens": 552283477.0,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 0.8200082000820008,
|
|
"grad_norm": 0.1192701672721223,
|
|
"learning_rate": 3.0417782161476683e-06,
|
|
"loss": 1.1663,
|
|
"mean_token_accuracy": 0.7196597039699555,
|
|
"num_tokens": 553194884.0,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"entropy": 1.21640625,
|
|
"epoch": 0.8213748804154708,
|
|
"grad_norm": 0.12631775772517187,
|
|
"learning_rate": 3.0382556009581515e-06,
|
|
"loss": 1.2331,
|
|
"mean_token_accuracy": 0.7084260582923889,
|
|
"num_tokens": 554107536.0,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 0.8227415607489408,
|
|
"grad_norm": 0.11233353129182348,
|
|
"learning_rate": 3.0347329857686346e-06,
|
|
"loss": 1.1308,
|
|
"mean_token_accuracy": 0.725510448217392,
|
|
"num_tokens": 554994790.0,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 0.8241082410824109,
|
|
"grad_norm": 0.12534829582838353,
|
|
"learning_rate": 3.031210370579118e-06,
|
|
"loss": 1.1491,
|
|
"mean_token_accuracy": 0.7267814755439759,
|
|
"num_tokens": 555917889.0,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 0.8254749214158809,
|
|
"grad_norm": 0.11782994030455107,
|
|
"learning_rate": 3.0276877553896017e-06,
|
|
"loss": 1.155,
|
|
"mean_token_accuracy": 0.7200382292270661,
|
|
"num_tokens": 556842526.0,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 0.8268416017493508,
|
|
"grad_norm": 0.12349570901603803,
|
|
"learning_rate": 3.024165140200085e-06,
|
|
"loss": 1.1626,
|
|
"mean_token_accuracy": 0.7218066692352295,
|
|
"num_tokens": 557755256.0,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 0.8282082820828208,
|
|
"grad_norm": 0.1291458025590982,
|
|
"learning_rate": 3.020642525010568e-06,
|
|
"loss": 1.1859,
|
|
"mean_token_accuracy": 0.7183383524417877,
|
|
"num_tokens": 558660408.0,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"entropy": 1.14140625,
|
|
"epoch": 0.8295749624162908,
|
|
"grad_norm": 0.13407731760972275,
|
|
"learning_rate": 3.0171199098210515e-06,
|
|
"loss": 1.138,
|
|
"mean_token_accuracy": 0.7258215248584747,
|
|
"num_tokens": 559553626.0,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 0.8309416427497608,
|
|
"grad_norm": 0.1232245360870136,
|
|
"learning_rate": 3.0135972946315346e-06,
|
|
"loss": 1.1699,
|
|
"mean_token_accuracy": 0.7214470326900482,
|
|
"num_tokens": 560480649.0,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 0.8323083230832309,
|
|
"grad_norm": 0.11327993560048322,
|
|
"learning_rate": 3.010074679442018e-06,
|
|
"loss": 1.1766,
|
|
"mean_token_accuracy": 0.7191742181777954,
|
|
"num_tokens": 561376221.0,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 0.8336750034167009,
|
|
"grad_norm": 0.12311064758286218,
|
|
"learning_rate": 3.0065520642525013e-06,
|
|
"loss": 1.1828,
|
|
"mean_token_accuracy": 0.7187857389450073,
|
|
"num_tokens": 562274400.0,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 0.8350416837501708,
|
|
"grad_norm": 0.12433139800658237,
|
|
"learning_rate": 3.003029449062985e-06,
|
|
"loss": 1.1914,
|
|
"mean_token_accuracy": 0.7153880834579468,
|
|
"num_tokens": 563215626.0,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"entropy": 1.175390625,
|
|
"epoch": 0.8364083640836408,
|
|
"grad_norm": 0.12633474165191985,
|
|
"learning_rate": 2.999506833873468e-06,
|
|
"loss": 1.1738,
|
|
"mean_token_accuracy": 0.7184747278690338,
|
|
"num_tokens": 564091313.0,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 0.8377750444171108,
|
|
"grad_norm": 0.12269140459520632,
|
|
"learning_rate": 2.995984218683951e-06,
|
|
"loss": 1.1959,
|
|
"mean_token_accuracy": 0.7131108522415162,
|
|
"num_tokens": 565012295.0,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"entropy": 1.152734375,
|
|
"epoch": 0.8391417247505808,
|
|
"grad_norm": 0.11449118061268922,
|
|
"learning_rate": 2.992461603494434e-06,
|
|
"loss": 1.1562,
|
|
"mean_token_accuracy": 0.7203204095363617,
|
|
"num_tokens": 565954544.0,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 0.8405084050840509,
|
|
"grad_norm": 0.12313722563020868,
|
|
"learning_rate": 2.988938988304918e-06,
|
|
"loss": 1.1633,
|
|
"mean_token_accuracy": 0.7199534773826599,
|
|
"num_tokens": 566894690.0,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"entropy": 1.16328125,
|
|
"epoch": 0.8418750854175209,
|
|
"grad_norm": 0.11518106312866595,
|
|
"learning_rate": 2.9854163731154013e-06,
|
|
"loss": 1.1656,
|
|
"mean_token_accuracy": 0.7221100628376007,
|
|
"num_tokens": 567803082.0,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 0.8432417657509909,
|
|
"grad_norm": 0.11368464073370953,
|
|
"learning_rate": 2.9818937579258844e-06,
|
|
"loss": 1.1843,
|
|
"mean_token_accuracy": 0.715998500585556,
|
|
"num_tokens": 568676822.0,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 0.8446084460844608,
|
|
"grad_norm": 0.12187010798351484,
|
|
"learning_rate": 2.9783711427363675e-06,
|
|
"loss": 1.159,
|
|
"mean_token_accuracy": 0.7204768061637878,
|
|
"num_tokens": 569602337.0,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 0.8459751264179308,
|
|
"grad_norm": 0.12847825002632687,
|
|
"learning_rate": 2.9748485275468507e-06,
|
|
"loss": 1.1764,
|
|
"mean_token_accuracy": 0.7198773205280304,
|
|
"num_tokens": 570513919.0,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"entropy": 1.15078125,
|
|
"epoch": 0.8473418067514008,
|
|
"grad_norm": 0.12134056427460836,
|
|
"learning_rate": 2.9713259123573346e-06,
|
|
"loss": 1.1657,
|
|
"mean_token_accuracy": 0.7212791860103607,
|
|
"num_tokens": 571394215.0,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 0.8487084870848709,
|
|
"grad_norm": 0.11551164007179213,
|
|
"learning_rate": 2.9678032971678177e-06,
|
|
"loss": 1.1544,
|
|
"mean_token_accuracy": 0.7213574886322022,
|
|
"num_tokens": 572265499.0,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 0.8500751674183409,
|
|
"grad_norm": 0.12214799898520447,
|
|
"learning_rate": 2.964280681978301e-06,
|
|
"loss": 1.1736,
|
|
"mean_token_accuracy": 0.7162809312343598,
|
|
"num_tokens": 573215815.0,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 0.8514418477518109,
|
|
"grad_norm": 0.20149636509600294,
|
|
"learning_rate": 2.960758066788784e-06,
|
|
"loss": 1.1387,
|
|
"mean_token_accuracy": 0.7238964438438416,
|
|
"num_tokens": 574095844.0,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"entropy": 1.19375,
|
|
"epoch": 0.8528085280852808,
|
|
"grad_norm": 0.11719439300308682,
|
|
"learning_rate": 2.9572354515992675e-06,
|
|
"loss": 1.1913,
|
|
"mean_token_accuracy": 0.7139512062072754,
|
|
"num_tokens": 574990843.0,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"entropy": 1.09296875,
|
|
"epoch": 0.8541752084187508,
|
|
"grad_norm": 0.11557376832467169,
|
|
"learning_rate": 2.9537128364097507e-06,
|
|
"loss": 1.0881,
|
|
"mean_token_accuracy": 0.7382570981979371,
|
|
"num_tokens": 575911166.0,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 0.8555418887522208,
|
|
"grad_norm": 0.10781442408382858,
|
|
"learning_rate": 2.9501902212202342e-06,
|
|
"loss": 1.1743,
|
|
"mean_token_accuracy": 0.7194125413894653,
|
|
"num_tokens": 576865158.0,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"entropy": 1.1625,
|
|
"epoch": 0.8569085690856909,
|
|
"grad_norm": 0.1180814716546871,
|
|
"learning_rate": 2.9466676060307178e-06,
|
|
"loss": 1.1754,
|
|
"mean_token_accuracy": 0.7174225568771362,
|
|
"num_tokens": 577798743.0,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 0.8582752494191609,
|
|
"grad_norm": 0.11697925948366841,
|
|
"learning_rate": 2.943144990841201e-06,
|
|
"loss": 1.19,
|
|
"mean_token_accuracy": 0.7169835269451141,
|
|
"num_tokens": 578736942.0,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 0.8596419297526309,
|
|
"grad_norm": 0.12261826082221244,
|
|
"learning_rate": 2.939622375651684e-06,
|
|
"loss": 1.1643,
|
|
"mean_token_accuracy": 0.7222884953022003,
|
|
"num_tokens": 579657758.0,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 0.8610086100861009,
|
|
"grad_norm": 0.12332839943502046,
|
|
"learning_rate": 2.936099760462167e-06,
|
|
"loss": 1.1811,
|
|
"mean_token_accuracy": 0.7173564255237579,
|
|
"num_tokens": 580624116.0,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"entropy": 1.141796875,
|
|
"epoch": 0.8623752904195708,
|
|
"grad_norm": 0.12162076679118004,
|
|
"learning_rate": 2.932577145272651e-06,
|
|
"loss": 1.1475,
|
|
"mean_token_accuracy": 0.7244245409965515,
|
|
"num_tokens": 581517703.0,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 0.8637419707530408,
|
|
"grad_norm": 0.10895595400775065,
|
|
"learning_rate": 2.9290545300831342e-06,
|
|
"loss": 1.1886,
|
|
"mean_token_accuracy": 0.7168744266033172,
|
|
"num_tokens": 582458745.0,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"entropy": 1.18984375,
|
|
"epoch": 0.8651086510865109,
|
|
"grad_norm": 0.12153924221282783,
|
|
"learning_rate": 2.9255319148936174e-06,
|
|
"loss": 1.1878,
|
|
"mean_token_accuracy": 0.7183955132961273,
|
|
"num_tokens": 583377378.0,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"entropy": 1.10625,
|
|
"epoch": 0.8664753314199809,
|
|
"grad_norm": 0.14113933998644035,
|
|
"learning_rate": 2.9220092997041005e-06,
|
|
"loss": 1.1004,
|
|
"mean_token_accuracy": 0.735378873348236,
|
|
"num_tokens": 584312704.0,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 0.8678420117534509,
|
|
"grad_norm": 0.12332162570001302,
|
|
"learning_rate": 2.9184866845145836e-06,
|
|
"loss": 1.1575,
|
|
"mean_token_accuracy": 0.720214706659317,
|
|
"num_tokens": 585234665.0,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 0.8692086920869209,
|
|
"grad_norm": 0.12799412631720847,
|
|
"learning_rate": 2.9149640693250667e-06,
|
|
"loss": 1.1591,
|
|
"mean_token_accuracy": 0.7213240206241608,
|
|
"num_tokens": 586185228.0,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 0.8705753724203908,
|
|
"grad_norm": 0.2445253929619339,
|
|
"learning_rate": 2.9114414541355507e-06,
|
|
"loss": 1.1774,
|
|
"mean_token_accuracy": 0.7181272804737091,
|
|
"num_tokens": 587089018.0,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 0.8719420527538608,
|
|
"grad_norm": 0.1251362568153301,
|
|
"learning_rate": 2.907918838946034e-06,
|
|
"loss": 1.1796,
|
|
"mean_token_accuracy": 0.7203266561031342,
|
|
"num_tokens": 588017673.0,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 0.8733087330873309,
|
|
"grad_norm": 0.11959384661200811,
|
|
"learning_rate": 2.904396223756517e-06,
|
|
"loss": 1.1536,
|
|
"mean_token_accuracy": 0.7241129159927369,
|
|
"num_tokens": 588935725.0,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 0.8746754134208009,
|
|
"grad_norm": 0.1190310935265598,
|
|
"learning_rate": 2.9008736085670005e-06,
|
|
"loss": 1.1816,
|
|
"mean_token_accuracy": 0.7178591132164002,
|
|
"num_tokens": 589851938.0,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"entropy": 1.11796875,
|
|
"epoch": 0.8760420937542709,
|
|
"grad_norm": 0.12787471673694215,
|
|
"learning_rate": 2.8973509933774836e-06,
|
|
"loss": 1.1137,
|
|
"mean_token_accuracy": 0.7299280047416687,
|
|
"num_tokens": 590750641.0,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 0.8774087740877409,
|
|
"grad_norm": 0.11787835580495003,
|
|
"learning_rate": 2.893828378187967e-06,
|
|
"loss": 1.1987,
|
|
"mean_token_accuracy": 0.7141609787940979,
|
|
"num_tokens": 591691593.0,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"entropy": 1.1375,
|
|
"epoch": 0.8787754544212109,
|
|
"grad_norm": 0.12373571920036577,
|
|
"learning_rate": 2.8903057629984503e-06,
|
|
"loss": 1.1368,
|
|
"mean_token_accuracy": 0.7258001804351807,
|
|
"num_tokens": 592586272.0,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 0.8801421347546808,
|
|
"grad_norm": 0.13582735735573798,
|
|
"learning_rate": 2.886783147808934e-06,
|
|
"loss": 1.139,
|
|
"mean_token_accuracy": 0.7276371955871582,
|
|
"num_tokens": 593523306.0,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 0.8815088150881509,
|
|
"grad_norm": 0.11431608001810951,
|
|
"learning_rate": 2.883260532619417e-06,
|
|
"loss": 1.2074,
|
|
"mean_token_accuracy": 0.7111124992370605,
|
|
"num_tokens": 594414758.0,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"entropy": 1.15546875,
|
|
"epoch": 0.8828754954216209,
|
|
"grad_norm": 0.13332254256425344,
|
|
"learning_rate": 2.8797379174299e-06,
|
|
"loss": 1.1438,
|
|
"mean_token_accuracy": 0.7247833251953125,
|
|
"num_tokens": 595285971.0,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"entropy": 1.107421875,
|
|
"epoch": 0.8842421757550909,
|
|
"grad_norm": 0.1357058722912347,
|
|
"learning_rate": 2.876215302240383e-06,
|
|
"loss": 1.112,
|
|
"mean_token_accuracy": 0.7336461782455445,
|
|
"num_tokens": 596196999.0,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 0.8856088560885609,
|
|
"grad_norm": 0.1291420158876675,
|
|
"learning_rate": 2.872692687050867e-06,
|
|
"loss": 1.1788,
|
|
"mean_token_accuracy": 0.7188616156578064,
|
|
"num_tokens": 597148890.0,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 0.8869755364220309,
|
|
"grad_norm": 0.13946799953157132,
|
|
"learning_rate": 2.8691700718613503e-06,
|
|
"loss": 1.163,
|
|
"mean_token_accuracy": 0.7186404347419739,
|
|
"num_tokens": 598063968.0,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"entropy": 1.14375,
|
|
"epoch": 0.8883422167555008,
|
|
"grad_norm": 0.13050091821992427,
|
|
"learning_rate": 2.8656474566718334e-06,
|
|
"loss": 1.1468,
|
|
"mean_token_accuracy": 0.7221731245517731,
|
|
"num_tokens": 599016906.0,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 0.8897088970889709,
|
|
"grad_norm": 0.12048598650975824,
|
|
"learning_rate": 2.8621248414823165e-06,
|
|
"loss": 1.1369,
|
|
"mean_token_accuracy": 0.7242832839488983,
|
|
"num_tokens": 599932323.0,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 0.8910755774224409,
|
|
"grad_norm": 0.13021882902795792,
|
|
"learning_rate": 2.8586022262927997e-06,
|
|
"loss": 1.1573,
|
|
"mean_token_accuracy": 0.7211673259735107,
|
|
"num_tokens": 600787290.0,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.8924422577559109,
|
|
"grad_norm": 0.12573993945075435,
|
|
"learning_rate": 2.8550796111032836e-06,
|
|
"loss": 1.2288,
|
|
"mean_token_accuracy": 0.707545804977417,
|
|
"num_tokens": 601718435.0,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 0.8938089380893809,
|
|
"grad_norm": 0.12208481694132042,
|
|
"learning_rate": 2.8515569959137668e-06,
|
|
"loss": 1.1902,
|
|
"mean_token_accuracy": 0.7157919824123382,
|
|
"num_tokens": 602629379.0,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"entropy": 1.14140625,
|
|
"epoch": 0.8951756184228509,
|
|
"grad_norm": 0.14080125474377195,
|
|
"learning_rate": 2.84803438072425e-06,
|
|
"loss": 1.1488,
|
|
"mean_token_accuracy": 0.7254204392433167,
|
|
"num_tokens": 603541464.0,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 0.8965422987563209,
|
|
"grad_norm": 0.11774969976058058,
|
|
"learning_rate": 2.844511765534733e-06,
|
|
"loss": 1.1529,
|
|
"mean_token_accuracy": 0.722474068403244,
|
|
"num_tokens": 604455855.0,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"entropy": 1.12109375,
|
|
"epoch": 0.897908979089791,
|
|
"grad_norm": 0.26517501513902514,
|
|
"learning_rate": 2.8409891503452166e-06,
|
|
"loss": 1.1169,
|
|
"mean_token_accuracy": 0.7295224130153656,
|
|
"num_tokens": 605395006.0,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 0.8992756594232609,
|
|
"grad_norm": 0.11454343524926823,
|
|
"learning_rate": 2.8374665351556997e-06,
|
|
"loss": 1.1799,
|
|
"mean_token_accuracy": 0.7165465652942657,
|
|
"num_tokens": 606286202.0,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 0.9006423397567309,
|
|
"grad_norm": 0.11749098364441603,
|
|
"learning_rate": 2.8339439199661832e-06,
|
|
"loss": 1.1593,
|
|
"mean_token_accuracy": 0.7243418276309967,
|
|
"num_tokens": 607248617.0,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"entropy": 1.164453125,
|
|
"epoch": 0.9020090200902009,
|
|
"grad_norm": 0.11917365335814006,
|
|
"learning_rate": 2.8304213047766663e-06,
|
|
"loss": 1.1796,
|
|
"mean_token_accuracy": 0.7183767139911652,
|
|
"num_tokens": 608137958.0,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"entropy": 1.112890625,
|
|
"epoch": 0.9033757004236709,
|
|
"grad_norm": 0.14664849367221128,
|
|
"learning_rate": 2.82689868958715e-06,
|
|
"loss": 1.1166,
|
|
"mean_token_accuracy": 0.7303303778171539,
|
|
"num_tokens": 609065549.0,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"entropy": 1.158984375,
|
|
"epoch": 0.9047423807571409,
|
|
"grad_norm": 0.12992720376728267,
|
|
"learning_rate": 2.823376074397633e-06,
|
|
"loss": 1.1579,
|
|
"mean_token_accuracy": 0.7197949707508087,
|
|
"num_tokens": 609996904.0,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 0.906109061090611,
|
|
"grad_norm": 0.12252081707467097,
|
|
"learning_rate": 2.819853459208116e-06,
|
|
"loss": 1.1485,
|
|
"mean_token_accuracy": 0.724325317144394,
|
|
"num_tokens": 610881045.0,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 0.9074757414240809,
|
|
"grad_norm": 0.119392159288792,
|
|
"learning_rate": 2.8163308440186e-06,
|
|
"loss": 1.2132,
|
|
"mean_token_accuracy": 0.7112180888652802,
|
|
"num_tokens": 611806746.0,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 0.9088424217575509,
|
|
"grad_norm": 0.1272130124389931,
|
|
"learning_rate": 2.8128082288290832e-06,
|
|
"loss": 1.1625,
|
|
"mean_token_accuracy": 0.721429032087326,
|
|
"num_tokens": 612709434.0,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 0.9102091020910209,
|
|
"grad_norm": 0.12263069026485535,
|
|
"learning_rate": 2.8092856136395664e-06,
|
|
"loss": 1.1746,
|
|
"mean_token_accuracy": 0.7187263071537018,
|
|
"num_tokens": 613639390.0,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 0.9115757824244909,
|
|
"grad_norm": 0.1312735782515691,
|
|
"learning_rate": 2.8057629984500495e-06,
|
|
"loss": 1.1759,
|
|
"mean_token_accuracy": 0.7159229040145874,
|
|
"num_tokens": 614605380.0,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 0.9129424627579609,
|
|
"grad_norm": 0.28391994922553243,
|
|
"learning_rate": 2.8022403832605326e-06,
|
|
"loss": 1.1364,
|
|
"mean_token_accuracy": 0.7266815066337585,
|
|
"num_tokens": 615539760.0,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 0.914309143091431,
|
|
"grad_norm": 0.11447150468137332,
|
|
"learning_rate": 2.7987177680710157e-06,
|
|
"loss": 1.171,
|
|
"mean_token_accuracy": 0.7215101242065429,
|
|
"num_tokens": 616477576.0,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 0.915675823424901,
|
|
"grad_norm": 0.16967736237957343,
|
|
"learning_rate": 2.7951951528814997e-06,
|
|
"loss": 1.1818,
|
|
"mean_token_accuracy": 0.7168243646621704,
|
|
"num_tokens": 617421890.0,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"entropy": 1.166796875,
|
|
"epoch": 0.9170425037583709,
|
|
"grad_norm": 0.11712122647331154,
|
|
"learning_rate": 2.791672537691983e-06,
|
|
"loss": 1.1867,
|
|
"mean_token_accuracy": 0.7141905307769776,
|
|
"num_tokens": 618393590.0,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 0.9184091840918409,
|
|
"grad_norm": 0.12244786500842877,
|
|
"learning_rate": 2.788149922502466e-06,
|
|
"loss": 1.1598,
|
|
"mean_token_accuracy": 0.7233612775802613,
|
|
"num_tokens": 619296207.0,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 0.9197758644253109,
|
|
"grad_norm": 0.1345913935316019,
|
|
"learning_rate": 2.784627307312949e-06,
|
|
"loss": 1.1535,
|
|
"mean_token_accuracy": 0.7249669969081879,
|
|
"num_tokens": 620235237.0,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 0.9211425447587809,
|
|
"grad_norm": 0.12547422507617398,
|
|
"learning_rate": 2.7811046921234326e-06,
|
|
"loss": 1.1282,
|
|
"mean_token_accuracy": 0.726342898607254,
|
|
"num_tokens": 621157947.0,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 0.922509225092251,
|
|
"grad_norm": 0.12720609446148878,
|
|
"learning_rate": 2.777582076933916e-06,
|
|
"loss": 1.1985,
|
|
"mean_token_accuracy": 0.7162838995456695,
|
|
"num_tokens": 622061617.0,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 0.923875905425721,
|
|
"grad_norm": 0.10864548442066144,
|
|
"learning_rate": 2.7740594617443993e-06,
|
|
"loss": 1.1523,
|
|
"mean_token_accuracy": 0.7226825892925263,
|
|
"num_tokens": 623006404.0,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 0.9252425857591909,
|
|
"grad_norm": 0.11980232756037422,
|
|
"learning_rate": 2.770536846554883e-06,
|
|
"loss": 1.1416,
|
|
"mean_token_accuracy": 0.726541417837143,
|
|
"num_tokens": 623971191.0,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 0.9266092660926609,
|
|
"grad_norm": 0.1283093663564779,
|
|
"learning_rate": 2.767014231365366e-06,
|
|
"loss": 1.2048,
|
|
"mean_token_accuracy": 0.7148972153663635,
|
|
"num_tokens": 624883123.0,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 0.9279759464261309,
|
|
"grad_norm": 0.10898174977870088,
|
|
"learning_rate": 2.763491616175849e-06,
|
|
"loss": 1.1758,
|
|
"mean_token_accuracy": 0.7196938455104828,
|
|
"num_tokens": 625850233.0,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"entropy": 1.140234375,
|
|
"epoch": 0.9293426267596009,
|
|
"grad_norm": 0.12118132190437914,
|
|
"learning_rate": 2.759969000986332e-06,
|
|
"loss": 1.1397,
|
|
"mean_token_accuracy": 0.7245150208473206,
|
|
"num_tokens": 626789700.0,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"entropy": 1.103125,
|
|
"epoch": 0.930709307093071,
|
|
"grad_norm": 0.11880564691151452,
|
|
"learning_rate": 2.756446385796816e-06,
|
|
"loss": 1.1095,
|
|
"mean_token_accuracy": 0.7303105294704437,
|
|
"num_tokens": 627697328.0,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"entropy": 1.137109375,
|
|
"epoch": 0.932075987426541,
|
|
"grad_norm": 0.11640654013370651,
|
|
"learning_rate": 2.7529237706072993e-06,
|
|
"loss": 1.1319,
|
|
"mean_token_accuracy": 0.7258359074592591,
|
|
"num_tokens": 628630914.0,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 0.933442667760011,
|
|
"grad_norm": 0.1177486913968686,
|
|
"learning_rate": 2.7494011554177824e-06,
|
|
"loss": 1.1342,
|
|
"mean_token_accuracy": 0.7272465527057648,
|
|
"num_tokens": 629547186.0,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 0.9348093480934809,
|
|
"grad_norm": 0.13205206985813056,
|
|
"learning_rate": 2.7458785402282656e-06,
|
|
"loss": 1.1865,
|
|
"mean_token_accuracy": 0.7183882415294647,
|
|
"num_tokens": 630457312.0,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 0.9361760284269509,
|
|
"grad_norm": 0.12000184766378959,
|
|
"learning_rate": 2.7423559250387487e-06,
|
|
"loss": 1.1914,
|
|
"mean_token_accuracy": 0.71746866106987,
|
|
"num_tokens": 631376333.0,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"entropy": 1.153515625,
|
|
"epoch": 0.9375427087604209,
|
|
"grad_norm": 0.12814234585571174,
|
|
"learning_rate": 2.7388333098492326e-06,
|
|
"loss": 1.139,
|
|
"mean_token_accuracy": 0.7257308900356293,
|
|
"num_tokens": 632240772.0,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 0.938909389093891,
|
|
"grad_norm": 0.12141282914387394,
|
|
"learning_rate": 2.7353106946597158e-06,
|
|
"loss": 1.1813,
|
|
"mean_token_accuracy": 0.7165574312210083,
|
|
"num_tokens": 633212586.0,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 0.940276069427361,
|
|
"grad_norm": 0.1262977015669128,
|
|
"learning_rate": 2.731788079470199e-06,
|
|
"loss": 1.2096,
|
|
"mean_token_accuracy": 0.7116837620735168,
|
|
"num_tokens": 634093379.0,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 0.941642749760831,
|
|
"grad_norm": 0.12714940554159315,
|
|
"learning_rate": 2.728265464280682e-06,
|
|
"loss": 1.1893,
|
|
"mean_token_accuracy": 0.7167741358280182,
|
|
"num_tokens": 634995635.0,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 0.9430094300943009,
|
|
"grad_norm": 0.1161911808881228,
|
|
"learning_rate": 2.7247428490911656e-06,
|
|
"loss": 1.1709,
|
|
"mean_token_accuracy": 0.7208384156227112,
|
|
"num_tokens": 635882762.0,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 0.9443761104277709,
|
|
"grad_norm": 0.12687401371322832,
|
|
"learning_rate": 2.7212202339016487e-06,
|
|
"loss": 1.1852,
|
|
"mean_token_accuracy": 0.718610143661499,
|
|
"num_tokens": 636815111.0,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 0.9457427907612409,
|
|
"grad_norm": 0.10638230488608676,
|
|
"learning_rate": 2.7176976187121322e-06,
|
|
"loss": 1.1541,
|
|
"mean_token_accuracy": 0.7228697061538696,
|
|
"num_tokens": 637800464.0,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 0.947109471094711,
|
|
"grad_norm": 0.11049632824423805,
|
|
"learning_rate": 2.7141750035226154e-06,
|
|
"loss": 1.1895,
|
|
"mean_token_accuracy": 0.7156046688556671,
|
|
"num_tokens": 638762414.0,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 0.948476151428181,
|
|
"grad_norm": 0.1198628174927651,
|
|
"learning_rate": 2.710652388333099e-06,
|
|
"loss": 1.1756,
|
|
"mean_token_accuracy": 0.7198344230651855,
|
|
"num_tokens": 639708115.0,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 0.949842831761651,
|
|
"grad_norm": 0.11812256539126996,
|
|
"learning_rate": 2.707129773143582e-06,
|
|
"loss": 1.1929,
|
|
"mean_token_accuracy": 0.714892053604126,
|
|
"num_tokens": 640669487.0,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"entropy": 1.135546875,
|
|
"epoch": 0.951209512095121,
|
|
"grad_norm": 0.1343554369747125,
|
|
"learning_rate": 2.703607157954065e-06,
|
|
"loss": 1.1395,
|
|
"mean_token_accuracy": 0.7256253242492676,
|
|
"num_tokens": 641609525.0,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 0.9525761924285909,
|
|
"grad_norm": 0.12921139169226972,
|
|
"learning_rate": 2.7000845427645483e-06,
|
|
"loss": 1.1931,
|
|
"mean_token_accuracy": 0.7150632262229919,
|
|
"num_tokens": 642492862.0,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"entropy": 1.12109375,
|
|
"epoch": 0.9539428727620609,
|
|
"grad_norm": 0.12296977281972703,
|
|
"learning_rate": 2.6965619275750322e-06,
|
|
"loss": 1.131,
|
|
"mean_token_accuracy": 0.7267428755760192,
|
|
"num_tokens": 643397187.0,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"entropy": 1.2234375,
|
|
"epoch": 0.955309553095531,
|
|
"grad_norm": 0.12772909524989703,
|
|
"learning_rate": 2.6930393123855154e-06,
|
|
"loss": 1.2375,
|
|
"mean_token_accuracy": 0.7084857642650604,
|
|
"num_tokens": 644337919.0,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 0.956676233429001,
|
|
"grad_norm": 0.1285725546156045,
|
|
"learning_rate": 2.6895166971959985e-06,
|
|
"loss": 1.2188,
|
|
"mean_token_accuracy": 0.7109667003154755,
|
|
"num_tokens": 645236378.0,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"entropy": 1.1265625,
|
|
"epoch": 0.958042913762471,
|
|
"grad_norm": 0.12118566031821959,
|
|
"learning_rate": 2.6859940820064816e-06,
|
|
"loss": 1.1273,
|
|
"mean_token_accuracy": 0.7286602079868316,
|
|
"num_tokens": 646140990.0,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 0.959409594095941,
|
|
"grad_norm": 0.13386237307447119,
|
|
"learning_rate": 2.6824714668169647e-06,
|
|
"loss": 1.1838,
|
|
"mean_token_accuracy": 0.7183421611785888,
|
|
"num_tokens": 647048468.0,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 0.9607762744294109,
|
|
"grad_norm": 0.11977701719931391,
|
|
"learning_rate": 2.6789488516274487e-06,
|
|
"loss": 1.1767,
|
|
"mean_token_accuracy": 0.7181770741939545,
|
|
"num_tokens": 648040312.0,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 0.9621429547628809,
|
|
"grad_norm": 0.14694980368519367,
|
|
"learning_rate": 2.675426236437932e-06,
|
|
"loss": 1.1746,
|
|
"mean_token_accuracy": 0.7198334336280823,
|
|
"num_tokens": 649004460.0,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 0.963509635096351,
|
|
"grad_norm": 0.12270204418330026,
|
|
"learning_rate": 2.671903621248415e-06,
|
|
"loss": 1.1453,
|
|
"mean_token_accuracy": 0.7221453726291657,
|
|
"num_tokens": 649898524.0,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 0.964876315429821,
|
|
"grad_norm": 0.12228368877296923,
|
|
"learning_rate": 2.668381006058898e-06,
|
|
"loss": 1.1709,
|
|
"mean_token_accuracy": 0.717938232421875,
|
|
"num_tokens": 650801130.0,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 0.966242995763291,
|
|
"grad_norm": 0.11940787864421933,
|
|
"learning_rate": 2.6648583908693816e-06,
|
|
"loss": 1.1641,
|
|
"mean_token_accuracy": 0.7213849008083344,
|
|
"num_tokens": 651718796.0,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"entropy": 1.20859375,
|
|
"epoch": 0.967609676096761,
|
|
"grad_norm": 0.13428450602349773,
|
|
"learning_rate": 2.6613357756798648e-06,
|
|
"loss": 1.2232,
|
|
"mean_token_accuracy": 0.7098085701465606,
|
|
"num_tokens": 652638787.0,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 0.968976356430231,
|
|
"grad_norm": 0.12196387366187045,
|
|
"learning_rate": 2.6578131604903483e-06,
|
|
"loss": 1.1639,
|
|
"mean_token_accuracy": 0.7207956731319427,
|
|
"num_tokens": 653525666.0,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 0.9703430367637009,
|
|
"grad_norm": 0.12272332556946466,
|
|
"learning_rate": 2.6542905453008314e-06,
|
|
"loss": 1.1533,
|
|
"mean_token_accuracy": 0.7239842355251312,
|
|
"num_tokens": 654449752.0,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 0.971709717097171,
|
|
"grad_norm": 0.11960486000621946,
|
|
"learning_rate": 2.650767930111315e-06,
|
|
"loss": 1.1744,
|
|
"mean_token_accuracy": 0.7208398699760437,
|
|
"num_tokens": 655381010.0,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 0.973076397430641,
|
|
"grad_norm": 0.12124935948783186,
|
|
"learning_rate": 2.647245314921798e-06,
|
|
"loss": 1.1669,
|
|
"mean_token_accuracy": 0.7210037350654602,
|
|
"num_tokens": 656339372.0,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 0.974443077764111,
|
|
"grad_norm": 0.11839531519228028,
|
|
"learning_rate": 2.6437226997322812e-06,
|
|
"loss": 1.1423,
|
|
"mean_token_accuracy": 0.7258386969566345,
|
|
"num_tokens": 657308740.0,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 0.975809758097581,
|
|
"grad_norm": 0.11691698716041965,
|
|
"learning_rate": 2.640200084542765e-06,
|
|
"loss": 1.1442,
|
|
"mean_token_accuracy": 0.722836297750473,
|
|
"num_tokens": 658245328.0,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 0.977176438431051,
|
|
"grad_norm": 0.11746471660373609,
|
|
"learning_rate": 2.6366774693532483e-06,
|
|
"loss": 1.1647,
|
|
"mean_token_accuracy": 0.7216321408748627,
|
|
"num_tokens": 659099116.0,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 0.9785431187645209,
|
|
"grad_norm": 0.1370600948758328,
|
|
"learning_rate": 2.6331548541637314e-06,
|
|
"loss": 1.1972,
|
|
"mean_token_accuracy": 0.716209989786148,
|
|
"num_tokens": 660019179.0,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"entropy": 1.13828125,
|
|
"epoch": 0.979909799097991,
|
|
"grad_norm": 0.1174910256725026,
|
|
"learning_rate": 2.6296322389742146e-06,
|
|
"loss": 1.1428,
|
|
"mean_token_accuracy": 0.7252731919288635,
|
|
"num_tokens": 660951954.0,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"entropy": 1.11875,
|
|
"epoch": 0.981276479431461,
|
|
"grad_norm": 0.11043970506217086,
|
|
"learning_rate": 2.6261096237846977e-06,
|
|
"loss": 1.1256,
|
|
"mean_token_accuracy": 0.7280966460704803,
|
|
"num_tokens": 661862699.0,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 0.982643159764931,
|
|
"grad_norm": 0.12931076168087016,
|
|
"learning_rate": 2.622587008595181e-06,
|
|
"loss": 1.1431,
|
|
"mean_token_accuracy": 0.7254398763179779,
|
|
"num_tokens": 662762440.0,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 0.984009840098401,
|
|
"grad_norm": 0.13606554993697517,
|
|
"learning_rate": 2.6190643934056648e-06,
|
|
"loss": 1.1824,
|
|
"mean_token_accuracy": 0.7157516181468964,
|
|
"num_tokens": 663695752.0,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 0.985376520431871,
|
|
"grad_norm": 0.15590926558585677,
|
|
"learning_rate": 2.615541778216148e-06,
|
|
"loss": 1.2134,
|
|
"mean_token_accuracy": 0.7129352390766144,
|
|
"num_tokens": 664595316.0,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 0.9867432007653409,
|
|
"grad_norm": 0.11651251706760118,
|
|
"learning_rate": 2.612019163026631e-06,
|
|
"loss": 1.1457,
|
|
"mean_token_accuracy": 0.724714207649231,
|
|
"num_tokens": 665499511.0,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"entropy": 1.21875,
|
|
"epoch": 0.988109881098811,
|
|
"grad_norm": 0.1256526379568187,
|
|
"learning_rate": 2.6084965478371146e-06,
|
|
"loss": 1.2299,
|
|
"mean_token_accuracy": 0.7100003242492676,
|
|
"num_tokens": 666424892.0,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"entropy": 1.12421875,
|
|
"epoch": 0.989476561432281,
|
|
"grad_norm": 0.11293974253486956,
|
|
"learning_rate": 2.6049739326475977e-06,
|
|
"loss": 1.1418,
|
|
"mean_token_accuracy": 0.7250280618667603,
|
|
"num_tokens": 667349028.0,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 0.990843241765751,
|
|
"grad_norm": 0.1471397778928959,
|
|
"learning_rate": 2.6014513174580812e-06,
|
|
"loss": 1.1639,
|
|
"mean_token_accuracy": 0.7214419901371002,
|
|
"num_tokens": 668242767.0,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 0.992209922099221,
|
|
"grad_norm": 0.11928712672840018,
|
|
"learning_rate": 2.5979287022685644e-06,
|
|
"loss": 1.1562,
|
|
"mean_token_accuracy": 0.7233744919300079,
|
|
"num_tokens": 669135549.0,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"entropy": 1.12578125,
|
|
"epoch": 0.993576602432691,
|
|
"grad_norm": 0.1147936337889417,
|
|
"learning_rate": 2.594406087079048e-06,
|
|
"loss": 1.1285,
|
|
"mean_token_accuracy": 0.7266645431518555,
|
|
"num_tokens": 670042351.0,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 0.994943282766161,
|
|
"grad_norm": 0.1146801039335181,
|
|
"learning_rate": 2.590883471889531e-06,
|
|
"loss": 1.203,
|
|
"mean_token_accuracy": 0.713228851556778,
|
|
"num_tokens": 670950924.0,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 0.996309963099631,
|
|
"grad_norm": 0.12856709391394897,
|
|
"learning_rate": 2.587360856700014e-06,
|
|
"loss": 1.1445,
|
|
"mean_token_accuracy": 0.7237795650959015,
|
|
"num_tokens": 671855775.0,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 0.997676643433101,
|
|
"grad_norm": 0.12027782674569014,
|
|
"learning_rate": 2.5838382415104973e-06,
|
|
"loss": 1.1512,
|
|
"mean_token_accuracy": 0.7220204532146454,
|
|
"num_tokens": 672778864.0,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 0.999043323766571,
|
|
"grad_norm": 0.11591062755743743,
|
|
"learning_rate": 2.5803156263209813e-06,
|
|
"loss": 1.2207,
|
|
"mean_token_accuracy": 0.7083764493465423,
|
|
"num_tokens": 673675531.0,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 1.000410004100041,
|
|
"grad_norm": 0.15567890016610408,
|
|
"learning_rate": 2.5767930111314644e-06,
|
|
"loss": 1.204,
|
|
"mean_token_accuracy": 0.7132563889026642,
|
|
"num_tokens": 674636327.0,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 1.001776684433511,
|
|
"grad_norm": 0.1233399501328328,
|
|
"learning_rate": 2.5732703959419475e-06,
|
|
"loss": 1.1565,
|
|
"mean_token_accuracy": 0.7221963047981262,
|
|
"num_tokens": 675525894.0,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"entropy": 1.144140625,
|
|
"epoch": 1.003143364766981,
|
|
"grad_norm": 0.1095390976015993,
|
|
"learning_rate": 2.5697477807524306e-06,
|
|
"loss": 1.1542,
|
|
"mean_token_accuracy": 0.7201478660106659,
|
|
"num_tokens": 676510301.0,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"entropy": 1.1109375,
|
|
"epoch": 1.004510045100451,
|
|
"grad_norm": 0.1430694023842203,
|
|
"learning_rate": 2.5662251655629138e-06,
|
|
"loss": 1.1078,
|
|
"mean_token_accuracy": 0.7310290515422821,
|
|
"num_tokens": 677432141.0,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 1.005876725433921,
|
|
"grad_norm": 0.12598338743868326,
|
|
"learning_rate": 2.5627025503733977e-06,
|
|
"loss": 1.1941,
|
|
"mean_token_accuracy": 0.7132766842842102,
|
|
"num_tokens": 678360785.0,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 1.007243405767391,
|
|
"grad_norm": 0.11137269944439651,
|
|
"learning_rate": 2.559179935183881e-06,
|
|
"loss": 1.1902,
|
|
"mean_token_accuracy": 0.716802567243576,
|
|
"num_tokens": 679264490.0,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 1.0086100861008611,
|
|
"grad_norm": 0.12631954857640554,
|
|
"learning_rate": 2.555657319994364e-06,
|
|
"loss": 1.1761,
|
|
"mean_token_accuracy": 0.7197421967983246,
|
|
"num_tokens": 680184449.0,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 1.009976766434331,
|
|
"grad_norm": 0.11727890716699979,
|
|
"learning_rate": 2.552134704804847e-06,
|
|
"loss": 1.1483,
|
|
"mean_token_accuracy": 0.7215545773506165,
|
|
"num_tokens": 681146324.0,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"entropy": 1.21015625,
|
|
"epoch": 1.011343446767801,
|
|
"grad_norm": 0.12703278209383462,
|
|
"learning_rate": 2.5486120896153306e-06,
|
|
"loss": 1.2273,
|
|
"mean_token_accuracy": 0.7076104998588562,
|
|
"num_tokens": 682048165.0,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 1.012710127101271,
|
|
"grad_norm": 0.13818767594590423,
|
|
"learning_rate": 2.5450894744258138e-06,
|
|
"loss": 1.1526,
|
|
"mean_token_accuracy": 0.7242256939411164,
|
|
"num_tokens": 682971836.0,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"entropy": 1.11640625,
|
|
"epoch": 1.014076807434741,
|
|
"grad_norm": 0.1273438325719574,
|
|
"learning_rate": 2.5415668592362973e-06,
|
|
"loss": 1.113,
|
|
"mean_token_accuracy": 0.7319232821464539,
|
|
"num_tokens": 683851000.0,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"entropy": 1.13828125,
|
|
"epoch": 1.015443487768211,
|
|
"grad_norm": 0.13021832131294142,
|
|
"learning_rate": 2.5380442440467804e-06,
|
|
"loss": 1.1427,
|
|
"mean_token_accuracy": 0.7275414347648621,
|
|
"num_tokens": 684791401.0,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 1.016810168101681,
|
|
"grad_norm": 0.1304741343547311,
|
|
"learning_rate": 2.534521628857264e-06,
|
|
"loss": 1.2139,
|
|
"mean_token_accuracy": 0.7103776097297668,
|
|
"num_tokens": 685705685.0,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.018176848435151,
|
|
"grad_norm": 0.11094650513042074,
|
|
"learning_rate": 2.530999013667747e-06,
|
|
"loss": 1.1627,
|
|
"mean_token_accuracy": 0.7216002523899079,
|
|
"num_tokens": 686605303.0,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"entropy": 1.25,
|
|
"epoch": 1.019543528768621,
|
|
"grad_norm": 0.1260665931198243,
|
|
"learning_rate": 2.5274763984782302e-06,
|
|
"loss": 1.2464,
|
|
"mean_token_accuracy": 0.7072399079799652,
|
|
"num_tokens": 687526237.0,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"entropy": 1.11015625,
|
|
"epoch": 1.020910209102091,
|
|
"grad_norm": 0.11039403061267686,
|
|
"learning_rate": 2.5239537832887138e-06,
|
|
"loss": 1.1211,
|
|
"mean_token_accuracy": 0.7294821441173553,
|
|
"num_tokens": 688486449.0,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 1.022276889435561,
|
|
"grad_norm": 0.13579160736552615,
|
|
"learning_rate": 2.5204311680991973e-06,
|
|
"loss": 1.1623,
|
|
"mean_token_accuracy": 0.7187879085540771,
|
|
"num_tokens": 689414922.0,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"entropy": 1.11796875,
|
|
"epoch": 1.023643569769031,
|
|
"grad_norm": 0.11828167491103003,
|
|
"learning_rate": 2.5169085529096804e-06,
|
|
"loss": 1.1192,
|
|
"mean_token_accuracy": 0.7299780905246734,
|
|
"num_tokens": 690270353.0,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"entropy": 1.1734375,
|
|
"epoch": 1.0250102501025011,
|
|
"grad_norm": 0.1304474262078274,
|
|
"learning_rate": 2.5133859377201636e-06,
|
|
"loss": 1.181,
|
|
"mean_token_accuracy": 0.7142954409122467,
|
|
"num_tokens": 691186114.0,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"entropy": 1.21015625,
|
|
"epoch": 1.0263769304359711,
|
|
"grad_norm": 0.12321537753228991,
|
|
"learning_rate": 2.5098633225306467e-06,
|
|
"loss": 1.2128,
|
|
"mean_token_accuracy": 0.7117343962192535,
|
|
"num_tokens": 692117025.0,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"entropy": 1.13984375,
|
|
"epoch": 1.027743610769441,
|
|
"grad_norm": 0.11771902858494279,
|
|
"learning_rate": 2.50634070734113e-06,
|
|
"loss": 1.143,
|
|
"mean_token_accuracy": 0.7244961857795715,
|
|
"num_tokens": 693050611.0,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 1.029110291102911,
|
|
"grad_norm": 0.11241578591987884,
|
|
"learning_rate": 2.502818092151614e-06,
|
|
"loss": 1.1459,
|
|
"mean_token_accuracy": 0.7250086486339569,
|
|
"num_tokens": 693952358.0,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"entropy": 1.148828125,
|
|
"epoch": 1.030476971436381,
|
|
"grad_norm": 0.13155545153933398,
|
|
"learning_rate": 2.499295476962097e-06,
|
|
"loss": 1.1403,
|
|
"mean_token_accuracy": 0.7240517497062683,
|
|
"num_tokens": 694878753.0,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 1.031843651769851,
|
|
"grad_norm": 0.12171853298467372,
|
|
"learning_rate": 2.49577286177258e-06,
|
|
"loss": 1.1747,
|
|
"mean_token_accuracy": 0.7215390920639038,
|
|
"num_tokens": 695831733.0,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"entropy": 1.163671875,
|
|
"epoch": 1.033210332103321,
|
|
"grad_norm": 0.1245964335437009,
|
|
"learning_rate": 2.492250246583063e-06,
|
|
"loss": 1.172,
|
|
"mean_token_accuracy": 0.7199906885623932,
|
|
"num_tokens": 696739857.0,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"entropy": 1.152734375,
|
|
"epoch": 1.034577012436791,
|
|
"grad_norm": 0.11754793253472459,
|
|
"learning_rate": 2.4887276313935467e-06,
|
|
"loss": 1.1487,
|
|
"mean_token_accuracy": 0.7230475723743439,
|
|
"num_tokens": 697643679.0,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 1.035943692770261,
|
|
"grad_norm": 0.13883450523985366,
|
|
"learning_rate": 2.48520501620403e-06,
|
|
"loss": 1.1525,
|
|
"mean_token_accuracy": 0.7205734610557556,
|
|
"num_tokens": 698587437.0,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"entropy": 1.12109375,
|
|
"epoch": 1.037310373103731,
|
|
"grad_norm": 0.11944782699335893,
|
|
"learning_rate": 2.4816824010145134e-06,
|
|
"loss": 1.135,
|
|
"mean_token_accuracy": 0.7229545891284943,
|
|
"num_tokens": 699490277.0,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"entropy": 1.153125,
|
|
"epoch": 1.038677053437201,
|
|
"grad_norm": 0.12624137387880116,
|
|
"learning_rate": 2.478159785824997e-06,
|
|
"loss": 1.1553,
|
|
"mean_token_accuracy": 0.7224205672740937,
|
|
"num_tokens": 700469765.0,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"entropy": 1.1625,
|
|
"epoch": 1.040043733770671,
|
|
"grad_norm": 0.1287532255676996,
|
|
"learning_rate": 2.47463717063548e-06,
|
|
"loss": 1.1736,
|
|
"mean_token_accuracy": 0.7194590449333191,
|
|
"num_tokens": 701399673.0,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"entropy": 1.19375,
|
|
"epoch": 1.0414104141041411,
|
|
"grad_norm": 0.12809130820108197,
|
|
"learning_rate": 2.4711145554459636e-06,
|
|
"loss": 1.1973,
|
|
"mean_token_accuracy": 0.714744484424591,
|
|
"num_tokens": 702352041.0,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"entropy": 1.18203125,
|
|
"epoch": 1.0427770944376111,
|
|
"grad_norm": 0.12245971062635065,
|
|
"learning_rate": 2.4675919402564467e-06,
|
|
"loss": 1.183,
|
|
"mean_token_accuracy": 0.7166260838508606,
|
|
"num_tokens": 703273557.0,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"entropy": 1.175,
|
|
"epoch": 1.0441437747710811,
|
|
"grad_norm": 0.1310364170640006,
|
|
"learning_rate": 2.46406932506693e-06,
|
|
"loss": 1.1814,
|
|
"mean_token_accuracy": 0.7170513153076172,
|
|
"num_tokens": 704228082.0,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.045510455104551,
|
|
"grad_norm": 0.11693002179792762,
|
|
"learning_rate": 2.4605467098774134e-06,
|
|
"loss": 1.1523,
|
|
"mean_token_accuracy": 0.7237214207649231,
|
|
"num_tokens": 705192144.0,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"entropy": 1.124609375,
|
|
"epoch": 1.046877135438021,
|
|
"grad_norm": 0.1211361000002975,
|
|
"learning_rate": 2.4570240946878965e-06,
|
|
"loss": 1.1314,
|
|
"mean_token_accuracy": 0.7262718915939331,
|
|
"num_tokens": 706106625.0,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 1.048243815771491,
|
|
"grad_norm": 0.1265367060092912,
|
|
"learning_rate": 2.4535014794983796e-06,
|
|
"loss": 1.1988,
|
|
"mean_token_accuracy": 0.7148855447769165,
|
|
"num_tokens": 707021496.0,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"entropy": 1.121875,
|
|
"epoch": 1.049610496104961,
|
|
"grad_norm": 0.1412759688357347,
|
|
"learning_rate": 2.449978864308863e-06,
|
|
"loss": 1.1268,
|
|
"mean_token_accuracy": 0.7286526739597321,
|
|
"num_tokens": 707906178.0,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"entropy": 1.163671875,
|
|
"epoch": 1.050977176438431,
|
|
"grad_norm": 0.11753413365910716,
|
|
"learning_rate": 2.4464562491193463e-06,
|
|
"loss": 1.1614,
|
|
"mean_token_accuracy": 0.7214572966098786,
|
|
"num_tokens": 708801138.0,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 1.052343856771901,
|
|
"grad_norm": 0.14626245431538676,
|
|
"learning_rate": 2.4429336339298294e-06,
|
|
"loss": 1.1989,
|
|
"mean_token_accuracy": 0.7159459352493286,
|
|
"num_tokens": 709746763.0,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"entropy": 1.091015625,
|
|
"epoch": 1.053710537105371,
|
|
"grad_norm": 0.12331012488826937,
|
|
"learning_rate": 2.439411018740313e-06,
|
|
"loss": 1.0897,
|
|
"mean_token_accuracy": 0.7318323493003845,
|
|
"num_tokens": 710631483.0,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"entropy": 1.10859375,
|
|
"epoch": 1.055077217438841,
|
|
"grad_norm": 0.16015964634363586,
|
|
"learning_rate": 2.435888403550796e-06,
|
|
"loss": 1.1061,
|
|
"mean_token_accuracy": 0.7308462381362915,
|
|
"num_tokens": 711528540.0,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 1.056443897772311,
|
|
"grad_norm": 0.12968189996373447,
|
|
"learning_rate": 2.4323657883612797e-06,
|
|
"loss": 1.1752,
|
|
"mean_token_accuracy": 0.7189302265644073,
|
|
"num_tokens": 712460832.0,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 1.0578105781057812,
|
|
"grad_norm": 0.12524850295922815,
|
|
"learning_rate": 2.4288431731717628e-06,
|
|
"loss": 1.1766,
|
|
"mean_token_accuracy": 0.7191151916980744,
|
|
"num_tokens": 713350708.0,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 1.0591772584392511,
|
|
"grad_norm": 0.1404134418792903,
|
|
"learning_rate": 2.4253205579822463e-06,
|
|
"loss": 1.1555,
|
|
"mean_token_accuracy": 0.7205634415149689,
|
|
"num_tokens": 714247281.0,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"entropy": 1.12890625,
|
|
"epoch": 1.0605439387727211,
|
|
"grad_norm": 0.12896853329693733,
|
|
"learning_rate": 2.4217979427927294e-06,
|
|
"loss": 1.1205,
|
|
"mean_token_accuracy": 0.7297866642475128,
|
|
"num_tokens": 715138854.0,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"entropy": 1.1109375,
|
|
"epoch": 1.0619106191061911,
|
|
"grad_norm": 0.12214940472566645,
|
|
"learning_rate": 2.418275327603213e-06,
|
|
"loss": 1.1202,
|
|
"mean_token_accuracy": 0.7283789694309235,
|
|
"num_tokens": 716042245.0,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"entropy": 1.147265625,
|
|
"epoch": 1.063277299439661,
|
|
"grad_norm": 0.13719771202100295,
|
|
"learning_rate": 2.414752712413696e-06,
|
|
"loss": 1.1381,
|
|
"mean_token_accuracy": 0.7245360374450683,
|
|
"num_tokens": 716962640.0,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"entropy": 1.104296875,
|
|
"epoch": 1.064643979773131,
|
|
"grad_norm": 0.11162349233830911,
|
|
"learning_rate": 2.4112300972241797e-06,
|
|
"loss": 1.1043,
|
|
"mean_token_accuracy": 0.7327768445014954,
|
|
"num_tokens": 717870625.0,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"entropy": 1.15703125,
|
|
"epoch": 1.066010660106601,
|
|
"grad_norm": 0.12328394065505462,
|
|
"learning_rate": 2.4077074820346628e-06,
|
|
"loss": 1.1665,
|
|
"mean_token_accuracy": 0.7205740094184876,
|
|
"num_tokens": 718799421.0,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 1.067377340440071,
|
|
"grad_norm": 0.1226075426933368,
|
|
"learning_rate": 2.404184866845146e-06,
|
|
"loss": 1.1826,
|
|
"mean_token_accuracy": 0.7163676857948303,
|
|
"num_tokens": 719746616.0,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 1.068744020773541,
|
|
"grad_norm": 0.12201772462297424,
|
|
"learning_rate": 2.4006622516556295e-06,
|
|
"loss": 1.1714,
|
|
"mean_token_accuracy": 0.7193345487117767,
|
|
"num_tokens": 720672544.0,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 1.070110701107011,
|
|
"grad_norm": 0.13200156318047276,
|
|
"learning_rate": 2.3971396364661126e-06,
|
|
"loss": 1.1961,
|
|
"mean_token_accuracy": 0.7153163313865661,
|
|
"num_tokens": 721583384.0,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 1.071477381440481,
|
|
"grad_norm": 0.12965149533174852,
|
|
"learning_rate": 2.393617021276596e-06,
|
|
"loss": 1.1867,
|
|
"mean_token_accuracy": 0.7173632860183716,
|
|
"num_tokens": 722493561.0,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.072844061773951,
|
|
"grad_norm": 0.11496565981967447,
|
|
"learning_rate": 2.3900944060870793e-06,
|
|
"loss": 1.1864,
|
|
"mean_token_accuracy": 0.7192498981952667,
|
|
"num_tokens": 723384528.0,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"entropy": 1.15703125,
|
|
"epoch": 1.0742107421074212,
|
|
"grad_norm": 0.1164034001878635,
|
|
"learning_rate": 2.3865717908975624e-06,
|
|
"loss": 1.1531,
|
|
"mean_token_accuracy": 0.7216554284095764,
|
|
"num_tokens": 724319167.0,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 1.0755774224408912,
|
|
"grad_norm": 0.14581895235079115,
|
|
"learning_rate": 2.383049175708046e-06,
|
|
"loss": 1.1697,
|
|
"mean_token_accuracy": 0.7195780813694,
|
|
"num_tokens": 725236882.0,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 1.0769441027743611,
|
|
"grad_norm": 0.1413359135007305,
|
|
"learning_rate": 2.379526560518529e-06,
|
|
"loss": 1.142,
|
|
"mean_token_accuracy": 0.7241243362426758,
|
|
"num_tokens": 726143276.0,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.0783107831078311,
|
|
"grad_norm": 0.1211555085382797,
|
|
"learning_rate": 2.376003945329012e-06,
|
|
"loss": 1.1866,
|
|
"mean_token_accuracy": 0.7149641156196594,
|
|
"num_tokens": 727083219.0,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"entropy": 1.09765625,
|
|
"epoch": 1.079677463441301,
|
|
"grad_norm": 0.12432472883180418,
|
|
"learning_rate": 2.3724813301394957e-06,
|
|
"loss": 1.1054,
|
|
"mean_token_accuracy": 0.7312858164310455,
|
|
"num_tokens": 727975162.0,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"entropy": 1.108203125,
|
|
"epoch": 1.081044143774771,
|
|
"grad_norm": 0.1269230559396547,
|
|
"learning_rate": 2.368958714949979e-06,
|
|
"loss": 1.108,
|
|
"mean_token_accuracy": 0.7317028522491456,
|
|
"num_tokens": 728859542.0,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"entropy": 1.13984375,
|
|
"epoch": 1.082410824108241,
|
|
"grad_norm": 0.1181569937857736,
|
|
"learning_rate": 2.3654360997604624e-06,
|
|
"loss": 1.1279,
|
|
"mean_token_accuracy": 0.7282794535160064,
|
|
"num_tokens": 729761593.0,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"entropy": 1.163671875,
|
|
"epoch": 1.083777504441711,
|
|
"grad_norm": 0.11923234508115499,
|
|
"learning_rate": 2.3619134845709455e-06,
|
|
"loss": 1.1749,
|
|
"mean_token_accuracy": 0.7209339499473572,
|
|
"num_tokens": 730709039.0,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"entropy": 1.157421875,
|
|
"epoch": 1.085144184775181,
|
|
"grad_norm": 0.13104284019499768,
|
|
"learning_rate": 2.358390869381429e-06,
|
|
"loss": 1.1553,
|
|
"mean_token_accuracy": 0.7231940507888794,
|
|
"num_tokens": 731624659.0,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 1.086510865108651,
|
|
"grad_norm": 0.12851810107064812,
|
|
"learning_rate": 2.354868254191912e-06,
|
|
"loss": 1.199,
|
|
"mean_token_accuracy": 0.7149342894554138,
|
|
"num_tokens": 732566122.0,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"entropy": 1.1625,
|
|
"epoch": 1.087877545442121,
|
|
"grad_norm": 0.13249919145473213,
|
|
"learning_rate": 2.3513456390023957e-06,
|
|
"loss": 1.1626,
|
|
"mean_token_accuracy": 0.7219501495361328,
|
|
"num_tokens": 733499588.0,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.089244225775591,
|
|
"grad_norm": 0.12205694945957057,
|
|
"learning_rate": 2.347823023812879e-06,
|
|
"loss": 1.1981,
|
|
"mean_token_accuracy": 0.714136254787445,
|
|
"num_tokens": 734386011.0,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 1.090610906109061,
|
|
"grad_norm": 0.1301925779150322,
|
|
"learning_rate": 2.3443004086233624e-06,
|
|
"loss": 1.1897,
|
|
"mean_token_accuracy": 0.7168404281139373,
|
|
"num_tokens": 735290799.0,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"entropy": 1.14140625,
|
|
"epoch": 1.0919775864425312,
|
|
"grad_norm": 0.13873586869073073,
|
|
"learning_rate": 2.3407777934338455e-06,
|
|
"loss": 1.1517,
|
|
"mean_token_accuracy": 0.7256628632545471,
|
|
"num_tokens": 736222579.0,
|
|
"step": 7990
|
|
},
|
|
{
|
|
"entropy": 1.134375,
|
|
"epoch": 1.0933442667760012,
|
|
"grad_norm": 0.12276907514068072,
|
|
"learning_rate": 2.3372551782443286e-06,
|
|
"loss": 1.1358,
|
|
"mean_token_accuracy": 0.7257734119892121,
|
|
"num_tokens": 737134733.0,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"entropy": 1.162890625,
|
|
"epoch": 1.0947109471094711,
|
|
"grad_norm": 0.13610649366841082,
|
|
"learning_rate": 2.333732563054812e-06,
|
|
"loss": 1.1746,
|
|
"mean_token_accuracy": 0.7189841628074646,
|
|
"num_tokens": 738079862.0,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 1.0960776274429411,
|
|
"grad_norm": 0.12532964823943998,
|
|
"learning_rate": 2.3302099478652953e-06,
|
|
"loss": 1.178,
|
|
"mean_token_accuracy": 0.7192782104015351,
|
|
"num_tokens": 739009539.0,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 1.097444307776411,
|
|
"grad_norm": 0.12661922101059608,
|
|
"learning_rate": 2.3266873326757784e-06,
|
|
"loss": 1.1619,
|
|
"mean_token_accuracy": 0.7221329212188721,
|
|
"num_tokens": 739893426.0,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"entropy": 1.225,
|
|
"epoch": 1.098810988109881,
|
|
"grad_norm": 0.17222950257496578,
|
|
"learning_rate": 2.323164717486262e-06,
|
|
"loss": 1.2367,
|
|
"mean_token_accuracy": 0.7042568981647491,
|
|
"num_tokens": 740840806.0,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"entropy": 1.1,
|
|
"epoch": 1.100177668443351,
|
|
"grad_norm": 0.11658684001496122,
|
|
"learning_rate": 2.319642102296745e-06,
|
|
"loss": 1.1044,
|
|
"mean_token_accuracy": 0.7339006781578064,
|
|
"num_tokens": 741766338.0,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 1.101544348776821,
|
|
"grad_norm": 0.12300422530325535,
|
|
"learning_rate": 2.3161194871072287e-06,
|
|
"loss": 1.1516,
|
|
"mean_token_accuracy": 0.7216509580612183,
|
|
"num_tokens": 742676539.0,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"entropy": 1.13203125,
|
|
"epoch": 1.102911029110291,
|
|
"grad_norm": 0.11382462826810817,
|
|
"learning_rate": 2.3125968719177118e-06,
|
|
"loss": 1.1321,
|
|
"mean_token_accuracy": 0.7280880749225617,
|
|
"num_tokens": 743576261.0,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 1.104277709443761,
|
|
"grad_norm": 0.13037933312089303,
|
|
"learning_rate": 2.3090742567281953e-06,
|
|
"loss": 1.1975,
|
|
"mean_token_accuracy": 0.7141757071018219,
|
|
"num_tokens": 744533015.0,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.105644389777231,
|
|
"grad_norm": 0.11260597234348008,
|
|
"learning_rate": 2.3055516415386785e-06,
|
|
"loss": 1.1418,
|
|
"mean_token_accuracy": 0.7264070808887482,
|
|
"num_tokens": 745439539.0,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 1.1070110701107012,
|
|
"grad_norm": 0.1496862029523997,
|
|
"learning_rate": 2.302029026349162e-06,
|
|
"loss": 1.1615,
|
|
"mean_token_accuracy": 0.7222574353218079,
|
|
"num_tokens": 746344325.0,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.1083777504441712,
|
|
"grad_norm": 0.13119206186372487,
|
|
"learning_rate": 2.298506411159645e-06,
|
|
"loss": 1.1741,
|
|
"mean_token_accuracy": 0.7190519452095032,
|
|
"num_tokens": 747287311.0,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 1.1097444307776412,
|
|
"grad_norm": 0.10898057130088448,
|
|
"learning_rate": 2.2949837959701287e-06,
|
|
"loss": 1.148,
|
|
"mean_token_accuracy": 0.7223288595676423,
|
|
"num_tokens": 748278211.0,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 1.1111111111111112,
|
|
"grad_norm": 0.1347080558423298,
|
|
"learning_rate": 2.291461180780612e-06,
|
|
"loss": 1.1699,
|
|
"mean_token_accuracy": 0.7207384288311005,
|
|
"num_tokens": 749232277.0,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.1124777914445811,
|
|
"grad_norm": 0.11861228931815175,
|
|
"learning_rate": 2.287938565591095e-06,
|
|
"loss": 1.1554,
|
|
"mean_token_accuracy": 0.7219739854335785,
|
|
"num_tokens": 750157024.0,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"entropy": 1.1125,
|
|
"epoch": 1.1138444717780511,
|
|
"grad_norm": 0.10832318228075717,
|
|
"learning_rate": 2.2844159504015785e-06,
|
|
"loss": 1.1202,
|
|
"mean_token_accuracy": 0.7297554910182953,
|
|
"num_tokens": 751102311.0,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 1.115211152111521,
|
|
"grad_norm": 0.12094918471455288,
|
|
"learning_rate": 2.2808933352120616e-06,
|
|
"loss": 1.1599,
|
|
"mean_token_accuracy": 0.7225451111793518,
|
|
"num_tokens": 751970935.0,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 1.116577832444991,
|
|
"grad_norm": 0.11849152971798796,
|
|
"learning_rate": 2.277370720022545e-06,
|
|
"loss": 1.1666,
|
|
"mean_token_accuracy": 0.7197894096374512,
|
|
"num_tokens": 752883996.0,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"entropy": 1.147265625,
|
|
"epoch": 1.117944512778461,
|
|
"grad_norm": 0.11781150632373545,
|
|
"learning_rate": 2.2738481048330283e-06,
|
|
"loss": 1.1477,
|
|
"mean_token_accuracy": 0.7229290127754211,
|
|
"num_tokens": 753824067.0,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"entropy": 1.1375,
|
|
"epoch": 1.119311193111931,
|
|
"grad_norm": 0.12274748630242192,
|
|
"learning_rate": 2.2703254896435114e-06,
|
|
"loss": 1.1513,
|
|
"mean_token_accuracy": 0.7225154519081116,
|
|
"num_tokens": 754780634.0,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 1.120677873445401,
|
|
"grad_norm": 0.12849210484933365,
|
|
"learning_rate": 2.266802874453995e-06,
|
|
"loss": 1.1831,
|
|
"mean_token_accuracy": 0.7167483508586884,
|
|
"num_tokens": 755718040.0,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"entropy": 1.14921875,
|
|
"epoch": 1.122044553778871,
|
|
"grad_norm": 0.12371274647292767,
|
|
"learning_rate": 2.263280259264478e-06,
|
|
"loss": 1.1474,
|
|
"mean_token_accuracy": 0.7244613289833068,
|
|
"num_tokens": 756620588.0,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"entropy": 1.1125,
|
|
"epoch": 1.123411234112341,
|
|
"grad_norm": 0.12086536750425844,
|
|
"learning_rate": 2.259757644074961e-06,
|
|
"loss": 1.1011,
|
|
"mean_token_accuracy": 0.7324735701084137,
|
|
"num_tokens": 757521919.0,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"entropy": 1.175,
|
|
"epoch": 1.1247779144458112,
|
|
"grad_norm": 0.12338449384670455,
|
|
"learning_rate": 2.2562350288854447e-06,
|
|
"loss": 1.1721,
|
|
"mean_token_accuracy": 0.7203421533107758,
|
|
"num_tokens": 758445977.0,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"entropy": 1.12578125,
|
|
"epoch": 1.1261445947792812,
|
|
"grad_norm": 0.13209044313946325,
|
|
"learning_rate": 2.252712413695928e-06,
|
|
"loss": 1.1231,
|
|
"mean_token_accuracy": 0.7285464465618133,
|
|
"num_tokens": 759331953.0,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 1.1275112751127512,
|
|
"grad_norm": 0.12504310644043393,
|
|
"learning_rate": 2.2491897985064114e-06,
|
|
"loss": 1.162,
|
|
"mean_token_accuracy": 0.7213694393634796,
|
|
"num_tokens": 760273968.0,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"entropy": 1.13984375,
|
|
"epoch": 1.1288779554462212,
|
|
"grad_norm": 0.1234187869094305,
|
|
"learning_rate": 2.2456671833168945e-06,
|
|
"loss": 1.1497,
|
|
"mean_token_accuracy": 0.7238636910915375,
|
|
"num_tokens": 761159352.0,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"entropy": 1.15078125,
|
|
"epoch": 1.1302446357796911,
|
|
"grad_norm": 0.11986213782874482,
|
|
"learning_rate": 2.242144568127378e-06,
|
|
"loss": 1.1539,
|
|
"mean_token_accuracy": 0.7229442000389099,
|
|
"num_tokens": 762051558.0,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 1.1316113161131611,
|
|
"grad_norm": 0.11072847118944394,
|
|
"learning_rate": 2.238621952937861e-06,
|
|
"loss": 1.2106,
|
|
"mean_token_accuracy": 0.7143954753875732,
|
|
"num_tokens": 763006454.0,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"entropy": 1.2015625,
|
|
"epoch": 1.132977996446631,
|
|
"grad_norm": 0.1222160563639076,
|
|
"learning_rate": 2.2350993377483447e-06,
|
|
"loss": 1.1955,
|
|
"mean_token_accuracy": 0.7156013369560241,
|
|
"num_tokens": 763919479.0,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 1.134344676780101,
|
|
"grad_norm": 0.1491018449707871,
|
|
"learning_rate": 2.231576722558828e-06,
|
|
"loss": 1.1541,
|
|
"mean_token_accuracy": 0.7220531344413758,
|
|
"num_tokens": 764823373.0,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 1.135711357113571,
|
|
"grad_norm": 0.13076236824807377,
|
|
"learning_rate": 2.2280541073693114e-06,
|
|
"loss": 1.1454,
|
|
"mean_token_accuracy": 0.7246967136859894,
|
|
"num_tokens": 765754606.0,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 1.137078037447041,
|
|
"grad_norm": 0.11938463567533573,
|
|
"learning_rate": 2.2245314921797945e-06,
|
|
"loss": 1.2075,
|
|
"mean_token_accuracy": 0.7127333641052246,
|
|
"num_tokens": 766685236.0,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"entropy": 1.168359375,
|
|
"epoch": 1.1384447177805113,
|
|
"grad_norm": 0.12850590499073655,
|
|
"learning_rate": 2.2210088769902777e-06,
|
|
"loss": 1.1736,
|
|
"mean_token_accuracy": 0.7197344720363616,
|
|
"num_tokens": 767625646.0,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"entropy": 1.051171875,
|
|
"epoch": 1.1398113981139812,
|
|
"grad_norm": 0.14763115378476419,
|
|
"learning_rate": 2.217486261800761e-06,
|
|
"loss": 1.0583,
|
|
"mean_token_accuracy": 0.7399263322353363,
|
|
"num_tokens": 768490930.0,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"entropy": 1.158984375,
|
|
"epoch": 1.1411780784474512,
|
|
"grad_norm": 0.1268661942220648,
|
|
"learning_rate": 2.2139636466112443e-06,
|
|
"loss": 1.1693,
|
|
"mean_token_accuracy": 0.7205829381942749,
|
|
"num_tokens": 769444909.0,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"entropy": 1.11875,
|
|
"epoch": 1.1425447587809212,
|
|
"grad_norm": 0.12366186606556381,
|
|
"learning_rate": 2.2104410314217275e-06,
|
|
"loss": 1.1215,
|
|
"mean_token_accuracy": 0.7295616209506989,
|
|
"num_tokens": 770354512.0,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 1.1439114391143912,
|
|
"grad_norm": 0.13428642502515425,
|
|
"learning_rate": 2.206918416232211e-06,
|
|
"loss": 1.1959,
|
|
"mean_token_accuracy": 0.715080851316452,
|
|
"num_tokens": 771302117.0,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 1.1452781194478612,
|
|
"grad_norm": 0.11354670928109616,
|
|
"learning_rate": 2.203395801042694e-06,
|
|
"loss": 1.1829,
|
|
"mean_token_accuracy": 0.7150634348392486,
|
|
"num_tokens": 772242430.0,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"entropy": 1.144921875,
|
|
"epoch": 1.1466447997813312,
|
|
"grad_norm": 0.12513284022357515,
|
|
"learning_rate": 2.1998731858531777e-06,
|
|
"loss": 1.1567,
|
|
"mean_token_accuracy": 0.7222748875617981,
|
|
"num_tokens": 773212126.0,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"entropy": 1.163671875,
|
|
"epoch": 1.1480114801148011,
|
|
"grad_norm": 0.1227304173837882,
|
|
"learning_rate": 2.196350570663661e-06,
|
|
"loss": 1.174,
|
|
"mean_token_accuracy": 0.7199277818202973,
|
|
"num_tokens": 774140401.0,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 1.1493781604482711,
|
|
"grad_norm": 0.12291615280138968,
|
|
"learning_rate": 2.192827955474144e-06,
|
|
"loss": 1.1537,
|
|
"mean_token_accuracy": 0.7228089988231658,
|
|
"num_tokens": 775040095.0,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"entropy": 1.11484375,
|
|
"epoch": 1.150744840781741,
|
|
"grad_norm": 0.11273104549743716,
|
|
"learning_rate": 2.1893053402846275e-06,
|
|
"loss": 1.1195,
|
|
"mean_token_accuracy": 0.7309429228305817,
|
|
"num_tokens": 775963582.0,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 1.152111521115211,
|
|
"grad_norm": 0.11302243655265332,
|
|
"learning_rate": 2.1857827250951106e-06,
|
|
"loss": 1.2057,
|
|
"mean_token_accuracy": 0.7166090309619904,
|
|
"num_tokens": 776913274.0,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"entropy": 1.1734375,
|
|
"epoch": 1.153478201448681,
|
|
"grad_norm": 0.12048553064190912,
|
|
"learning_rate": 2.182260109905594e-06,
|
|
"loss": 1.1808,
|
|
"mean_token_accuracy": 0.7159401774406433,
|
|
"num_tokens": 777837642.0,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"entropy": 1.123046875,
|
|
"epoch": 1.154844881782151,
|
|
"grad_norm": 0.12678310612769256,
|
|
"learning_rate": 2.1787374947160777e-06,
|
|
"loss": 1.1216,
|
|
"mean_token_accuracy": 0.7304933309555054,
|
|
"num_tokens": 778735236.0,
|
|
"step": 8450
|
|
},
|
|
{
|
|
"entropy": 1.11875,
|
|
"epoch": 1.156211562115621,
|
|
"grad_norm": 0.11036200415956424,
|
|
"learning_rate": 2.175214879526561e-06,
|
|
"loss": 1.1247,
|
|
"mean_token_accuracy": 0.7274403691291809,
|
|
"num_tokens": 779695113.0,
|
|
"step": 8460
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.1575782424490912,
|
|
"grad_norm": 0.1705032788977126,
|
|
"learning_rate": 2.171692264337044e-06,
|
|
"loss": 1.1592,
|
|
"mean_token_accuracy": 0.721995371580124,
|
|
"num_tokens": 780598891.0,
|
|
"step": 8470
|
|
},
|
|
{
|
|
"entropy": 1.1390625,
|
|
"epoch": 1.1589449227825612,
|
|
"grad_norm": 0.12850890509354324,
|
|
"learning_rate": 2.1681696491475275e-06,
|
|
"loss": 1.1281,
|
|
"mean_token_accuracy": 0.7270937979221344,
|
|
"num_tokens": 781494278.0,
|
|
"step": 8480
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 1.1603116031160312,
|
|
"grad_norm": 0.13374397477247651,
|
|
"learning_rate": 2.1646470339580106e-06,
|
|
"loss": 1.1586,
|
|
"mean_token_accuracy": 0.7231090128421783,
|
|
"num_tokens": 782471652.0,
|
|
"step": 8490
|
|
},
|
|
{
|
|
"entropy": 1.1234375,
|
|
"epoch": 1.1616782834495012,
|
|
"grad_norm": 0.1300640508645764,
|
|
"learning_rate": 2.1611244187684937e-06,
|
|
"loss": 1.1252,
|
|
"mean_token_accuracy": 0.7304909765720368,
|
|
"num_tokens": 783377485.0,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"entropy": 1.14140625,
|
|
"epoch": 1.1630449637829712,
|
|
"grad_norm": 0.11771086477760878,
|
|
"learning_rate": 2.1576018035789773e-06,
|
|
"loss": 1.1439,
|
|
"mean_token_accuracy": 0.7254180669784546,
|
|
"num_tokens": 784337833.0,
|
|
"step": 8510
|
|
},
|
|
{
|
|
"entropy": 1.11640625,
|
|
"epoch": 1.1644116441164412,
|
|
"grad_norm": 0.13539634293812242,
|
|
"learning_rate": 2.1540791883894604e-06,
|
|
"loss": 1.1197,
|
|
"mean_token_accuracy": 0.7279597401618958,
|
|
"num_tokens": 785204537.0,
|
|
"step": 8520
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 1.1657783244499111,
|
|
"grad_norm": 0.11592575916177711,
|
|
"learning_rate": 2.150556573199944e-06,
|
|
"loss": 1.178,
|
|
"mean_token_accuracy": 0.7206804752349854,
|
|
"num_tokens": 786110258.0,
|
|
"step": 8530
|
|
},
|
|
{
|
|
"entropy": 1.1390625,
|
|
"epoch": 1.1671450047833811,
|
|
"grad_norm": 0.113862107056524,
|
|
"learning_rate": 2.147033958010427e-06,
|
|
"loss": 1.1411,
|
|
"mean_token_accuracy": 0.724855613708496,
|
|
"num_tokens": 787025727.0,
|
|
"step": 8540
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 1.168511685116851,
|
|
"grad_norm": 0.14778799925673647,
|
|
"learning_rate": 2.14351134282091e-06,
|
|
"loss": 1.1611,
|
|
"mean_token_accuracy": 0.7227478325366974,
|
|
"num_tokens": 787966153.0,
|
|
"step": 8550
|
|
},
|
|
{
|
|
"entropy": 1.1171875,
|
|
"epoch": 1.169878365450321,
|
|
"grad_norm": 0.1292598085736431,
|
|
"learning_rate": 2.1399887276313937e-06,
|
|
"loss": 1.0971,
|
|
"mean_token_accuracy": 0.7341828346252441,
|
|
"num_tokens": 788911753.0,
|
|
"step": 8560
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.1712450457837913,
|
|
"grad_norm": 0.11563635664878288,
|
|
"learning_rate": 2.136466112441877e-06,
|
|
"loss": 1.1754,
|
|
"mean_token_accuracy": 0.7202007114887238,
|
|
"num_tokens": 789851402.0,
|
|
"step": 8570
|
|
},
|
|
{
|
|
"entropy": 1.12890625,
|
|
"epoch": 1.1726117261172613,
|
|
"grad_norm": 0.1390148583406341,
|
|
"learning_rate": 2.1329434972523604e-06,
|
|
"loss": 1.1347,
|
|
"mean_token_accuracy": 0.7272844254970551,
|
|
"num_tokens": 790766294.0,
|
|
"step": 8580
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 1.1739784064507313,
|
|
"grad_norm": 0.12362391223168229,
|
|
"learning_rate": 2.1294208820628435e-06,
|
|
"loss": 1.1318,
|
|
"mean_token_accuracy": 0.7278177738189697,
|
|
"num_tokens": 791663847.0,
|
|
"step": 8590
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 1.1753450867842012,
|
|
"grad_norm": 0.12116893214660727,
|
|
"learning_rate": 2.125898266873327e-06,
|
|
"loss": 1.194,
|
|
"mean_token_accuracy": 0.7128901422023773,
|
|
"num_tokens": 792644026.0,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.1767117671176712,
|
|
"grad_norm": 0.2525489770515861,
|
|
"learning_rate": 2.12237565168381e-06,
|
|
"loss": 1.1706,
|
|
"mean_token_accuracy": 0.7201654076576233,
|
|
"num_tokens": 793573004.0,
|
|
"step": 8610
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 1.1780784474511412,
|
|
"grad_norm": 0.11284136504213572,
|
|
"learning_rate": 2.1188530364942938e-06,
|
|
"loss": 1.1663,
|
|
"mean_token_accuracy": 0.7204218566417694,
|
|
"num_tokens": 794498385.0,
|
|
"step": 8620
|
|
},
|
|
{
|
|
"entropy": 1.09609375,
|
|
"epoch": 1.1794451277846112,
|
|
"grad_norm": 0.11639427705530188,
|
|
"learning_rate": 2.115330421304777e-06,
|
|
"loss": 1.1023,
|
|
"mean_token_accuracy": 0.7314833700656891,
|
|
"num_tokens": 795410342.0,
|
|
"step": 8630
|
|
},
|
|
{
|
|
"entropy": 1.13046875,
|
|
"epoch": 1.1808118081180812,
|
|
"grad_norm": 0.12359678051255271,
|
|
"learning_rate": 2.1118078061152604e-06,
|
|
"loss": 1.1466,
|
|
"mean_token_accuracy": 0.7254610180854797,
|
|
"num_tokens": 796377123.0,
|
|
"step": 8640
|
|
},
|
|
{
|
|
"entropy": 1.10546875,
|
|
"epoch": 1.1821784884515512,
|
|
"grad_norm": 0.11802117677976161,
|
|
"learning_rate": 2.1082851909257435e-06,
|
|
"loss": 1.1211,
|
|
"mean_token_accuracy": 0.7283147513866425,
|
|
"num_tokens": 797310568.0,
|
|
"step": 8650
|
|
},
|
|
{
|
|
"entropy": 1.139453125,
|
|
"epoch": 1.1835451687850211,
|
|
"grad_norm": 0.10542548356818253,
|
|
"learning_rate": 2.1047625757362267e-06,
|
|
"loss": 1.1431,
|
|
"mean_token_accuracy": 0.7255856275558472,
|
|
"num_tokens": 798259517.0,
|
|
"step": 8660
|
|
},
|
|
{
|
|
"entropy": 1.14140625,
|
|
"epoch": 1.1849118491184911,
|
|
"grad_norm": 0.12832747340862993,
|
|
"learning_rate": 2.1012399605467102e-06,
|
|
"loss": 1.1564,
|
|
"mean_token_accuracy": 0.7223408281803131,
|
|
"num_tokens": 799148339.0,
|
|
"step": 8670
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 1.186278529451961,
|
|
"grad_norm": 0.11280902456754496,
|
|
"learning_rate": 2.0977173453571933e-06,
|
|
"loss": 1.175,
|
|
"mean_token_accuracy": 0.7199336588382721,
|
|
"num_tokens": 800030657.0,
|
|
"step": 8680
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 1.187645209785431,
|
|
"grad_norm": 0.13913478124472062,
|
|
"learning_rate": 2.0941947301676765e-06,
|
|
"loss": 1.196,
|
|
"mean_token_accuracy": 0.7157652139663696,
|
|
"num_tokens": 800903352.0,
|
|
"step": 8690
|
|
},
|
|
{
|
|
"entropy": 1.125,
|
|
"epoch": 1.189011890118901,
|
|
"grad_norm": 0.11879513398776918,
|
|
"learning_rate": 2.09067211497816e-06,
|
|
"loss": 1.1207,
|
|
"mean_token_accuracy": 0.7277761101722717,
|
|
"num_tokens": 801839997.0,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"entropy": 1.141015625,
|
|
"epoch": 1.1903785704523713,
|
|
"grad_norm": 0.11939572166848791,
|
|
"learning_rate": 2.087149499788643e-06,
|
|
"loss": 1.1432,
|
|
"mean_token_accuracy": 0.7234301149845124,
|
|
"num_tokens": 802754802.0,
|
|
"step": 8710
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 1.1917452507858413,
|
|
"grad_norm": 0.1817087771312119,
|
|
"learning_rate": 2.0836268845991263e-06,
|
|
"loss": 1.1737,
|
|
"mean_token_accuracy": 0.7186951994895935,
|
|
"num_tokens": 803675305.0,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 1.1931119311193112,
|
|
"grad_norm": 0.12762657989156193,
|
|
"learning_rate": 2.08010426940961e-06,
|
|
"loss": 1.1746,
|
|
"mean_token_accuracy": 0.7213612198829651,
|
|
"num_tokens": 804626080.0,
|
|
"step": 8730
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 1.1944786114527812,
|
|
"grad_norm": 0.13995924952669073,
|
|
"learning_rate": 2.076581654220093e-06,
|
|
"loss": 1.1912,
|
|
"mean_token_accuracy": 0.7171994388103485,
|
|
"num_tokens": 805552114.0,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"entropy": 1.131640625,
|
|
"epoch": 1.1958452917862512,
|
|
"grad_norm": 0.1315612381318665,
|
|
"learning_rate": 2.0730590390305765e-06,
|
|
"loss": 1.1305,
|
|
"mean_token_accuracy": 0.7276914238929748,
|
|
"num_tokens": 806462200.0,
|
|
"step": 8750
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.1972119721197212,
|
|
"grad_norm": 0.1338582774604745,
|
|
"learning_rate": 2.0695364238410596e-06,
|
|
"loss": 1.1457,
|
|
"mean_token_accuracy": 0.723927891254425,
|
|
"num_tokens": 807437504.0,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 1.1985786524531912,
|
|
"grad_norm": 0.11177625458817675,
|
|
"learning_rate": 2.066013808651543e-06,
|
|
"loss": 1.1501,
|
|
"mean_token_accuracy": 0.7217326283454895,
|
|
"num_tokens": 808356776.0,
|
|
"step": 8770
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 1.1999453327866612,
|
|
"grad_norm": 0.14040333378062458,
|
|
"learning_rate": 2.0624911934620263e-06,
|
|
"loss": 1.1861,
|
|
"mean_token_accuracy": 0.7184356927871705,
|
|
"num_tokens": 809235508.0,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 1.2013120131201311,
|
|
"grad_norm": 0.12132067133863302,
|
|
"learning_rate": 2.05896857827251e-06,
|
|
"loss": 1.1966,
|
|
"mean_token_accuracy": 0.7138246297836304,
|
|
"num_tokens": 810205113.0,
|
|
"step": 8790
|
|
},
|
|
{
|
|
"entropy": 1.11796875,
|
|
"epoch": 1.2026786934536011,
|
|
"grad_norm": 0.11871796765839734,
|
|
"learning_rate": 2.055445963082993e-06,
|
|
"loss": 1.1183,
|
|
"mean_token_accuracy": 0.7323211193084717,
|
|
"num_tokens": 811113331.0,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"entropy": 1.122265625,
|
|
"epoch": 1.2040453737870713,
|
|
"grad_norm": 0.1265824806548553,
|
|
"learning_rate": 2.0519233478934765e-06,
|
|
"loss": 1.1358,
|
|
"mean_token_accuracy": 0.725651615858078,
|
|
"num_tokens": 812063842.0,
|
|
"step": 8810
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.2054120541205413,
|
|
"grad_norm": 0.12154450087220717,
|
|
"learning_rate": 2.0484007327039596e-06,
|
|
"loss": 1.1745,
|
|
"mean_token_accuracy": 0.7190692901611329,
|
|
"num_tokens": 812974009.0,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 1.2067787344540113,
|
|
"grad_norm": 0.12841998736232743,
|
|
"learning_rate": 2.0448781175144427e-06,
|
|
"loss": 1.1563,
|
|
"mean_token_accuracy": 0.723628431558609,
|
|
"num_tokens": 813886829.0,
|
|
"step": 8830
|
|
},
|
|
{
|
|
"entropy": 1.1328125,
|
|
"epoch": 1.2081454147874813,
|
|
"grad_norm": 0.11196998635383473,
|
|
"learning_rate": 2.0413555023249263e-06,
|
|
"loss": 1.1338,
|
|
"mean_token_accuracy": 0.7279942095279693,
|
|
"num_tokens": 814818796.0,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"entropy": 1.158984375,
|
|
"epoch": 1.2095120951209513,
|
|
"grad_norm": 0.1234219164270619,
|
|
"learning_rate": 2.0378328871354094e-06,
|
|
"loss": 1.1637,
|
|
"mean_token_accuracy": 0.7202268898487091,
|
|
"num_tokens": 815777038.0,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"entropy": 1.1390625,
|
|
"epoch": 1.2108787754544212,
|
|
"grad_norm": 0.12639714749347425,
|
|
"learning_rate": 2.034310271945893e-06,
|
|
"loss": 1.1458,
|
|
"mean_token_accuracy": 0.7263902962207794,
|
|
"num_tokens": 816742274.0,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.2122454557878912,
|
|
"grad_norm": 0.12202583722236762,
|
|
"learning_rate": 2.030787656756376e-06,
|
|
"loss": 1.1377,
|
|
"mean_token_accuracy": 0.723840868473053,
|
|
"num_tokens": 817650313.0,
|
|
"step": 8870
|
|
},
|
|
{
|
|
"entropy": 1.11171875,
|
|
"epoch": 1.2136121361213612,
|
|
"grad_norm": 0.11896596324674998,
|
|
"learning_rate": 2.027265041566859e-06,
|
|
"loss": 1.1328,
|
|
"mean_token_accuracy": 0.7275956690311431,
|
|
"num_tokens": 818564569.0,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 1.2149788164548312,
|
|
"grad_norm": 0.12024141438965477,
|
|
"learning_rate": 2.0237424263773427e-06,
|
|
"loss": 1.1458,
|
|
"mean_token_accuracy": 0.725680160522461,
|
|
"num_tokens": 819452714.0,
|
|
"step": 8890
|
|
},
|
|
{
|
|
"entropy": 1.0859375,
|
|
"epoch": 1.2163454967883012,
|
|
"grad_norm": 0.10669549535604227,
|
|
"learning_rate": 2.020219811187826e-06,
|
|
"loss": 1.0909,
|
|
"mean_token_accuracy": 0.7346997439861298,
|
|
"num_tokens": 820395120.0,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"entropy": 1.16796875,
|
|
"epoch": 1.2177121771217712,
|
|
"grad_norm": 0.11992472258231814,
|
|
"learning_rate": 2.0166971959983094e-06,
|
|
"loss": 1.1813,
|
|
"mean_token_accuracy": 0.7167044222354889,
|
|
"num_tokens": 821331637.0,
|
|
"step": 8910
|
|
},
|
|
{
|
|
"entropy": 1.10859375,
|
|
"epoch": 1.2190788574552411,
|
|
"grad_norm": 0.11774388903151509,
|
|
"learning_rate": 2.0131745808087925e-06,
|
|
"loss": 1.1181,
|
|
"mean_token_accuracy": 0.7299959599971771,
|
|
"num_tokens": 822262363.0,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"entropy": 1.125390625,
|
|
"epoch": 1.2204455377887111,
|
|
"grad_norm": 0.1305105385335872,
|
|
"learning_rate": 2.009651965619276e-06,
|
|
"loss": 1.1287,
|
|
"mean_token_accuracy": 0.7270788550376892,
|
|
"num_tokens": 823185730.0,
|
|
"step": 8930
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 1.221812218122181,
|
|
"grad_norm": 0.13718737161915423,
|
|
"learning_rate": 2.0061293504297592e-06,
|
|
"loss": 1.1886,
|
|
"mean_token_accuracy": 0.7168324530124665,
|
|
"num_tokens": 824130635.0,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"entropy": 1.111328125,
|
|
"epoch": 1.2231788984556513,
|
|
"grad_norm": 0.12267567988111593,
|
|
"learning_rate": 2.0026067352402428e-06,
|
|
"loss": 1.1125,
|
|
"mean_token_accuracy": 0.7300690829753875,
|
|
"num_tokens": 825059981.0,
|
|
"step": 8950
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 1.2245455787891213,
|
|
"grad_norm": 0.11836565171661576,
|
|
"learning_rate": 1.999084120050726e-06,
|
|
"loss": 1.1796,
|
|
"mean_token_accuracy": 0.7191387116909027,
|
|
"num_tokens": 825984443.0,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"entropy": 1.11171875,
|
|
"epoch": 1.2259122591225913,
|
|
"grad_norm": 0.12056129337260046,
|
|
"learning_rate": 1.9955615048612094e-06,
|
|
"loss": 1.1105,
|
|
"mean_token_accuracy": 0.7309830188751221,
|
|
"num_tokens": 826900181.0,
|
|
"step": 8970
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.2272789394560613,
|
|
"grad_norm": 0.1419110176275336,
|
|
"learning_rate": 1.9920388896716926e-06,
|
|
"loss": 1.1566,
|
|
"mean_token_accuracy": 0.7232869744300843,
|
|
"num_tokens": 827834339.0,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"entropy": 1.15078125,
|
|
"epoch": 1.2286456197895312,
|
|
"grad_norm": 0.11306135396160932,
|
|
"learning_rate": 1.9885162744821757e-06,
|
|
"loss": 1.1622,
|
|
"mean_token_accuracy": 0.725649756193161,
|
|
"num_tokens": 828755267.0,
|
|
"step": 8990
|
|
},
|
|
{
|
|
"entropy": 1.1390625,
|
|
"epoch": 1.2300123001230012,
|
|
"grad_norm": 0.12861352479261928,
|
|
"learning_rate": 1.9849936592926592e-06,
|
|
"loss": 1.1414,
|
|
"mean_token_accuracy": 0.7235295712947846,
|
|
"num_tokens": 829615451.0,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"entropy": 1.130078125,
|
|
"epoch": 1.2313789804564712,
|
|
"grad_norm": 0.11972774630134063,
|
|
"learning_rate": 1.9814710441031424e-06,
|
|
"loss": 1.1271,
|
|
"mean_token_accuracy": 0.7267436861991883,
|
|
"num_tokens": 830524527.0,
|
|
"step": 9010
|
|
},
|
|
{
|
|
"entropy": 1.12265625,
|
|
"epoch": 1.2327456607899412,
|
|
"grad_norm": 0.15731795672400709,
|
|
"learning_rate": 1.9779484289136255e-06,
|
|
"loss": 1.1086,
|
|
"mean_token_accuracy": 0.7316631555557251,
|
|
"num_tokens": 831423667.0,
|
|
"step": 9020
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 1.2341123411234112,
|
|
"grad_norm": 0.1212590729630957,
|
|
"learning_rate": 1.974425813724109e-06,
|
|
"loss": 1.1749,
|
|
"mean_token_accuracy": 0.7197006404399872,
|
|
"num_tokens": 832352867.0,
|
|
"step": 9030
|
|
},
|
|
{
|
|
"entropy": 1.12734375,
|
|
"epoch": 1.2354790214568812,
|
|
"grad_norm": 0.12501694052067208,
|
|
"learning_rate": 1.970903198534592e-06,
|
|
"loss": 1.1341,
|
|
"mean_token_accuracy": 0.7258389174938202,
|
|
"num_tokens": 833293798.0,
|
|
"step": 9040
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 1.2368457017903514,
|
|
"grad_norm": 0.14944586679587357,
|
|
"learning_rate": 1.9673805833450753e-06,
|
|
"loss": 1.1582,
|
|
"mean_token_accuracy": 0.7208650350570679,
|
|
"num_tokens": 834230235.0,
|
|
"step": 9050
|
|
},
|
|
{
|
|
"entropy": 1.175,
|
|
"epoch": 1.2382123821238213,
|
|
"grad_norm": 0.10964054219852276,
|
|
"learning_rate": 1.963857968155559e-06,
|
|
"loss": 1.1765,
|
|
"mean_token_accuracy": 0.7182572543621063,
|
|
"num_tokens": 835171316.0,
|
|
"step": 9060
|
|
},
|
|
{
|
|
"entropy": 1.1234375,
|
|
"epoch": 1.2395790624572913,
|
|
"grad_norm": 0.13366982041556238,
|
|
"learning_rate": 1.960335352966042e-06,
|
|
"loss": 1.1223,
|
|
"mean_token_accuracy": 0.7307281494140625,
|
|
"num_tokens": 836094206.0,
|
|
"step": 9070
|
|
},
|
|
{
|
|
"entropy": 1.16796875,
|
|
"epoch": 1.2409457427907613,
|
|
"grad_norm": 0.11691593764175923,
|
|
"learning_rate": 1.9568127377765255e-06,
|
|
"loss": 1.1636,
|
|
"mean_token_accuracy": 0.7198194921016693,
|
|
"num_tokens": 837040128.0,
|
|
"step": 9080
|
|
},
|
|
{
|
|
"entropy": 1.115234375,
|
|
"epoch": 1.2423124231242313,
|
|
"grad_norm": 0.1241102090008227,
|
|
"learning_rate": 1.9532901225870086e-06,
|
|
"loss": 1.1239,
|
|
"mean_token_accuracy": 0.728308355808258,
|
|
"num_tokens": 838015487.0,
|
|
"step": 9090
|
|
},
|
|
{
|
|
"entropy": 1.1296875,
|
|
"epoch": 1.2436791034577013,
|
|
"grad_norm": 0.12390180661131674,
|
|
"learning_rate": 1.949767507397492e-06,
|
|
"loss": 1.1332,
|
|
"mean_token_accuracy": 0.7255496621131897,
|
|
"num_tokens": 838908651.0,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"entropy": 1.156640625,
|
|
"epoch": 1.2450457837911713,
|
|
"grad_norm": 0.11898306011850142,
|
|
"learning_rate": 1.9462448922079753e-06,
|
|
"loss": 1.163,
|
|
"mean_token_accuracy": 0.7211547136306763,
|
|
"num_tokens": 839819407.0,
|
|
"step": 9110
|
|
},
|
|
{
|
|
"entropy": 1.144140625,
|
|
"epoch": 1.2464124641246412,
|
|
"grad_norm": 0.12291766742549996,
|
|
"learning_rate": 1.942722277018459e-06,
|
|
"loss": 1.1545,
|
|
"mean_token_accuracy": 0.7251638472080231,
|
|
"num_tokens": 840716632.0,
|
|
"step": 9120
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 1.2477791444581112,
|
|
"grad_norm": 0.1150514718105091,
|
|
"learning_rate": 1.939199661828942e-06,
|
|
"loss": 1.1899,
|
|
"mean_token_accuracy": 0.7150820016860961,
|
|
"num_tokens": 841628747.0,
|
|
"step": 9130
|
|
},
|
|
{
|
|
"entropy": 1.11171875,
|
|
"epoch": 1.2491458247915812,
|
|
"grad_norm": 0.1268113257777486,
|
|
"learning_rate": 1.9356770466394255e-06,
|
|
"loss": 1.12,
|
|
"mean_token_accuracy": 0.7301454365253448,
|
|
"num_tokens": 842477808.0,
|
|
"step": 9140
|
|
},
|
|
{
|
|
"entropy": 1.108203125,
|
|
"epoch": 1.2505125051250512,
|
|
"grad_norm": 0.12472588908109901,
|
|
"learning_rate": 1.9321544314499086e-06,
|
|
"loss": 1.1079,
|
|
"mean_token_accuracy": 0.7319329977035522,
|
|
"num_tokens": 843389427.0,
|
|
"step": 9150
|
|
},
|
|
{
|
|
"entropy": 1.1328125,
|
|
"epoch": 1.2518791854585212,
|
|
"grad_norm": 0.12474965174018694,
|
|
"learning_rate": 1.9286318162603917e-06,
|
|
"loss": 1.1435,
|
|
"mean_token_accuracy": 0.7229217708110809,
|
|
"num_tokens": 844326133.0,
|
|
"step": 9160
|
|
},
|
|
{
|
|
"entropy": 1.108984375,
|
|
"epoch": 1.2532458657919912,
|
|
"grad_norm": 0.11995848733906697,
|
|
"learning_rate": 1.9251092010708753e-06,
|
|
"loss": 1.1177,
|
|
"mean_token_accuracy": 0.7315669417381286,
|
|
"num_tokens": 845242173.0,
|
|
"step": 9170
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.2546125461254611,
|
|
"grad_norm": 0.12055795585604127,
|
|
"learning_rate": 1.9215865858813584e-06,
|
|
"loss": 1.1716,
|
|
"mean_token_accuracy": 0.7191281914710999,
|
|
"num_tokens": 846159471.0,
|
|
"step": 9180
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.2559792264589311,
|
|
"grad_norm": 0.1803711508236027,
|
|
"learning_rate": 1.918063970691842e-06,
|
|
"loss": 1.162,
|
|
"mean_token_accuracy": 0.7202498912811279,
|
|
"num_tokens": 847086459.0,
|
|
"step": 9190
|
|
},
|
|
{
|
|
"entropy": 1.15703125,
|
|
"epoch": 1.2573459067924013,
|
|
"grad_norm": 0.13066188437357856,
|
|
"learning_rate": 1.914541355502325e-06,
|
|
"loss": 1.1417,
|
|
"mean_token_accuracy": 0.7227379858493805,
|
|
"num_tokens": 847965115.0,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"entropy": 1.13046875,
|
|
"epoch": 1.2587125871258713,
|
|
"grad_norm": 0.1151317634410843,
|
|
"learning_rate": 1.9110187403128082e-06,
|
|
"loss": 1.1327,
|
|
"mean_token_accuracy": 0.7249361932277679,
|
|
"num_tokens": 848874699.0,
|
|
"step": 9210
|
|
},
|
|
{
|
|
"entropy": 1.13828125,
|
|
"epoch": 1.2600792674593413,
|
|
"grad_norm": 0.11441984364458327,
|
|
"learning_rate": 1.9074961251232918e-06,
|
|
"loss": 1.1499,
|
|
"mean_token_accuracy": 0.7245885789394378,
|
|
"num_tokens": 849836872.0,
|
|
"step": 9220
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 1.2614459477928113,
|
|
"grad_norm": 0.1325084706931591,
|
|
"learning_rate": 1.903973509933775e-06,
|
|
"loss": 1.1318,
|
|
"mean_token_accuracy": 0.723566061258316,
|
|
"num_tokens": 850740853.0,
|
|
"step": 9230
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 1.2628126281262813,
|
|
"grad_norm": 0.122259467506771,
|
|
"learning_rate": 1.9004508947442582e-06,
|
|
"loss": 1.1789,
|
|
"mean_token_accuracy": 0.7178839862346649,
|
|
"num_tokens": 851633495.0,
|
|
"step": 9240
|
|
},
|
|
{
|
|
"entropy": 1.100390625,
|
|
"epoch": 1.2641793084597512,
|
|
"grad_norm": 0.14099055324455587,
|
|
"learning_rate": 1.8969282795547418e-06,
|
|
"loss": 1.1105,
|
|
"mean_token_accuracy": 0.7309195041656494,
|
|
"num_tokens": 852532177.0,
|
|
"step": 9250
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 1.2655459887932212,
|
|
"grad_norm": 0.1171778252964458,
|
|
"learning_rate": 1.8934056643652249e-06,
|
|
"loss": 1.1503,
|
|
"mean_token_accuracy": 0.7225196599960327,
|
|
"num_tokens": 853442859.0,
|
|
"step": 9260
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.2669126691266912,
|
|
"grad_norm": 0.12197958305042671,
|
|
"learning_rate": 1.889883049175708e-06,
|
|
"loss": 1.137,
|
|
"mean_token_accuracy": 0.7267388939857483,
|
|
"num_tokens": 854344107.0,
|
|
"step": 9270
|
|
},
|
|
{
|
|
"entropy": 1.09609375,
|
|
"epoch": 1.2682793494601612,
|
|
"grad_norm": 0.11339111316442285,
|
|
"learning_rate": 1.8863604339861916e-06,
|
|
"loss": 1.097,
|
|
"mean_token_accuracy": 0.7322591245174408,
|
|
"num_tokens": 855262838.0,
|
|
"step": 9280
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.2696460297936314,
|
|
"grad_norm": 0.12554465520625516,
|
|
"learning_rate": 1.8828378187966747e-06,
|
|
"loss": 1.1731,
|
|
"mean_token_accuracy": 0.7203898847103118,
|
|
"num_tokens": 856200075.0,
|
|
"step": 9290
|
|
},
|
|
{
|
|
"entropy": 1.125,
|
|
"epoch": 1.2710127101271014,
|
|
"grad_norm": 0.13193462813471937,
|
|
"learning_rate": 1.8793152036071582e-06,
|
|
"loss": 1.1309,
|
|
"mean_token_accuracy": 0.7271068155765533,
|
|
"num_tokens": 857113863.0,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"entropy": 1.1125,
|
|
"epoch": 1.2723793904605714,
|
|
"grad_norm": 0.11387184183906215,
|
|
"learning_rate": 1.8757925884176414e-06,
|
|
"loss": 1.1193,
|
|
"mean_token_accuracy": 0.7280774533748626,
|
|
"num_tokens": 858051807.0,
|
|
"step": 9310
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 1.2737460707940413,
|
|
"grad_norm": 0.12486922748794475,
|
|
"learning_rate": 1.8722699732281247e-06,
|
|
"loss": 1.1941,
|
|
"mean_token_accuracy": 0.712821877002716,
|
|
"num_tokens": 859041307.0,
|
|
"step": 9320
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 1.2751127511275113,
|
|
"grad_norm": 0.12654629913077,
|
|
"learning_rate": 1.868747358038608e-06,
|
|
"loss": 1.1839,
|
|
"mean_token_accuracy": 0.7173452556133271,
|
|
"num_tokens": 859948415.0,
|
|
"step": 9330
|
|
},
|
|
{
|
|
"entropy": 1.10625,
|
|
"epoch": 1.2764794314609813,
|
|
"grad_norm": 0.12899092161609677,
|
|
"learning_rate": 1.8652247428490914e-06,
|
|
"loss": 1.1103,
|
|
"mean_token_accuracy": 0.7321920037269593,
|
|
"num_tokens": 860865275.0,
|
|
"step": 9340
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 1.2778461117944513,
|
|
"grad_norm": 0.18589825857753203,
|
|
"learning_rate": 1.8617021276595745e-06,
|
|
"loss": 1.1788,
|
|
"mean_token_accuracy": 0.7205223739147186,
|
|
"num_tokens": 861771377.0,
|
|
"step": 9350
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 1.2792127921279213,
|
|
"grad_norm": 0.13685574230355435,
|
|
"learning_rate": 1.858179512470058e-06,
|
|
"loss": 1.1392,
|
|
"mean_token_accuracy": 0.7277045309543609,
|
|
"num_tokens": 862681652.0,
|
|
"step": 9360
|
|
},
|
|
{
|
|
"entropy": 1.12734375,
|
|
"epoch": 1.2805794724613913,
|
|
"grad_norm": 0.11227429205752089,
|
|
"learning_rate": 1.8546568972805412e-06,
|
|
"loss": 1.138,
|
|
"mean_token_accuracy": 0.7267838656902313,
|
|
"num_tokens": 863660677.0,
|
|
"step": 9370
|
|
},
|
|
{
|
|
"entropy": 1.108203125,
|
|
"epoch": 1.2819461527948612,
|
|
"grad_norm": 0.13019836087994216,
|
|
"learning_rate": 1.8511342820910245e-06,
|
|
"loss": 1.1063,
|
|
"mean_token_accuracy": 0.7307425320148468,
|
|
"num_tokens": 864575910.0,
|
|
"step": 9380
|
|
},
|
|
{
|
|
"entropy": 1.1046875,
|
|
"epoch": 1.2833128331283312,
|
|
"grad_norm": 0.12848448820212352,
|
|
"learning_rate": 1.8476116669015078e-06,
|
|
"loss": 1.1047,
|
|
"mean_token_accuracy": 0.7337555944919586,
|
|
"num_tokens": 865515416.0,
|
|
"step": 9390
|
|
},
|
|
{
|
|
"entropy": 1.115234375,
|
|
"epoch": 1.2846795134618012,
|
|
"grad_norm": 0.11936121103938083,
|
|
"learning_rate": 1.8440890517119912e-06,
|
|
"loss": 1.1107,
|
|
"mean_token_accuracy": 0.7309712171554565,
|
|
"num_tokens": 866447102.0,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"entropy": 1.13125,
|
|
"epoch": 1.2860461937952712,
|
|
"grad_norm": 0.13412853956778975,
|
|
"learning_rate": 1.8405664365224743e-06,
|
|
"loss": 1.129,
|
|
"mean_token_accuracy": 0.7280626714229583,
|
|
"num_tokens": 867334361.0,
|
|
"step": 9410
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 1.2874128741287412,
|
|
"grad_norm": 0.12696115661262372,
|
|
"learning_rate": 1.8370438213329578e-06,
|
|
"loss": 1.1896,
|
|
"mean_token_accuracy": 0.7166397273540497,
|
|
"num_tokens": 868306198.0,
|
|
"step": 9420
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 1.2887795544622112,
|
|
"grad_norm": 0.1223633397574027,
|
|
"learning_rate": 1.833521206143441e-06,
|
|
"loss": 1.1521,
|
|
"mean_token_accuracy": 0.7238766729831696,
|
|
"num_tokens": 869231963.0,
|
|
"step": 9430
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 1.2901462347956814,
|
|
"grad_norm": 0.1305489507001326,
|
|
"learning_rate": 1.8299985909539245e-06,
|
|
"loss": 1.1951,
|
|
"mean_token_accuracy": 0.7149894595146179,
|
|
"num_tokens": 870118000.0,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"entropy": 1.160546875,
|
|
"epoch": 1.2915129151291513,
|
|
"grad_norm": 0.13148927938901983,
|
|
"learning_rate": 1.8264759757644076e-06,
|
|
"loss": 1.1561,
|
|
"mean_token_accuracy": 0.7229930102825165,
|
|
"num_tokens": 871032920.0,
|
|
"step": 9450
|
|
},
|
|
{
|
|
"entropy": 1.11875,
|
|
"epoch": 1.2928795954626213,
|
|
"grad_norm": 0.13141578197257942,
|
|
"learning_rate": 1.822953360574891e-06,
|
|
"loss": 1.1236,
|
|
"mean_token_accuracy": 0.7292287647724152,
|
|
"num_tokens": 871910994.0,
|
|
"step": 9460
|
|
},
|
|
{
|
|
"entropy": 1.112109375,
|
|
"epoch": 1.2942462757960913,
|
|
"grad_norm": 0.11069273846240486,
|
|
"learning_rate": 1.8194307453853743e-06,
|
|
"loss": 1.1101,
|
|
"mean_token_accuracy": 0.7316868722438812,
|
|
"num_tokens": 872819239.0,
|
|
"step": 9470
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.2956129561295613,
|
|
"grad_norm": 0.12887293661817775,
|
|
"learning_rate": 1.8159081301958576e-06,
|
|
"loss": 1.1861,
|
|
"mean_token_accuracy": 0.7184693276882171,
|
|
"num_tokens": 873762298.0,
|
|
"step": 9480
|
|
},
|
|
{
|
|
"entropy": 1.1453125,
|
|
"epoch": 1.2969796364630313,
|
|
"grad_norm": 0.11142898376909992,
|
|
"learning_rate": 1.8123855150063408e-06,
|
|
"loss": 1.1478,
|
|
"mean_token_accuracy": 0.7251074910163879,
|
|
"num_tokens": 874698488.0,
|
|
"step": 9490
|
|
},
|
|
{
|
|
"entropy": 1.14375,
|
|
"epoch": 1.2983463167965013,
|
|
"grad_norm": 0.12375583816415528,
|
|
"learning_rate": 1.8088628998168243e-06,
|
|
"loss": 1.1651,
|
|
"mean_token_accuracy": 0.7218458354473114,
|
|
"num_tokens": 875593896.0,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"entropy": 1.105859375,
|
|
"epoch": 1.2997129971299712,
|
|
"grad_norm": 0.13069860611156106,
|
|
"learning_rate": 1.8053402846273074e-06,
|
|
"loss": 1.1,
|
|
"mean_token_accuracy": 0.7326761603355407,
|
|
"num_tokens": 876502712.0,
|
|
"step": 9510
|
|
},
|
|
{
|
|
"entropy": 1.07109375,
|
|
"epoch": 1.3010796774634412,
|
|
"grad_norm": 0.12108072327602613,
|
|
"learning_rate": 1.8018176694377906e-06,
|
|
"loss": 1.0822,
|
|
"mean_token_accuracy": 0.7358316838741302,
|
|
"num_tokens": 877406242.0,
|
|
"step": 9520
|
|
},
|
|
{
|
|
"entropy": 1.1453125,
|
|
"epoch": 1.3024463577969114,
|
|
"grad_norm": 0.12260377924077265,
|
|
"learning_rate": 1.798295054248274e-06,
|
|
"loss": 1.162,
|
|
"mean_token_accuracy": 0.7208038330078125,
|
|
"num_tokens": 878297818.0,
|
|
"step": 9530
|
|
},
|
|
{
|
|
"entropy": 1.123828125,
|
|
"epoch": 1.3038130381303814,
|
|
"grad_norm": 0.14163153626215408,
|
|
"learning_rate": 1.7947724390587572e-06,
|
|
"loss": 1.12,
|
|
"mean_token_accuracy": 0.7294059872627259,
|
|
"num_tokens": 879197968.0,
|
|
"step": 9540
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 1.3051797184638514,
|
|
"grad_norm": 0.1255503603868471,
|
|
"learning_rate": 1.7912498238692408e-06,
|
|
"loss": 1.1548,
|
|
"mean_token_accuracy": 0.7227218627929688,
|
|
"num_tokens": 880144165.0,
|
|
"step": 9550
|
|
},
|
|
{
|
|
"entropy": 1.119921875,
|
|
"epoch": 1.3065463987973214,
|
|
"grad_norm": 0.12046631493721405,
|
|
"learning_rate": 1.7877272086797239e-06,
|
|
"loss": 1.1384,
|
|
"mean_token_accuracy": 0.7270127415657044,
|
|
"num_tokens": 881067277.0,
|
|
"step": 9560
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 1.3079130791307914,
|
|
"grad_norm": 0.11757941719044496,
|
|
"learning_rate": 1.7842045934902072e-06,
|
|
"loss": 1.1631,
|
|
"mean_token_accuracy": 0.7188346862792969,
|
|
"num_tokens": 882005499.0,
|
|
"step": 9570
|
|
},
|
|
{
|
|
"entropy": 1.138671875,
|
|
"epoch": 1.3092797594642613,
|
|
"grad_norm": 0.11224981404865213,
|
|
"learning_rate": 1.7806819783006908e-06,
|
|
"loss": 1.1368,
|
|
"mean_token_accuracy": 0.727442741394043,
|
|
"num_tokens": 882921824.0,
|
|
"step": 9580
|
|
},
|
|
{
|
|
"entropy": 1.115234375,
|
|
"epoch": 1.3106464397977313,
|
|
"grad_norm": 0.1164007386534604,
|
|
"learning_rate": 1.777159363111174e-06,
|
|
"loss": 1.1315,
|
|
"mean_token_accuracy": 0.7273443937301636,
|
|
"num_tokens": 883846035.0,
|
|
"step": 9590
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 1.3120131201312013,
|
|
"grad_norm": 0.1120874001154016,
|
|
"learning_rate": 1.773636747921657e-06,
|
|
"loss": 1.1266,
|
|
"mean_token_accuracy": 0.7285519659519195,
|
|
"num_tokens": 884719258.0,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"entropy": 1.1328125,
|
|
"epoch": 1.3133798004646713,
|
|
"grad_norm": 0.1242669646810013,
|
|
"learning_rate": 1.7701141327321406e-06,
|
|
"loss": 1.1239,
|
|
"mean_token_accuracy": 0.729374223947525,
|
|
"num_tokens": 885675336.0,
|
|
"step": 9610
|
|
},
|
|
{
|
|
"entropy": 1.153125,
|
|
"epoch": 1.3147464807981413,
|
|
"grad_norm": 0.11110390863149527,
|
|
"learning_rate": 1.7665915175426237e-06,
|
|
"loss": 1.154,
|
|
"mean_token_accuracy": 0.7233369708061218,
|
|
"num_tokens": 886645805.0,
|
|
"step": 9620
|
|
},
|
|
{
|
|
"entropy": 1.121875,
|
|
"epoch": 1.3161131611316113,
|
|
"grad_norm": 0.13235372935866377,
|
|
"learning_rate": 1.763068902353107e-06,
|
|
"loss": 1.1314,
|
|
"mean_token_accuracy": 0.7258896410465241,
|
|
"num_tokens": 887541444.0,
|
|
"step": 9630
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.3174798414650812,
|
|
"grad_norm": 0.11705002124030302,
|
|
"learning_rate": 1.7595462871635904e-06,
|
|
"loss": 1.1872,
|
|
"mean_token_accuracy": 0.7153473496437073,
|
|
"num_tokens": 888431199.0,
|
|
"step": 9640
|
|
},
|
|
{
|
|
"entropy": 1.0953125,
|
|
"epoch": 1.3188465217985512,
|
|
"grad_norm": 0.15667168450597824,
|
|
"learning_rate": 1.7560236719740737e-06,
|
|
"loss": 1.0948,
|
|
"mean_token_accuracy": 0.7336769521236419,
|
|
"num_tokens": 889364908.0,
|
|
"step": 9650
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 1.3202132021320212,
|
|
"grad_norm": 0.11149799889869759,
|
|
"learning_rate": 1.752501056784557e-06,
|
|
"loss": 1.1308,
|
|
"mean_token_accuracy": 0.7312242448329925,
|
|
"num_tokens": 890313125.0,
|
|
"step": 9660
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 1.3215798824654912,
|
|
"grad_norm": 0.11286014360777594,
|
|
"learning_rate": 1.7489784415950404e-06,
|
|
"loss": 1.1485,
|
|
"mean_token_accuracy": 0.7226826131343842,
|
|
"num_tokens": 891198196.0,
|
|
"step": 9670
|
|
},
|
|
{
|
|
"entropy": 1.112890625,
|
|
"epoch": 1.3229465627989614,
|
|
"grad_norm": 0.12820210328943715,
|
|
"learning_rate": 1.7454558264055235e-06,
|
|
"loss": 1.1084,
|
|
"mean_token_accuracy": 0.7311557650566101,
|
|
"num_tokens": 892094044.0,
|
|
"step": 9680
|
|
},
|
|
{
|
|
"entropy": 1.1265625,
|
|
"epoch": 1.3243132431324314,
|
|
"grad_norm": 0.12844664045005375,
|
|
"learning_rate": 1.741933211216007e-06,
|
|
"loss": 1.1362,
|
|
"mean_token_accuracy": 0.7282798409461975,
|
|
"num_tokens": 892984168.0,
|
|
"step": 9690
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 1.3256799234659014,
|
|
"grad_norm": 0.1520364821414003,
|
|
"learning_rate": 1.7384105960264902e-06,
|
|
"loss": 1.1409,
|
|
"mean_token_accuracy": 0.7290291130542755,
|
|
"num_tokens": 893912834.0,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 1.3270466037993713,
|
|
"grad_norm": 0.13260637611708134,
|
|
"learning_rate": 1.7348879808369735e-06,
|
|
"loss": 1.1897,
|
|
"mean_token_accuracy": 0.7132637202739716,
|
|
"num_tokens": 894862912.0,
|
|
"step": 9710
|
|
},
|
|
{
|
|
"entropy": 1.115625,
|
|
"epoch": 1.3284132841328413,
|
|
"grad_norm": 0.12590065612186976,
|
|
"learning_rate": 1.7313653656474568e-06,
|
|
"loss": 1.134,
|
|
"mean_token_accuracy": 0.7261955797672272,
|
|
"num_tokens": 895810211.0,
|
|
"step": 9720
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.3297799644663113,
|
|
"grad_norm": 0.12362462997183474,
|
|
"learning_rate": 1.7278427504579402e-06,
|
|
"loss": 1.1842,
|
|
"mean_token_accuracy": 0.7179460763931275,
|
|
"num_tokens": 896767345.0,
|
|
"step": 9730
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.3311466447997813,
|
|
"grad_norm": 0.11890462458289028,
|
|
"learning_rate": 1.7243201352684233e-06,
|
|
"loss": 1.1969,
|
|
"mean_token_accuracy": 0.7149165511131287,
|
|
"num_tokens": 897730037.0,
|
|
"step": 9740
|
|
},
|
|
{
|
|
"entropy": 1.155859375,
|
|
"epoch": 1.3325133251332513,
|
|
"grad_norm": 0.13304170291348116,
|
|
"learning_rate": 1.7207975200789068e-06,
|
|
"loss": 1.1644,
|
|
"mean_token_accuracy": 0.720121294260025,
|
|
"num_tokens": 898654053.0,
|
|
"step": 9750
|
|
},
|
|
{
|
|
"entropy": 1.130859375,
|
|
"epoch": 1.3338800054667213,
|
|
"grad_norm": 0.13174088189508898,
|
|
"learning_rate": 1.71727490488939e-06,
|
|
"loss": 1.1332,
|
|
"mean_token_accuracy": 0.7259802639484405,
|
|
"num_tokens": 899526007.0,
|
|
"step": 9760
|
|
},
|
|
{
|
|
"entropy": 1.116015625,
|
|
"epoch": 1.3352466858001915,
|
|
"grad_norm": 0.1290537188329695,
|
|
"learning_rate": 1.7137522896998735e-06,
|
|
"loss": 1.1399,
|
|
"mean_token_accuracy": 0.7256585836410523,
|
|
"num_tokens": 900457138.0,
|
|
"step": 9770
|
|
},
|
|
{
|
|
"entropy": 1.091796875,
|
|
"epoch": 1.3366133661336614,
|
|
"grad_norm": 0.11321127696650404,
|
|
"learning_rate": 1.7102296745103566e-06,
|
|
"loss": 1.0954,
|
|
"mean_token_accuracy": 0.7358497262001038,
|
|
"num_tokens": 901342348.0,
|
|
"step": 9780
|
|
},
|
|
{
|
|
"entropy": 1.15703125,
|
|
"epoch": 1.3379800464671314,
|
|
"grad_norm": 0.13555180433013675,
|
|
"learning_rate": 1.7067070593208398e-06,
|
|
"loss": 1.1595,
|
|
"mean_token_accuracy": 0.7216777622699737,
|
|
"num_tokens": 902289247.0,
|
|
"step": 9790
|
|
},
|
|
{
|
|
"entropy": 1.131640625,
|
|
"epoch": 1.3393467268006014,
|
|
"grad_norm": 0.11942488028235804,
|
|
"learning_rate": 1.7031844441313233e-06,
|
|
"loss": 1.1393,
|
|
"mean_token_accuracy": 0.7264927148818969,
|
|
"num_tokens": 903216964.0,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"entropy": 1.12109375,
|
|
"epoch": 1.3407134071340714,
|
|
"grad_norm": 0.11700574744145754,
|
|
"learning_rate": 1.6996618289418064e-06,
|
|
"loss": 1.132,
|
|
"mean_token_accuracy": 0.7282067716121674,
|
|
"num_tokens": 904149583.0,
|
|
"step": 9810
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 1.3420800874675414,
|
|
"grad_norm": 0.1264381577329388,
|
|
"learning_rate": 1.6961392137522898e-06,
|
|
"loss": 1.1499,
|
|
"mean_token_accuracy": 0.7218035817146301,
|
|
"num_tokens": 905019600.0,
|
|
"step": 9820
|
|
},
|
|
{
|
|
"entropy": 1.096484375,
|
|
"epoch": 1.3434467678010114,
|
|
"grad_norm": 0.11965628174295416,
|
|
"learning_rate": 1.6926165985627733e-06,
|
|
"loss": 1.0859,
|
|
"mean_token_accuracy": 0.7363412737846374,
|
|
"num_tokens": 905895865.0,
|
|
"step": 9830
|
|
},
|
|
{
|
|
"entropy": 1.15703125,
|
|
"epoch": 1.3448134481344813,
|
|
"grad_norm": 0.13305376704806485,
|
|
"learning_rate": 1.6890939833732564e-06,
|
|
"loss": 1.1693,
|
|
"mean_token_accuracy": 0.7187966585159302,
|
|
"num_tokens": 906847363.0,
|
|
"step": 9840
|
|
},
|
|
{
|
|
"entropy": 1.16328125,
|
|
"epoch": 1.3461801284679513,
|
|
"grad_norm": 0.1294699939439715,
|
|
"learning_rate": 1.6855713681837396e-06,
|
|
"loss": 1.1726,
|
|
"mean_token_accuracy": 0.720471876859665,
|
|
"num_tokens": 907782159.0,
|
|
"step": 9850
|
|
},
|
|
{
|
|
"entropy": 1.122265625,
|
|
"epoch": 1.3475468088014213,
|
|
"grad_norm": 0.12631282887876366,
|
|
"learning_rate": 1.6820487529942231e-06,
|
|
"loss": 1.1106,
|
|
"mean_token_accuracy": 0.7299838542938233,
|
|
"num_tokens": 908694428.0,
|
|
"step": 9860
|
|
},
|
|
{
|
|
"entropy": 1.11328125,
|
|
"epoch": 1.3489134891348913,
|
|
"grad_norm": 0.14199445907786085,
|
|
"learning_rate": 1.6785261378047062e-06,
|
|
"loss": 1.1087,
|
|
"mean_token_accuracy": 0.7305295169353485,
|
|
"num_tokens": 909610885.0,
|
|
"step": 9870
|
|
},
|
|
{
|
|
"entropy": 1.134375,
|
|
"epoch": 1.3502801694683613,
|
|
"grad_norm": 0.13151164898953854,
|
|
"learning_rate": 1.6750035226151898e-06,
|
|
"loss": 1.1297,
|
|
"mean_token_accuracy": 0.7277298212051392,
|
|
"num_tokens": 910486074.0,
|
|
"step": 9880
|
|
},
|
|
{
|
|
"entropy": 1.12109375,
|
|
"epoch": 1.3516468498018313,
|
|
"grad_norm": 0.12991869099539746,
|
|
"learning_rate": 1.671480907425673e-06,
|
|
"loss": 1.1198,
|
|
"mean_token_accuracy": 0.7287248730659485,
|
|
"num_tokens": 911424517.0,
|
|
"step": 9890
|
|
},
|
|
{
|
|
"entropy": 1.13828125,
|
|
"epoch": 1.3530135301353012,
|
|
"grad_norm": 0.11622106800972831,
|
|
"learning_rate": 1.6679582922361562e-06,
|
|
"loss": 1.144,
|
|
"mean_token_accuracy": 0.7258367359638214,
|
|
"num_tokens": 912321096.0,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"entropy": 1.14375,
|
|
"epoch": 1.3543802104687712,
|
|
"grad_norm": 0.12024921585842754,
|
|
"learning_rate": 1.6644356770466396e-06,
|
|
"loss": 1.1431,
|
|
"mean_token_accuracy": 0.7243121147155762,
|
|
"num_tokens": 913256636.0,
|
|
"step": 9910
|
|
},
|
|
{
|
|
"entropy": 1.1390625,
|
|
"epoch": 1.3557468908022414,
|
|
"grad_norm": 0.17103726275897554,
|
|
"learning_rate": 1.660913061857123e-06,
|
|
"loss": 1.1453,
|
|
"mean_token_accuracy": 0.7249217808246613,
|
|
"num_tokens": 914191751.0,
|
|
"step": 9920
|
|
},
|
|
{
|
|
"entropy": 1.098828125,
|
|
"epoch": 1.3571135711357114,
|
|
"grad_norm": 0.12571933408901723,
|
|
"learning_rate": 1.657390446667606e-06,
|
|
"loss": 1.0866,
|
|
"mean_token_accuracy": 0.7367975354194641,
|
|
"num_tokens": 915113189.0,
|
|
"step": 9930
|
|
},
|
|
{
|
|
"entropy": 1.13125,
|
|
"epoch": 1.3584802514691814,
|
|
"grad_norm": 0.12512296420301228,
|
|
"learning_rate": 1.6538678314780896e-06,
|
|
"loss": 1.1426,
|
|
"mean_token_accuracy": 0.7253833115100861,
|
|
"num_tokens": 916020500.0,
|
|
"step": 9940
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 1.3598469318026514,
|
|
"grad_norm": 0.13553930769370245,
|
|
"learning_rate": 1.6503452162885727e-06,
|
|
"loss": 1.2139,
|
|
"mean_token_accuracy": 0.7105623066425324,
|
|
"num_tokens": 916915733.0,
|
|
"step": 9950
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 1.3612136121361214,
|
|
"grad_norm": 0.12675864455822014,
|
|
"learning_rate": 1.646822601099056e-06,
|
|
"loss": 1.1427,
|
|
"mean_token_accuracy": 0.7246937930583954,
|
|
"num_tokens": 917832257.0,
|
|
"step": 9960
|
|
},
|
|
{
|
|
"entropy": 1.122265625,
|
|
"epoch": 1.3625802924695913,
|
|
"grad_norm": 0.12715770292537565,
|
|
"learning_rate": 1.6432999859095394e-06,
|
|
"loss": 1.1414,
|
|
"mean_token_accuracy": 0.7272812008857727,
|
|
"num_tokens": 918751964.0,
|
|
"step": 9970
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 1.3639469728030613,
|
|
"grad_norm": 0.12438047657940991,
|
|
"learning_rate": 1.6397773707200227e-06,
|
|
"loss": 1.1923,
|
|
"mean_token_accuracy": 0.7146305561065673,
|
|
"num_tokens": 919683718.0,
|
|
"step": 9980
|
|
},
|
|
{
|
|
"entropy": 1.119921875,
|
|
"epoch": 1.3653136531365313,
|
|
"grad_norm": 0.11655682025397461,
|
|
"learning_rate": 1.636254755530506e-06,
|
|
"loss": 1.1302,
|
|
"mean_token_accuracy": 0.7253379464149475,
|
|
"num_tokens": 920567825.0,
|
|
"step": 9990
|
|
},
|
|
{
|
|
"entropy": 1.062109375,
|
|
"epoch": 1.3666803334700013,
|
|
"grad_norm": 0.11475603650469983,
|
|
"learning_rate": 1.6327321403409894e-06,
|
|
"loss": 1.0705,
|
|
"mean_token_accuracy": 0.7379503607749939,
|
|
"num_tokens": 921479711.0,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 1.3680470138034715,
|
|
"grad_norm": 0.13316804441232394,
|
|
"learning_rate": 1.6292095251514725e-06,
|
|
"loss": 1.1593,
|
|
"mean_token_accuracy": 0.7211896240711212,
|
|
"num_tokens": 922388746.0,
|
|
"step": 10010
|
|
},
|
|
{
|
|
"entropy": 1.162109375,
|
|
"epoch": 1.3694136941369415,
|
|
"grad_norm": 0.11993145266201026,
|
|
"learning_rate": 1.625686909961956e-06,
|
|
"loss": 1.1747,
|
|
"mean_token_accuracy": 0.7182297468185425,
|
|
"num_tokens": 923329842.0,
|
|
"step": 10020
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 1.3707803744704115,
|
|
"grad_norm": 0.1976541800516837,
|
|
"learning_rate": 1.6221642947724392e-06,
|
|
"loss": 1.1592,
|
|
"mean_token_accuracy": 0.7232244431972503,
|
|
"num_tokens": 924206127.0,
|
|
"step": 10030
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 1.3721470548038814,
|
|
"grad_norm": 0.12883703834537447,
|
|
"learning_rate": 1.6186416795829223e-06,
|
|
"loss": 1.1806,
|
|
"mean_token_accuracy": 0.7178798615932465,
|
|
"num_tokens": 925150765.0,
|
|
"step": 10040
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 1.3735137351373514,
|
|
"grad_norm": 0.1286369756353084,
|
|
"learning_rate": 1.6151190643934058e-06,
|
|
"loss": 1.1646,
|
|
"mean_token_accuracy": 0.7192493140697479,
|
|
"num_tokens": 926087915.0,
|
|
"step": 10050
|
|
},
|
|
{
|
|
"entropy": 1.11953125,
|
|
"epoch": 1.3748804154708214,
|
|
"grad_norm": 0.12107636110690413,
|
|
"learning_rate": 1.6115964492038892e-06,
|
|
"loss": 1.1199,
|
|
"mean_token_accuracy": 0.7292195439338685,
|
|
"num_tokens": 926998850.0,
|
|
"step": 10060
|
|
},
|
|
{
|
|
"entropy": 1.12890625,
|
|
"epoch": 1.3762470958042914,
|
|
"grad_norm": 0.11508390579741043,
|
|
"learning_rate": 1.6080738340143723e-06,
|
|
"loss": 1.1376,
|
|
"mean_token_accuracy": 0.7273702442646026,
|
|
"num_tokens": 927939577.0,
|
|
"step": 10070
|
|
},
|
|
{
|
|
"entropy": 1.09453125,
|
|
"epoch": 1.3776137761377614,
|
|
"grad_norm": 0.11980103917913609,
|
|
"learning_rate": 1.6045512188248559e-06,
|
|
"loss": 1.1016,
|
|
"mean_token_accuracy": 0.7339250922203064,
|
|
"num_tokens": 928810365.0,
|
|
"step": 10080
|
|
},
|
|
{
|
|
"entropy": 1.154296875,
|
|
"epoch": 1.3789804564712314,
|
|
"grad_norm": 0.12323752692983242,
|
|
"learning_rate": 1.601028603635339e-06,
|
|
"loss": 1.1576,
|
|
"mean_token_accuracy": 0.7235203564167023,
|
|
"num_tokens": 929705162.0,
|
|
"step": 10090
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 1.3803471368047013,
|
|
"grad_norm": 0.12975072153688783,
|
|
"learning_rate": 1.5975059884458225e-06,
|
|
"loss": 1.1633,
|
|
"mean_token_accuracy": 0.7207797348499299,
|
|
"num_tokens": 930678360.0,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"entropy": 1.128515625,
|
|
"epoch": 1.3817138171381713,
|
|
"grad_norm": 0.12545417890232785,
|
|
"learning_rate": 1.5939833732563056e-06,
|
|
"loss": 1.1287,
|
|
"mean_token_accuracy": 0.7278427302837371,
|
|
"num_tokens": 931601996.0,
|
|
"step": 10110
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.3830804974716413,
|
|
"grad_norm": 0.11509571239792743,
|
|
"learning_rate": 1.5904607580667888e-06,
|
|
"loss": 1.152,
|
|
"mean_token_accuracy": 0.7221798956394195,
|
|
"num_tokens": 932551505.0,
|
|
"step": 10120
|
|
},
|
|
{
|
|
"entropy": 1.1328125,
|
|
"epoch": 1.3844471778051113,
|
|
"grad_norm": 0.14212122880539452,
|
|
"learning_rate": 1.5869381428772723e-06,
|
|
"loss": 1.1487,
|
|
"mean_token_accuracy": 0.7246949195861816,
|
|
"num_tokens": 933463791.0,
|
|
"step": 10130
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.3858138581385813,
|
|
"grad_norm": 0.12351127621022603,
|
|
"learning_rate": 1.5834155276877554e-06,
|
|
"loss": 1.1578,
|
|
"mean_token_accuracy": 0.7227281630039215,
|
|
"num_tokens": 934370194.0,
|
|
"step": 10140
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 1.3871805384720512,
|
|
"grad_norm": 0.12570257037325266,
|
|
"learning_rate": 1.5798929124982388e-06,
|
|
"loss": 1.1516,
|
|
"mean_token_accuracy": 0.7223952651023865,
|
|
"num_tokens": 935250557.0,
|
|
"step": 10150
|
|
},
|
|
{
|
|
"entropy": 1.119921875,
|
|
"epoch": 1.3885472188055215,
|
|
"grad_norm": 0.11864769395741905,
|
|
"learning_rate": 1.5763702973087221e-06,
|
|
"loss": 1.1374,
|
|
"mean_token_accuracy": 0.7246994912624359,
|
|
"num_tokens": 936144286.0,
|
|
"step": 10160
|
|
},
|
|
{
|
|
"entropy": 1.10703125,
|
|
"epoch": 1.3899138991389914,
|
|
"grad_norm": 0.14769369590341186,
|
|
"learning_rate": 1.5728476821192054e-06,
|
|
"loss": 1.1161,
|
|
"mean_token_accuracy": 0.7310448050498962,
|
|
"num_tokens": 937054635.0,
|
|
"step": 10170
|
|
},
|
|
{
|
|
"entropy": 1.187890625,
|
|
"epoch": 1.3912805794724614,
|
|
"grad_norm": 0.12332239377453258,
|
|
"learning_rate": 1.5693250669296886e-06,
|
|
"loss": 1.197,
|
|
"mean_token_accuracy": 0.7155725657939911,
|
|
"num_tokens": 938010446.0,
|
|
"step": 10180
|
|
},
|
|
{
|
|
"entropy": 1.1171875,
|
|
"epoch": 1.3926472598059314,
|
|
"grad_norm": 0.1216194378344364,
|
|
"learning_rate": 1.5658024517401721e-06,
|
|
"loss": 1.1246,
|
|
"mean_token_accuracy": 0.7286647975444793,
|
|
"num_tokens": 938926013.0,
|
|
"step": 10190
|
|
},
|
|
{
|
|
"entropy": 1.09921875,
|
|
"epoch": 1.3940139401394014,
|
|
"grad_norm": 0.11744763770684995,
|
|
"learning_rate": 1.5622798365506552e-06,
|
|
"loss": 1.1098,
|
|
"mean_token_accuracy": 0.7320698797702789,
|
|
"num_tokens": 939867123.0,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 1.3953806204728714,
|
|
"grad_norm": 0.11735181990172476,
|
|
"learning_rate": 1.5587572213611388e-06,
|
|
"loss": 1.1428,
|
|
"mean_token_accuracy": 0.7262371778488159,
|
|
"num_tokens": 940784288.0,
|
|
"step": 10210
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.3967473008063414,
|
|
"grad_norm": 0.12735038891623535,
|
|
"learning_rate": 1.555234606171622e-06,
|
|
"loss": 1.149,
|
|
"mean_token_accuracy": 0.7247069895267486,
|
|
"num_tokens": 941732009.0,
|
|
"step": 10220
|
|
},
|
|
{
|
|
"entropy": 1.139453125,
|
|
"epoch": 1.3981139811398113,
|
|
"grad_norm": 0.10761030335825769,
|
|
"learning_rate": 1.5517119909821052e-06,
|
|
"loss": 1.1429,
|
|
"mean_token_accuracy": 0.7241693735122681,
|
|
"num_tokens": 942687281.0,
|
|
"step": 10230
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 1.3994806614732813,
|
|
"grad_norm": 0.1288932913885588,
|
|
"learning_rate": 1.5481893757925886e-06,
|
|
"loss": 1.1572,
|
|
"mean_token_accuracy": 0.7228234767913818,
|
|
"num_tokens": 943633306.0,
|
|
"step": 10240
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 1.4008473418067515,
|
|
"grad_norm": 0.14420635008046045,
|
|
"learning_rate": 1.544666760603072e-06,
|
|
"loss": 1.1625,
|
|
"mean_token_accuracy": 0.7204009115695953,
|
|
"num_tokens": 944512974.0,
|
|
"step": 10250
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 1.4022140221402215,
|
|
"grad_norm": 0.12974847547737886,
|
|
"learning_rate": 1.541144145413555e-06,
|
|
"loss": 1.2082,
|
|
"mean_token_accuracy": 0.7119271636009217,
|
|
"num_tokens": 945460175.0,
|
|
"step": 10260
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.4035807024736915,
|
|
"grad_norm": 0.11982364502844581,
|
|
"learning_rate": 1.5376215302240386e-06,
|
|
"loss": 1.1638,
|
|
"mean_token_accuracy": 0.7235591769218445,
|
|
"num_tokens": 946446899.0,
|
|
"step": 10270
|
|
},
|
|
{
|
|
"entropy": 1.133203125,
|
|
"epoch": 1.4049473828071615,
|
|
"grad_norm": 0.12036935649165953,
|
|
"learning_rate": 1.5340989150345217e-06,
|
|
"loss": 1.1461,
|
|
"mean_token_accuracy": 0.7254356205463409,
|
|
"num_tokens": 947379575.0,
|
|
"step": 10280
|
|
},
|
|
{
|
|
"entropy": 1.090625,
|
|
"epoch": 1.4063140631406315,
|
|
"grad_norm": 0.11601135924494972,
|
|
"learning_rate": 1.5305762998450048e-06,
|
|
"loss": 1.1068,
|
|
"mean_token_accuracy": 0.7333293616771698,
|
|
"num_tokens": 948285980.0,
|
|
"step": 10290
|
|
},
|
|
{
|
|
"entropy": 1.197265625,
|
|
"epoch": 1.4076807434741014,
|
|
"grad_norm": 0.13812752098137976,
|
|
"learning_rate": 1.5270536846554884e-06,
|
|
"loss": 1.206,
|
|
"mean_token_accuracy": 0.7138454854488373,
|
|
"num_tokens": 949204652.0,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"entropy": 1.1734375,
|
|
"epoch": 1.4090474238075714,
|
|
"grad_norm": 0.13969164572643764,
|
|
"learning_rate": 1.5235310694659717e-06,
|
|
"loss": 1.1728,
|
|
"mean_token_accuracy": 0.7181223928928375,
|
|
"num_tokens": 950145164.0,
|
|
"step": 10310
|
|
},
|
|
{
|
|
"entropy": 1.1078125,
|
|
"epoch": 1.4104141041410414,
|
|
"grad_norm": 0.21467604596301562,
|
|
"learning_rate": 1.5200084542764548e-06,
|
|
"loss": 1.0934,
|
|
"mean_token_accuracy": 0.7336738169193268,
|
|
"num_tokens": 951084582.0,
|
|
"step": 10320
|
|
},
|
|
{
|
|
"entropy": 1.1390625,
|
|
"epoch": 1.4117807844745114,
|
|
"grad_norm": 0.11810493879599651,
|
|
"learning_rate": 1.5164858390869384e-06,
|
|
"loss": 1.1453,
|
|
"mean_token_accuracy": 0.7229368388652802,
|
|
"num_tokens": 952003428.0,
|
|
"step": 10330
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 1.4131474648079814,
|
|
"grad_norm": 0.13161363904840032,
|
|
"learning_rate": 1.5129632238974215e-06,
|
|
"loss": 1.214,
|
|
"mean_token_accuracy": 0.7142141699790955,
|
|
"num_tokens": 952891571.0,
|
|
"step": 10340
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 1.4145141451414514,
|
|
"grad_norm": 0.11716633096195692,
|
|
"learning_rate": 1.509440608707905e-06,
|
|
"loss": 1.1349,
|
|
"mean_token_accuracy": 0.7256884217262268,
|
|
"num_tokens": 953841805.0,
|
|
"step": 10350
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 1.4158808254749213,
|
|
"grad_norm": 0.14710666599527258,
|
|
"learning_rate": 1.5059179935183882e-06,
|
|
"loss": 1.1937,
|
|
"mean_token_accuracy": 0.7156160116195679,
|
|
"num_tokens": 954757093.0,
|
|
"step": 10360
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 1.4172475058083913,
|
|
"grad_norm": 0.14246251645814137,
|
|
"learning_rate": 1.5023953783288713e-06,
|
|
"loss": 1.1622,
|
|
"mean_token_accuracy": 0.7245750963687897,
|
|
"num_tokens": 955688450.0,
|
|
"step": 10370
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.4186141861418613,
|
|
"grad_norm": 0.11359975606948707,
|
|
"learning_rate": 1.4988727631393549e-06,
|
|
"loss": 1.1371,
|
|
"mean_token_accuracy": 0.7238262355327606,
|
|
"num_tokens": 956599499.0,
|
|
"step": 10380
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.4199808664753313,
|
|
"grad_norm": 0.13632200246456636,
|
|
"learning_rate": 1.495350147949838e-06,
|
|
"loss": 1.1657,
|
|
"mean_token_accuracy": 0.7224506080150604,
|
|
"num_tokens": 957556773.0,
|
|
"step": 10390
|
|
},
|
|
{
|
|
"entropy": 1.105078125,
|
|
"epoch": 1.4213475468088015,
|
|
"grad_norm": 0.12097520984173497,
|
|
"learning_rate": 1.4918275327603213e-06,
|
|
"loss": 1.1108,
|
|
"mean_token_accuracy": 0.732928740978241,
|
|
"num_tokens": 958434459.0,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"entropy": 1.103515625,
|
|
"epoch": 1.4227142271422715,
|
|
"grad_norm": 0.1164878783904925,
|
|
"learning_rate": 1.4883049175708047e-06,
|
|
"loss": 1.1194,
|
|
"mean_token_accuracy": 0.7298633217811584,
|
|
"num_tokens": 959305209.0,
|
|
"step": 10410
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.4240809074757415,
|
|
"grad_norm": 0.12034891657527398,
|
|
"learning_rate": 1.484782302381288e-06,
|
|
"loss": 1.1456,
|
|
"mean_token_accuracy": 0.7233862102031707,
|
|
"num_tokens": 960231022.0,
|
|
"step": 10420
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.4254475878092114,
|
|
"grad_norm": 0.12942824021612429,
|
|
"learning_rate": 1.4812596871917711e-06,
|
|
"loss": 1.1621,
|
|
"mean_token_accuracy": 0.7230564475059509,
|
|
"num_tokens": 961131511.0,
|
|
"step": 10430
|
|
},
|
|
{
|
|
"entropy": 1.12265625,
|
|
"epoch": 1.4268142681426814,
|
|
"grad_norm": 0.11602656395586755,
|
|
"learning_rate": 1.4777370720022547e-06,
|
|
"loss": 1.1224,
|
|
"mean_token_accuracy": 0.7290022611618042,
|
|
"num_tokens": 962047298.0,
|
|
"step": 10440
|
|
},
|
|
{
|
|
"entropy": 1.134765625,
|
|
"epoch": 1.4281809484761514,
|
|
"grad_norm": 0.11934777714867545,
|
|
"learning_rate": 1.4742144568127378e-06,
|
|
"loss": 1.1343,
|
|
"mean_token_accuracy": 0.725718754529953,
|
|
"num_tokens": 963000680.0,
|
|
"step": 10450
|
|
},
|
|
{
|
|
"entropy": 1.08984375,
|
|
"epoch": 1.4295476288096214,
|
|
"grad_norm": 0.1223907929736115,
|
|
"learning_rate": 1.4706918416232213e-06,
|
|
"loss": 1.0884,
|
|
"mean_token_accuracy": 0.7364513099193573,
|
|
"num_tokens": 963910634.0,
|
|
"step": 10460
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 1.4309143091430914,
|
|
"grad_norm": 0.113086734417037,
|
|
"learning_rate": 1.4671692264337045e-06,
|
|
"loss": 1.1504,
|
|
"mean_token_accuracy": 0.7230085074901581,
|
|
"num_tokens": 964864834.0,
|
|
"step": 10470
|
|
},
|
|
{
|
|
"entropy": 1.12734375,
|
|
"epoch": 1.4322809894765614,
|
|
"grad_norm": 0.11184709673624277,
|
|
"learning_rate": 1.4636466112441878e-06,
|
|
"loss": 1.13,
|
|
"mean_token_accuracy": 0.7253074824810029,
|
|
"num_tokens": 965816016.0,
|
|
"step": 10480
|
|
},
|
|
{
|
|
"entropy": 1.119140625,
|
|
"epoch": 1.4336476698100316,
|
|
"grad_norm": 0.11943689220583421,
|
|
"learning_rate": 1.4601239960546711e-06,
|
|
"loss": 1.1147,
|
|
"mean_token_accuracy": 0.7292599737644195,
|
|
"num_tokens": 966734698.0,
|
|
"step": 10490
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 1.4350143501435015,
|
|
"grad_norm": 0.15139464397143093,
|
|
"learning_rate": 1.4566013808651545e-06,
|
|
"loss": 1.169,
|
|
"mean_token_accuracy": 0.719297981262207,
|
|
"num_tokens": 967702846.0,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"entropy": 1.092578125,
|
|
"epoch": 1.4363810304769715,
|
|
"grad_norm": 0.11930159424391458,
|
|
"learning_rate": 1.4530787656756376e-06,
|
|
"loss": 1.1016,
|
|
"mean_token_accuracy": 0.7341405153274536,
|
|
"num_tokens": 968655185.0,
|
|
"step": 10510
|
|
},
|
|
{
|
|
"entropy": 1.125,
|
|
"epoch": 1.4377477108104415,
|
|
"grad_norm": 0.13758821739788085,
|
|
"learning_rate": 1.4495561504861211e-06,
|
|
"loss": 1.1188,
|
|
"mean_token_accuracy": 0.7288134396076202,
|
|
"num_tokens": 969625560.0,
|
|
"step": 10520
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.4391143911439115,
|
|
"grad_norm": 0.11745280788787815,
|
|
"learning_rate": 1.4460335352966043e-06,
|
|
"loss": 1.1365,
|
|
"mean_token_accuracy": 0.7289946615695954,
|
|
"num_tokens": 970536328.0,
|
|
"step": 10530
|
|
},
|
|
{
|
|
"entropy": 1.2109375,
|
|
"epoch": 1.4404810714773815,
|
|
"grad_norm": 0.12121961934296427,
|
|
"learning_rate": 1.4425109201070876e-06,
|
|
"loss": 1.2148,
|
|
"mean_token_accuracy": 0.7111732006072998,
|
|
"num_tokens": 971542224.0,
|
|
"step": 10540
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 1.4418477518108515,
|
|
"grad_norm": 0.12554566443061882,
|
|
"learning_rate": 1.438988304917571e-06,
|
|
"loss": 1.1584,
|
|
"mean_token_accuracy": 0.7210110902786255,
|
|
"num_tokens": 972483619.0,
|
|
"step": 10550
|
|
},
|
|
{
|
|
"entropy": 1.1375,
|
|
"epoch": 1.4432144321443214,
|
|
"grad_norm": 0.11382195828890349,
|
|
"learning_rate": 1.4354656897280543e-06,
|
|
"loss": 1.1481,
|
|
"mean_token_accuracy": 0.7231622517108918,
|
|
"num_tokens": 973418101.0,
|
|
"step": 10560
|
|
},
|
|
{
|
|
"entropy": 1.121875,
|
|
"epoch": 1.4445811124777914,
|
|
"grad_norm": 0.12763363356987312,
|
|
"learning_rate": 1.4319430745385376e-06,
|
|
"loss": 1.1255,
|
|
"mean_token_accuracy": 0.7272957324981689,
|
|
"num_tokens": 974333475.0,
|
|
"step": 10570
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 1.4459477928112614,
|
|
"grad_norm": 0.11495621783064523,
|
|
"learning_rate": 1.428420459349021e-06,
|
|
"loss": 1.1493,
|
|
"mean_token_accuracy": 0.723940247297287,
|
|
"num_tokens": 975250053.0,
|
|
"step": 10580
|
|
},
|
|
{
|
|
"entropy": 1.084375,
|
|
"epoch": 1.4473144731447314,
|
|
"grad_norm": 0.12614760590302687,
|
|
"learning_rate": 1.424897844159504e-06,
|
|
"loss": 1.0906,
|
|
"mean_token_accuracy": 0.7352995097637176,
|
|
"num_tokens": 976170564.0,
|
|
"step": 10590
|
|
},
|
|
{
|
|
"entropy": 1.126171875,
|
|
"epoch": 1.4486811534782014,
|
|
"grad_norm": 0.1442645948595566,
|
|
"learning_rate": 1.4213752289699876e-06,
|
|
"loss": 1.131,
|
|
"mean_token_accuracy": 0.726500928401947,
|
|
"num_tokens": 977076464.0,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"entropy": 1.10546875,
|
|
"epoch": 1.4500478338116713,
|
|
"grad_norm": 0.11540104760609518,
|
|
"learning_rate": 1.4178526137804707e-06,
|
|
"loss": 1.1108,
|
|
"mean_token_accuracy": 0.730927461385727,
|
|
"num_tokens": 977974328.0,
|
|
"step": 10610
|
|
},
|
|
{
|
|
"entropy": 1.108984375,
|
|
"epoch": 1.4514145141451413,
|
|
"grad_norm": 0.12128597012465797,
|
|
"learning_rate": 1.4143299985909538e-06,
|
|
"loss": 1.1026,
|
|
"mean_token_accuracy": 0.7348154366016388,
|
|
"num_tokens": 978866088.0,
|
|
"step": 10620
|
|
},
|
|
{
|
|
"entropy": 1.115625,
|
|
"epoch": 1.4527811944786113,
|
|
"grad_norm": 0.12558909708512084,
|
|
"learning_rate": 1.4108073834014374e-06,
|
|
"loss": 1.1075,
|
|
"mean_token_accuracy": 0.7321736991405488,
|
|
"num_tokens": 979814950.0,
|
|
"step": 10630
|
|
},
|
|
{
|
|
"entropy": 1.1109375,
|
|
"epoch": 1.4541478748120815,
|
|
"grad_norm": 0.12462454090104996,
|
|
"learning_rate": 1.4072847682119205e-06,
|
|
"loss": 1.1222,
|
|
"mean_token_accuracy": 0.7298550069332123,
|
|
"num_tokens": 980760948.0,
|
|
"step": 10640
|
|
},
|
|
{
|
|
"entropy": 1.128125,
|
|
"epoch": 1.4555145551455515,
|
|
"grad_norm": 0.11912853559702008,
|
|
"learning_rate": 1.4037621530224039e-06,
|
|
"loss": 1.1243,
|
|
"mean_token_accuracy": 0.7297784864902497,
|
|
"num_tokens": 981737348.0,
|
|
"step": 10650
|
|
},
|
|
{
|
|
"entropy": 1.10546875,
|
|
"epoch": 1.4568812354790215,
|
|
"grad_norm": 0.11959006258150814,
|
|
"learning_rate": 1.4002395378328872e-06,
|
|
"loss": 1.1089,
|
|
"mean_token_accuracy": 0.7283894300460816,
|
|
"num_tokens": 982695749.0,
|
|
"step": 10660
|
|
},
|
|
{
|
|
"entropy": 1.0921875,
|
|
"epoch": 1.4582479158124915,
|
|
"grad_norm": 0.16895521134720787,
|
|
"learning_rate": 1.3967169226433705e-06,
|
|
"loss": 1.09,
|
|
"mean_token_accuracy": 0.7358869552612305,
|
|
"num_tokens": 983614796.0,
|
|
"step": 10670
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.4596145961459615,
|
|
"grad_norm": 0.12478895942361642,
|
|
"learning_rate": 1.393194307453854e-06,
|
|
"loss": 1.1527,
|
|
"mean_token_accuracy": 0.7227578163146973,
|
|
"num_tokens": 984551709.0,
|
|
"step": 10680
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 1.4609812764794314,
|
|
"grad_norm": 0.14430409253409043,
|
|
"learning_rate": 1.3896716922643372e-06,
|
|
"loss": 1.1693,
|
|
"mean_token_accuracy": 0.7194712996482849,
|
|
"num_tokens": 985454993.0,
|
|
"step": 10690
|
|
},
|
|
{
|
|
"entropy": 1.11953125,
|
|
"epoch": 1.4623479568129014,
|
|
"grad_norm": 0.12216880897167008,
|
|
"learning_rate": 1.3861490770748203e-06,
|
|
"loss": 1.1243,
|
|
"mean_token_accuracy": 0.7289729475975036,
|
|
"num_tokens": 986391499.0,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 1.4637146371463714,
|
|
"grad_norm": 0.13398581243660923,
|
|
"learning_rate": 1.3826264618853039e-06,
|
|
"loss": 1.1563,
|
|
"mean_token_accuracy": 0.7211912095546722,
|
|
"num_tokens": 987263519.0,
|
|
"step": 10710
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 1.4650813174798414,
|
|
"grad_norm": 0.1243335379856118,
|
|
"learning_rate": 1.379103846695787e-06,
|
|
"loss": 1.1736,
|
|
"mean_token_accuracy": 0.7198470771312714,
|
|
"num_tokens": 988255038.0,
|
|
"step": 10720
|
|
},
|
|
{
|
|
"entropy": 1.15546875,
|
|
"epoch": 1.4664479978133116,
|
|
"grad_norm": 0.13484220124987883,
|
|
"learning_rate": 1.3755812315062703e-06,
|
|
"loss": 1.1683,
|
|
"mean_token_accuracy": 0.720468407869339,
|
|
"num_tokens": 989167746.0,
|
|
"step": 10730
|
|
},
|
|
{
|
|
"entropy": 1.1140625,
|
|
"epoch": 1.4678146781467816,
|
|
"grad_norm": 0.11803756285355078,
|
|
"learning_rate": 1.3720586163167537e-06,
|
|
"loss": 1.1276,
|
|
"mean_token_accuracy": 0.7267346739768982,
|
|
"num_tokens": 990084172.0,
|
|
"step": 10740
|
|
},
|
|
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 1.4691813584802516,
|
|
"grad_norm": 0.13601557919394122,
|
|
"learning_rate": 1.368536001127237e-06,
|
|
"loss": 1.1744,
|
|
"mean_token_accuracy": 0.720447200536728,
|
|
"num_tokens": 990977216.0,
|
|
"step": 10750
|
|
},
|
|
{
|
|
"entropy": 1.11953125,
|
|
"epoch": 1.4705480388137215,
|
|
"grad_norm": 0.11682773231990441,
|
|
"learning_rate": 1.3650133859377201e-06,
|
|
"loss": 1.1088,
|
|
"mean_token_accuracy": 0.7284457921981812,
|
|
"num_tokens": 991891997.0,
|
|
"step": 10760
|
|
},
|
|
{
|
|
"entropy": 1.136328125,
|
|
"epoch": 1.4719147191471915,
|
|
"grad_norm": 0.12466959100778467,
|
|
"learning_rate": 1.3614907707482037e-06,
|
|
"loss": 1.1382,
|
|
"mean_token_accuracy": 0.7256722390651703,
|
|
"num_tokens": 992789231.0,
|
|
"step": 10770
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 1.4732813994806615,
|
|
"grad_norm": 0.12052570206084148,
|
|
"learning_rate": 1.3579681555586868e-06,
|
|
"loss": 1.2106,
|
|
"mean_token_accuracy": 0.7136904656887054,
|
|
"num_tokens": 993739364.0,
|
|
"step": 10780
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 1.4746480798141315,
|
|
"grad_norm": 0.11080424159707382,
|
|
"learning_rate": 1.3544455403691703e-06,
|
|
"loss": 1.1495,
|
|
"mean_token_accuracy": 0.723738157749176,
|
|
"num_tokens": 994688957.0,
|
|
"step": 10790
|
|
},
|
|
{
|
|
"entropy": 1.11484375,
|
|
"epoch": 1.4760147601476015,
|
|
"grad_norm": 0.1342231765841263,
|
|
"learning_rate": 1.3509229251796535e-06,
|
|
"loss": 1.1087,
|
|
"mean_token_accuracy": 0.7325750052928924,
|
|
"num_tokens": 995572150.0,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.4773814404810715,
|
|
"grad_norm": 0.1473865000813694,
|
|
"learning_rate": 1.3474003099901368e-06,
|
|
"loss": 1.1534,
|
|
"mean_token_accuracy": 0.7238484621047974,
|
|
"num_tokens": 996471230.0,
|
|
"step": 10810
|
|
},
|
|
{
|
|
"entropy": 1.121875,
|
|
"epoch": 1.4787481208145414,
|
|
"grad_norm": 0.11768275271775251,
|
|
"learning_rate": 1.3438776948006201e-06,
|
|
"loss": 1.1132,
|
|
"mean_token_accuracy": 0.7320198357105255,
|
|
"num_tokens": 997378119.0,
|
|
"step": 10820
|
|
},
|
|
{
|
|
"entropy": 1.0953125,
|
|
"epoch": 1.4801148011480114,
|
|
"grad_norm": 0.12046934093719325,
|
|
"learning_rate": 1.3403550796111035e-06,
|
|
"loss": 1.1061,
|
|
"mean_token_accuracy": 0.7332315266132354,
|
|
"num_tokens": 998303380.0,
|
|
"step": 10830
|
|
},
|
|
{
|
|
"entropy": 1.155859375,
|
|
"epoch": 1.4814814814814814,
|
|
"grad_norm": 0.11940224391795941,
|
|
"learning_rate": 1.3368324644215866e-06,
|
|
"loss": 1.154,
|
|
"mean_token_accuracy": 0.7224894046783448,
|
|
"num_tokens": 999225085.0,
|
|
"step": 10840
|
|
},
|
|
{
|
|
"entropy": 1.09375,
|
|
"epoch": 1.4828481618149514,
|
|
"grad_norm": 0.11531722501170785,
|
|
"learning_rate": 1.3333098492320701e-06,
|
|
"loss": 1.0947,
|
|
"mean_token_accuracy": 0.7335414588451385,
|
|
"num_tokens": 1000160882.0,
|
|
"step": 10850
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 1.4842148421484214,
|
|
"grad_norm": 0.12793052681287823,
|
|
"learning_rate": 1.3297872340425533e-06,
|
|
"loss": 1.1543,
|
|
"mean_token_accuracy": 0.7216011881828308,
|
|
"num_tokens": 1001058706.0,
|
|
"step": 10860
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 1.4855815224818913,
|
|
"grad_norm": 0.1279625478784157,
|
|
"learning_rate": 1.3262646188530364e-06,
|
|
"loss": 1.1891,
|
|
"mean_token_accuracy": 0.717362642288208,
|
|
"num_tokens": 1002025389.0,
|
|
"step": 10870
|
|
},
|
|
{
|
|
"entropy": 1.111328125,
|
|
"epoch": 1.4869482028153616,
|
|
"grad_norm": 0.1261917146211461,
|
|
"learning_rate": 1.32274200366352e-06,
|
|
"loss": 1.1152,
|
|
"mean_token_accuracy": 0.7323106944561004,
|
|
"num_tokens": 1002947444.0,
|
|
"step": 10880
|
|
},
|
|
{
|
|
"entropy": 1.10859375,
|
|
"epoch": 1.4883148831488315,
|
|
"grad_norm": 0.121743130506697,
|
|
"learning_rate": 1.319219388474003e-06,
|
|
"loss": 1.1182,
|
|
"mean_token_accuracy": 0.7309813857078552,
|
|
"num_tokens": 1003827927.0,
|
|
"step": 10890
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.4896815634823015,
|
|
"grad_norm": 0.11958587739425196,
|
|
"learning_rate": 1.3156967732844866e-06,
|
|
"loss": 1.1654,
|
|
"mean_token_accuracy": 0.7222406327724457,
|
|
"num_tokens": 1004761314.0,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"entropy": 1.0921875,
|
|
"epoch": 1.4910482438157715,
|
|
"grad_norm": 0.12823287297760808,
|
|
"learning_rate": 1.31217415809497e-06,
|
|
"loss": 1.0909,
|
|
"mean_token_accuracy": 0.7349133491516113,
|
|
"num_tokens": 1005703529.0,
|
|
"step": 10910
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.4924149241492415,
|
|
"grad_norm": 0.1237708058368329,
|
|
"learning_rate": 1.308651542905453e-06,
|
|
"loss": 1.1919,
|
|
"mean_token_accuracy": 0.7129414439201355,
|
|
"num_tokens": 1006683078.0,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"entropy": 1.112109375,
|
|
"epoch": 1.4937816044827115,
|
|
"grad_norm": 0.12247098689194967,
|
|
"learning_rate": 1.3051289277159366e-06,
|
|
"loss": 1.1167,
|
|
"mean_token_accuracy": 0.7285617768764496,
|
|
"num_tokens": 1007585241.0,
|
|
"step": 10930
|
|
},
|
|
{
|
|
"entropy": 1.12734375,
|
|
"epoch": 1.4951482848161814,
|
|
"grad_norm": 0.12471104614289083,
|
|
"learning_rate": 1.3016063125264197e-06,
|
|
"loss": 1.1377,
|
|
"mean_token_accuracy": 0.7280544757843017,
|
|
"num_tokens": 1008478476.0,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.4965149651496514,
|
|
"grad_norm": 0.1271455292446835,
|
|
"learning_rate": 1.2980836973369029e-06,
|
|
"loss": 1.1574,
|
|
"mean_token_accuracy": 0.7197562098503113,
|
|
"num_tokens": 1009373252.0,
|
|
"step": 10950
|
|
},
|
|
{
|
|
"entropy": 1.115234375,
|
|
"epoch": 1.4978816454831214,
|
|
"grad_norm": 0.13287581419374467,
|
|
"learning_rate": 1.2945610821473864e-06,
|
|
"loss": 1.1237,
|
|
"mean_token_accuracy": 0.7298968195915222,
|
|
"num_tokens": 1010255380.0,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"entropy": 1.091015625,
|
|
"epoch": 1.4992483258165916,
|
|
"grad_norm": 0.12418811009607064,
|
|
"learning_rate": 1.2910384669578695e-06,
|
|
"loss": 1.0944,
|
|
"mean_token_accuracy": 0.7370682656764984,
|
|
"num_tokens": 1011183449.0,
|
|
"step": 10970
|
|
},
|
|
{
|
|
"entropy": 1.102734375,
|
|
"epoch": 1.5006150061500616,
|
|
"grad_norm": 0.13815438661232401,
|
|
"learning_rate": 1.2875158517683529e-06,
|
|
"loss": 1.1128,
|
|
"mean_token_accuracy": 0.7306560158729554,
|
|
"num_tokens": 1012078107.0,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"entropy": 1.1625,
|
|
"epoch": 1.5019816864835316,
|
|
"grad_norm": 0.1265467408433479,
|
|
"learning_rate": 1.2839932365788362e-06,
|
|
"loss": 1.166,
|
|
"mean_token_accuracy": 0.7198687493801117,
|
|
"num_tokens": 1012995560.0,
|
|
"step": 10990
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 1.5033483668170016,
|
|
"grad_norm": 0.10786769257917381,
|
|
"learning_rate": 1.2804706213893195e-06,
|
|
"loss": 1.1481,
|
|
"mean_token_accuracy": 0.7241556465625762,
|
|
"num_tokens": 1013935624.0,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.5047150471504716,
|
|
"grad_norm": 0.12494911986614933,
|
|
"learning_rate": 1.2769480061998029e-06,
|
|
"loss": 1.1782,
|
|
"mean_token_accuracy": 0.7204301476478576,
|
|
"num_tokens": 1014862406.0,
|
|
"step": 11010
|
|
},
|
|
{
|
|
"entropy": 1.116796875,
|
|
"epoch": 1.5060817274839415,
|
|
"grad_norm": 0.11134752643883906,
|
|
"learning_rate": 1.2734253910102862e-06,
|
|
"loss": 1.12,
|
|
"mean_token_accuracy": 0.7306741833686828,
|
|
"num_tokens": 1015821840.0,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.5074484078174115,
|
|
"grad_norm": 0.12860235241396278,
|
|
"learning_rate": 1.2699027758207693e-06,
|
|
"loss": 1.1434,
|
|
"mean_token_accuracy": 0.7247728526592254,
|
|
"num_tokens": 1016739059.0,
|
|
"step": 11030
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 1.5088150881508815,
|
|
"grad_norm": 0.1329250889832141,
|
|
"learning_rate": 1.2663801606312529e-06,
|
|
"loss": 1.1749,
|
|
"mean_token_accuracy": 0.7168094992637635,
|
|
"num_tokens": 1017732695.0,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"entropy": 1.09921875,
|
|
"epoch": 1.5101817684843515,
|
|
"grad_norm": 0.12706552102710733,
|
|
"learning_rate": 1.262857545441736e-06,
|
|
"loss": 1.0986,
|
|
"mean_token_accuracy": 0.7345069468021392,
|
|
"num_tokens": 1018608385.0,
|
|
"step": 11050
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 1.5115484488178215,
|
|
"grad_norm": 0.1177745830787445,
|
|
"learning_rate": 1.2593349302522193e-06,
|
|
"loss": 1.1491,
|
|
"mean_token_accuracy": 0.7236185848712922,
|
|
"num_tokens": 1019521726.0,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"entropy": 1.153125,
|
|
"epoch": 1.5129151291512914,
|
|
"grad_norm": 0.1441299906859044,
|
|
"learning_rate": 1.2558123150627027e-06,
|
|
"loss": 1.1641,
|
|
"mean_token_accuracy": 0.7199095487594604,
|
|
"num_tokens": 1020459793.0,
|
|
"step": 11070
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.5142818094847614,
|
|
"grad_norm": 0.14286305777648808,
|
|
"learning_rate": 1.252289699873186e-06,
|
|
"loss": 1.1927,
|
|
"mean_token_accuracy": 0.7140438795089722,
|
|
"num_tokens": 1021389370.0,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 1.5156484898182314,
|
|
"grad_norm": 0.13028538613207452,
|
|
"learning_rate": 1.2487670846836693e-06,
|
|
"loss": 1.1766,
|
|
"mean_token_accuracy": 0.7178589463233948,
|
|
"num_tokens": 1022323761.0,
|
|
"step": 11090
|
|
},
|
|
{
|
|
"entropy": 1.234375,
|
|
"epoch": 1.5170151701517014,
|
|
"grad_norm": 0.16011534468064187,
|
|
"learning_rate": 1.2452444694941527e-06,
|
|
"loss": 1.2446,
|
|
"mean_token_accuracy": 0.7050847768783569,
|
|
"num_tokens": 1023215077.0,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"entropy": 1.151171875,
|
|
"epoch": 1.5183818504851714,
|
|
"grad_norm": 0.11319502235477179,
|
|
"learning_rate": 1.2417218543046358e-06,
|
|
"loss": 1.1549,
|
|
"mean_token_accuracy": 0.7238343715667724,
|
|
"num_tokens": 1024137735.0,
|
|
"step": 11110
|
|
},
|
|
{
|
|
"entropy": 1.1203125,
|
|
"epoch": 1.5197485308186414,
|
|
"grad_norm": 0.12012277351441142,
|
|
"learning_rate": 1.2381992391151191e-06,
|
|
"loss": 1.1249,
|
|
"mean_token_accuracy": 0.7295436501502991,
|
|
"num_tokens": 1025047579.0,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"entropy": 1.13984375,
|
|
"epoch": 1.5211152111521116,
|
|
"grad_norm": 0.11818146812246275,
|
|
"learning_rate": 1.2346766239256025e-06,
|
|
"loss": 1.1567,
|
|
"mean_token_accuracy": 0.7214396059513092,
|
|
"num_tokens": 1025997637.0,
|
|
"step": 11130
|
|
},
|
|
{
|
|
"entropy": 1.11953125,
|
|
"epoch": 1.5224818914855816,
|
|
"grad_norm": 0.12109234807178804,
|
|
"learning_rate": 1.2311540087360858e-06,
|
|
"loss": 1.1271,
|
|
"mean_token_accuracy": 0.7281914055347443,
|
|
"num_tokens": 1026960810.0,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"entropy": 1.115625,
|
|
"epoch": 1.5238485718190515,
|
|
"grad_norm": 0.12773349821260574,
|
|
"learning_rate": 1.2276313935465691e-06,
|
|
"loss": 1.1053,
|
|
"mean_token_accuracy": 0.7335841238498688,
|
|
"num_tokens": 1027877854.0,
|
|
"step": 11150
|
|
},
|
|
{
|
|
"entropy": 1.099609375,
|
|
"epoch": 1.5252152521525215,
|
|
"grad_norm": 0.12205866856933854,
|
|
"learning_rate": 1.2241087783570525e-06,
|
|
"loss": 1.1072,
|
|
"mean_token_accuracy": 0.7316837728023529,
|
|
"num_tokens": 1028805396.0,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.5265819324859915,
|
|
"grad_norm": 0.11253356378509863,
|
|
"learning_rate": 1.2205861631675358e-06,
|
|
"loss": 1.145,
|
|
"mean_token_accuracy": 0.7235315620899201,
|
|
"num_tokens": 1029738415.0,
|
|
"step": 11170
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 1.5279486128194615,
|
|
"grad_norm": 0.12496770275490106,
|
|
"learning_rate": 1.217063547978019e-06,
|
|
"loss": 1.1667,
|
|
"mean_token_accuracy": 0.7207337200641633,
|
|
"num_tokens": 1030646865.0,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"entropy": 1.123828125,
|
|
"epoch": 1.5293152931529317,
|
|
"grad_norm": 0.12414702799522384,
|
|
"learning_rate": 1.2135409327885023e-06,
|
|
"loss": 1.1299,
|
|
"mean_token_accuracy": 0.7277159273624421,
|
|
"num_tokens": 1031564697.0,
|
|
"step": 11190
|
|
},
|
|
{
|
|
"entropy": 1.1140625,
|
|
"epoch": 1.5306819734864017,
|
|
"grad_norm": 0.11012148118622797,
|
|
"learning_rate": 1.2100183175989856e-06,
|
|
"loss": 1.1122,
|
|
"mean_token_accuracy": 0.7312304019927979,
|
|
"num_tokens": 1032464705.0,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"entropy": 1.09765625,
|
|
"epoch": 1.5320486538198717,
|
|
"grad_norm": 0.12038493115307669,
|
|
"learning_rate": 1.206495702409469e-06,
|
|
"loss": 1.1033,
|
|
"mean_token_accuracy": 0.7324084937572479,
|
|
"num_tokens": 1033363698.0,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"entropy": 1.0703125,
|
|
"epoch": 1.5334153341533416,
|
|
"grad_norm": 0.15114894260637182,
|
|
"learning_rate": 1.202973087219952e-06,
|
|
"loss": 1.0698,
|
|
"mean_token_accuracy": 0.7358902812004089,
|
|
"num_tokens": 1034287145.0,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"entropy": 1.0953125,
|
|
"epoch": 1.5347820144868116,
|
|
"grad_norm": 0.31501151932330834,
|
|
"learning_rate": 1.1994504720304354e-06,
|
|
"loss": 1.1144,
|
|
"mean_token_accuracy": 0.729765784740448,
|
|
"num_tokens": 1035237499.0,
|
|
"step": 11230
|
|
},
|
|
{
|
|
"entropy": 1.1171875,
|
|
"epoch": 1.5361486948202816,
|
|
"grad_norm": 0.128431149334592,
|
|
"learning_rate": 1.1959278568409187e-06,
|
|
"loss": 1.115,
|
|
"mean_token_accuracy": 0.7287120521068573,
|
|
"num_tokens": 1036189146.0,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"entropy": 1.1171875,
|
|
"epoch": 1.5375153751537516,
|
|
"grad_norm": 0.12063776103178454,
|
|
"learning_rate": 1.192405241651402e-06,
|
|
"loss": 1.1205,
|
|
"mean_token_accuracy": 0.7282150089740753,
|
|
"num_tokens": 1037111626.0,
|
|
"step": 11250
|
|
},
|
|
{
|
|
"entropy": 1.13046875,
|
|
"epoch": 1.5388820554872216,
|
|
"grad_norm": 0.13103772552346435,
|
|
"learning_rate": 1.1888826264618854e-06,
|
|
"loss": 1.1391,
|
|
"mean_token_accuracy": 0.7259630143642426,
|
|
"num_tokens": 1038023066.0,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"entropy": 1.11953125,
|
|
"epoch": 1.5402487358206916,
|
|
"grad_norm": 0.11621037472591171,
|
|
"learning_rate": 1.1853600112723687e-06,
|
|
"loss": 1.1205,
|
|
"mean_token_accuracy": 0.7290335774421692,
|
|
"num_tokens": 1038963483.0,
|
|
"step": 11270
|
|
},
|
|
{
|
|
"entropy": 1.107421875,
|
|
"epoch": 1.5416154161541615,
|
|
"grad_norm": 0.13728031253651332,
|
|
"learning_rate": 1.181837396082852e-06,
|
|
"loss": 1.1101,
|
|
"mean_token_accuracy": 0.7315243601799011,
|
|
"num_tokens": 1039900364.0,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"entropy": 1.1078125,
|
|
"epoch": 1.5429820964876315,
|
|
"grad_norm": 0.11495856825634025,
|
|
"learning_rate": 1.1783147808933352e-06,
|
|
"loss": 1.1117,
|
|
"mean_token_accuracy": 0.7312809348106384,
|
|
"num_tokens": 1040829063.0,
|
|
"step": 11290
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 1.5443487768211015,
|
|
"grad_norm": 0.12424015734225774,
|
|
"learning_rate": 1.1747921657038185e-06,
|
|
"loss": 1.2234,
|
|
"mean_token_accuracy": 0.7100824356079102,
|
|
"num_tokens": 1041751052.0,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"entropy": 1.096875,
|
|
"epoch": 1.5457154571545715,
|
|
"grad_norm": 0.12460068799392095,
|
|
"learning_rate": 1.1712695505143019e-06,
|
|
"loss": 1.0983,
|
|
"mean_token_accuracy": 0.7322580814361572,
|
|
"num_tokens": 1042647676.0,
|
|
"step": 11310
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 1.5470821374880415,
|
|
"grad_norm": 0.12329833417779823,
|
|
"learning_rate": 1.1677469353247852e-06,
|
|
"loss": 1.1487,
|
|
"mean_token_accuracy": 0.7248301982879639,
|
|
"num_tokens": 1043516583.0,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"entropy": 1.16328125,
|
|
"epoch": 1.5484488178215114,
|
|
"grad_norm": 0.19192124179713474,
|
|
"learning_rate": 1.1642243201352685e-06,
|
|
"loss": 1.173,
|
|
"mean_token_accuracy": 0.7179230332374573,
|
|
"num_tokens": 1044470985.0,
|
|
"step": 11330
|
|
},
|
|
{
|
|
"entropy": 1.1234375,
|
|
"epoch": 1.5498154981549814,
|
|
"grad_norm": 0.1194883141181194,
|
|
"learning_rate": 1.1607017049457519e-06,
|
|
"loss": 1.1246,
|
|
"mean_token_accuracy": 0.7270079612731933,
|
|
"num_tokens": 1045407715.0,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"entropy": 1.1515625,
|
|
"epoch": 1.5511821784884514,
|
|
"grad_norm": 0.12457179861364985,
|
|
"learning_rate": 1.1571790897562352e-06,
|
|
"loss": 1.1547,
|
|
"mean_token_accuracy": 0.7230138957500458,
|
|
"num_tokens": 1046300953.0,
|
|
"step": 11350
|
|
},
|
|
{
|
|
"entropy": 1.110546875,
|
|
"epoch": 1.5525488588219214,
|
|
"grad_norm": 0.12962225090136165,
|
|
"learning_rate": 1.1536564745667183e-06,
|
|
"loss": 1.1051,
|
|
"mean_token_accuracy": 0.7321679055690765,
|
|
"num_tokens": 1047206230.0,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"entropy": 1.11875,
|
|
"epoch": 1.5539155391553916,
|
|
"grad_norm": 0.13192591617636093,
|
|
"learning_rate": 1.1501338593772017e-06,
|
|
"loss": 1.1147,
|
|
"mean_token_accuracy": 0.7315157234668732,
|
|
"num_tokens": 1048108612.0,
|
|
"step": 11370
|
|
},
|
|
{
|
|
"entropy": 1.14921875,
|
|
"epoch": 1.5552822194888616,
|
|
"grad_norm": 0.12571254938893825,
|
|
"learning_rate": 1.146611244187685e-06,
|
|
"loss": 1.1516,
|
|
"mean_token_accuracy": 0.7258441388607025,
|
|
"num_tokens": 1049057878.0,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"entropy": 1.137890625,
|
|
"epoch": 1.5566488998223316,
|
|
"grad_norm": 0.13498563188019388,
|
|
"learning_rate": 1.1430886289981683e-06,
|
|
"loss": 1.1457,
|
|
"mean_token_accuracy": 0.7211768627166748,
|
|
"num_tokens": 1049975577.0,
|
|
"step": 11390
|
|
},
|
|
{
|
|
"entropy": 1.08515625,
|
|
"epoch": 1.5580155801558015,
|
|
"grad_norm": 0.13452779678137516,
|
|
"learning_rate": 1.1395660138086517e-06,
|
|
"loss": 1.0774,
|
|
"mean_token_accuracy": 0.7369921267032623,
|
|
"num_tokens": 1050838742.0,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"entropy": 1.1125,
|
|
"epoch": 1.5593822604892715,
|
|
"grad_norm": 0.11989082328842084,
|
|
"learning_rate": 1.136043398619135e-06,
|
|
"loss": 1.1159,
|
|
"mean_token_accuracy": 0.7324026942253112,
|
|
"num_tokens": 1051748442.0,
|
|
"step": 11410
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 1.5607489408227415,
|
|
"grad_norm": 0.14061979982710313,
|
|
"learning_rate": 1.1325207834296184e-06,
|
|
"loss": 1.1762,
|
|
"mean_token_accuracy": 0.718201470375061,
|
|
"num_tokens": 1052690851.0,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"entropy": 1.111328125,
|
|
"epoch": 1.5621156211562117,
|
|
"grad_norm": 0.12784694016038392,
|
|
"learning_rate": 1.1289981682401017e-06,
|
|
"loss": 1.1091,
|
|
"mean_token_accuracy": 0.7315260946750641,
|
|
"num_tokens": 1053577690.0,
|
|
"step": 11430
|
|
},
|
|
{
|
|
"entropy": 1.126171875,
|
|
"epoch": 1.5634823014896817,
|
|
"grad_norm": 0.11222835871952336,
|
|
"learning_rate": 1.1254755530505848e-06,
|
|
"loss": 1.1203,
|
|
"mean_token_accuracy": 0.7297685861587524,
|
|
"num_tokens": 1054497466.0,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 1.5648489818231517,
|
|
"grad_norm": 0.11846541449120646,
|
|
"learning_rate": 1.1219529378610681e-06,
|
|
"loss": 1.1628,
|
|
"mean_token_accuracy": 0.7201218724250793,
|
|
"num_tokens": 1055417792.0,
|
|
"step": 11450
|
|
},
|
|
{
|
|
"entropy": 1.148828125,
|
|
"epoch": 1.5662156621566217,
|
|
"grad_norm": 0.12138200216088794,
|
|
"learning_rate": 1.1184303226715515e-06,
|
|
"loss": 1.1503,
|
|
"mean_token_accuracy": 0.7244573295116424,
|
|
"num_tokens": 1056359927.0,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"entropy": 1.15546875,
|
|
"epoch": 1.5675823424900917,
|
|
"grad_norm": 0.1284015648186347,
|
|
"learning_rate": 1.1149077074820346e-06,
|
|
"loss": 1.1561,
|
|
"mean_token_accuracy": 0.7220216512680053,
|
|
"num_tokens": 1057290213.0,
|
|
"step": 11470
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 1.5689490228235616,
|
|
"grad_norm": 0.1448677304439902,
|
|
"learning_rate": 1.111385092292518e-06,
|
|
"loss": 1.1785,
|
|
"mean_token_accuracy": 0.7215452373027802,
|
|
"num_tokens": 1058200427.0,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"entropy": 1.13046875,
|
|
"epoch": 1.5703157031570316,
|
|
"grad_norm": 0.13634355597809453,
|
|
"learning_rate": 1.1078624771030013e-06,
|
|
"loss": 1.1423,
|
|
"mean_token_accuracy": 0.7255885422229766,
|
|
"num_tokens": 1059120981.0,
|
|
"step": 11490
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 1.5716823834905016,
|
|
"grad_norm": 0.11342694251686376,
|
|
"learning_rate": 1.1043398619134846e-06,
|
|
"loss": 1.1503,
|
|
"mean_token_accuracy": 0.7218681931495666,
|
|
"num_tokens": 1060081905.0,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 1.5730490638239716,
|
|
"grad_norm": 0.10359949303091928,
|
|
"learning_rate": 1.100817246723968e-06,
|
|
"loss": 1.165,
|
|
"mean_token_accuracy": 0.7202832281589509,
|
|
"num_tokens": 1060987615.0,
|
|
"step": 11510
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 1.5744157441574416,
|
|
"grad_norm": 0.11999446576836434,
|
|
"learning_rate": 1.0972946315344513e-06,
|
|
"loss": 1.1684,
|
|
"mean_token_accuracy": 0.7221543192863464,
|
|
"num_tokens": 1061947626.0,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"entropy": 1.106640625,
|
|
"epoch": 1.5757824244909115,
|
|
"grad_norm": 0.1323707750117954,
|
|
"learning_rate": 1.0937720163449346e-06,
|
|
"loss": 1.116,
|
|
"mean_token_accuracy": 0.7317051529884339,
|
|
"num_tokens": 1062853005.0,
|
|
"step": 11530
|
|
},
|
|
{
|
|
"entropy": 1.1328125,
|
|
"epoch": 1.5771491048243815,
|
|
"grad_norm": 0.11986143892857323,
|
|
"learning_rate": 1.0902494011554177e-06,
|
|
"loss": 1.1327,
|
|
"mean_token_accuracy": 0.7260720252990722,
|
|
"num_tokens": 1063787432.0,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"entropy": 1.1140625,
|
|
"epoch": 1.5785157851578515,
|
|
"grad_norm": 0.12230800677856728,
|
|
"learning_rate": 1.086726785965901e-06,
|
|
"loss": 1.1221,
|
|
"mean_token_accuracy": 0.7295116007328033,
|
|
"num_tokens": 1064750440.0,
|
|
"step": 11550
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 1.5798824654913215,
|
|
"grad_norm": 0.12807399121374127,
|
|
"learning_rate": 1.0832041707763844e-06,
|
|
"loss": 1.1724,
|
|
"mean_token_accuracy": 0.7215964794158936,
|
|
"num_tokens": 1065677004.0,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.5812491458247915,
|
|
"grad_norm": 0.12460605344841094,
|
|
"learning_rate": 1.0796815555868678e-06,
|
|
"loss": 1.1674,
|
|
"mean_token_accuracy": 0.722792637348175,
|
|
"num_tokens": 1066659637.0,
|
|
"step": 11570
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 1.5826158261582615,
|
|
"grad_norm": 0.10304035173127779,
|
|
"learning_rate": 1.076158940397351e-06,
|
|
"loss": 1.1884,
|
|
"mean_token_accuracy": 0.7162478923797607,
|
|
"num_tokens": 1067586812.0,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 1.5839825064917314,
|
|
"grad_norm": 0.12016764218944256,
|
|
"learning_rate": 1.0726363252078344e-06,
|
|
"loss": 1.1478,
|
|
"mean_token_accuracy": 0.7253705441951752,
|
|
"num_tokens": 1068476288.0,
|
|
"step": 11590
|
|
},
|
|
{
|
|
"entropy": 1.09296875,
|
|
"epoch": 1.5853491868252014,
|
|
"grad_norm": 0.11657867133047171,
|
|
"learning_rate": 1.0691137100183178e-06,
|
|
"loss": 1.0914,
|
|
"mean_token_accuracy": 0.7364721119403839,
|
|
"num_tokens": 1069384818.0,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"entropy": 1.10703125,
|
|
"epoch": 1.5867158671586716,
|
|
"grad_norm": 0.13050177798308177,
|
|
"learning_rate": 1.065591094828801e-06,
|
|
"loss": 1.1181,
|
|
"mean_token_accuracy": 0.7282937884330749,
|
|
"num_tokens": 1070358732.0,
|
|
"step": 11610
|
|
},
|
|
{
|
|
"entropy": 1.1234375,
|
|
"epoch": 1.5880825474921416,
|
|
"grad_norm": 0.12177492037454049,
|
|
"learning_rate": 1.0620684796392842e-06,
|
|
"loss": 1.1187,
|
|
"mean_token_accuracy": 0.7275471746921539,
|
|
"num_tokens": 1071225249.0,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"entropy": 1.076171875,
|
|
"epoch": 1.5894492278256116,
|
|
"grad_norm": 0.12324075003948506,
|
|
"learning_rate": 1.0585458644497676e-06,
|
|
"loss": 1.074,
|
|
"mean_token_accuracy": 0.7371815323829651,
|
|
"num_tokens": 1072105786.0,
|
|
"step": 11630
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.5908159081590816,
|
|
"grad_norm": 0.12106877097956617,
|
|
"learning_rate": 1.0550232492602509e-06,
|
|
"loss": 1.1796,
|
|
"mean_token_accuracy": 0.7172235667705535,
|
|
"num_tokens": 1073011100.0,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"entropy": 1.157421875,
|
|
"epoch": 1.5921825884925516,
|
|
"grad_norm": 0.14683687349575103,
|
|
"learning_rate": 1.0515006340707342e-06,
|
|
"loss": 1.1591,
|
|
"mean_token_accuracy": 0.7234167456626892,
|
|
"num_tokens": 1073914349.0,
|
|
"step": 11650
|
|
},
|
|
{
|
|
"entropy": 1.08515625,
|
|
"epoch": 1.5935492688260215,
|
|
"grad_norm": 0.1338938449025585,
|
|
"learning_rate": 1.0479780188812176e-06,
|
|
"loss": 1.0912,
|
|
"mean_token_accuracy": 0.7364835023880005,
|
|
"num_tokens": 1074787847.0,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"entropy": 1.1234375,
|
|
"epoch": 1.5949159491594918,
|
|
"grad_norm": 0.12640428954501473,
|
|
"learning_rate": 1.0444554036917009e-06,
|
|
"loss": 1.1359,
|
|
"mean_token_accuracy": 0.7263867020606994,
|
|
"num_tokens": 1075720122.0,
|
|
"step": 11670
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.5962826294929617,
|
|
"grad_norm": 0.1215934322114979,
|
|
"learning_rate": 1.0409327885021842e-06,
|
|
"loss": 1.1778,
|
|
"mean_token_accuracy": 0.715707677602768,
|
|
"num_tokens": 1076658698.0,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"entropy": 1.09609375,
|
|
"epoch": 1.5976493098264317,
|
|
"grad_norm": 0.1138474549245809,
|
|
"learning_rate": 1.0374101733126674e-06,
|
|
"loss": 1.1141,
|
|
"mean_token_accuracy": 0.731173700094223,
|
|
"num_tokens": 1077578515.0,
|
|
"step": 11690
|
|
},
|
|
{
|
|
"entropy": 1.0984375,
|
|
"epoch": 1.5990159901599017,
|
|
"grad_norm": 0.1360098724266158,
|
|
"learning_rate": 1.0338875581231507e-06,
|
|
"loss": 1.089,
|
|
"mean_token_accuracy": 0.7350546360015869,
|
|
"num_tokens": 1078439448.0,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 1.6003826704933717,
|
|
"grad_norm": 0.11717204371998141,
|
|
"learning_rate": 1.030364942933634e-06,
|
|
"loss": 1.1767,
|
|
"mean_token_accuracy": 0.716688460111618,
|
|
"num_tokens": 1079335986.0,
|
|
"step": 11710
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 1.6017493508268417,
|
|
"grad_norm": 0.10552359315082514,
|
|
"learning_rate": 1.0268423277441174e-06,
|
|
"loss": 1.1534,
|
|
"mean_token_accuracy": 0.7226088047027588,
|
|
"num_tokens": 1080224151.0,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"entropy": 1.12734375,
|
|
"epoch": 1.6031160311603116,
|
|
"grad_norm": 0.13536023503511121,
|
|
"learning_rate": 1.0233197125546005e-06,
|
|
"loss": 1.1219,
|
|
"mean_token_accuracy": 0.7308767795562744,
|
|
"num_tokens": 1081090458.0,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 1.6044827114937816,
|
|
"grad_norm": 0.14056423677741323,
|
|
"learning_rate": 1.0197970973650838e-06,
|
|
"loss": 1.1519,
|
|
"mean_token_accuracy": 0.7230500102043151,
|
|
"num_tokens": 1082015833.0,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 1.6058493918272516,
|
|
"grad_norm": 0.12233295541300536,
|
|
"learning_rate": 1.0162744821755674e-06,
|
|
"loss": 1.1514,
|
|
"mean_token_accuracy": 0.724388587474823,
|
|
"num_tokens": 1082949779.0,
|
|
"step": 11750
|
|
},
|
|
{
|
|
"entropy": 1.11015625,
|
|
"epoch": 1.6072160721607216,
|
|
"grad_norm": 0.12857163973297187,
|
|
"learning_rate": 1.0127518669860505e-06,
|
|
"loss": 1.1145,
|
|
"mean_token_accuracy": 0.7311505615711212,
|
|
"num_tokens": 1083865005.0,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"entropy": 1.1734375,
|
|
"epoch": 1.6085827524941916,
|
|
"grad_norm": 0.1586964296322425,
|
|
"learning_rate": 1.0092292517965338e-06,
|
|
"loss": 1.1768,
|
|
"mean_token_accuracy": 0.7200727164745331,
|
|
"num_tokens": 1084813965.0,
|
|
"step": 11770
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 1.6099494328276616,
|
|
"grad_norm": 0.12995136781671832,
|
|
"learning_rate": 1.0057066366070172e-06,
|
|
"loss": 1.1444,
|
|
"mean_token_accuracy": 0.7251144826412201,
|
|
"num_tokens": 1085727725.0,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"entropy": 1.09921875,
|
|
"epoch": 1.6113161131611315,
|
|
"grad_norm": 0.12104614307804479,
|
|
"learning_rate": 1.0021840214175005e-06,
|
|
"loss": 1.0975,
|
|
"mean_token_accuracy": 0.7348082721233368,
|
|
"num_tokens": 1086639180.0,
|
|
"step": 11790
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.6126827934946015,
|
|
"grad_norm": 0.1623759532457273,
|
|
"learning_rate": 9.986614062279836e-07,
|
|
"loss": 1.1435,
|
|
"mean_token_accuracy": 0.726040518283844,
|
|
"num_tokens": 1087573751.0,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"entropy": 1.07109375,
|
|
"epoch": 1.6140494738280715,
|
|
"grad_norm": 0.12584252554636982,
|
|
"learning_rate": 9.95138791038467e-07,
|
|
"loss": 1.0701,
|
|
"mean_token_accuracy": 0.7392770051956177,
|
|
"num_tokens": 1088506212.0,
|
|
"step": 11810
|
|
},
|
|
{
|
|
"entropy": 1.10078125,
|
|
"epoch": 1.6154161541615415,
|
|
"grad_norm": 0.12192195540059664,
|
|
"learning_rate": 9.916161758489503e-07,
|
|
"loss": 1.0918,
|
|
"mean_token_accuracy": 0.736325865983963,
|
|
"num_tokens": 1089428943.0,
|
|
"step": 11820
|
|
},
|
|
{
|
|
"entropy": 1.14375,
|
|
"epoch": 1.6167828344950115,
|
|
"grad_norm": 0.1286079162694709,
|
|
"learning_rate": 9.880935606594336e-07,
|
|
"loss": 1.1546,
|
|
"mean_token_accuracy": 0.7223917722702027,
|
|
"num_tokens": 1090406547.0,
|
|
"step": 11830
|
|
},
|
|
{
|
|
"entropy": 1.125,
|
|
"epoch": 1.6181495148284815,
|
|
"grad_norm": 0.11489562275908007,
|
|
"learning_rate": 9.84570945469917e-07,
|
|
"loss": 1.1366,
|
|
"mean_token_accuracy": 0.7276401162147522,
|
|
"num_tokens": 1091320300.0,
|
|
"step": 11840
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.6195161951619517,
|
|
"grad_norm": 0.12001631788659294,
|
|
"learning_rate": 9.810483302804003e-07,
|
|
"loss": 1.1448,
|
|
"mean_token_accuracy": 0.7230552911758423,
|
|
"num_tokens": 1092235124.0,
|
|
"step": 11850
|
|
},
|
|
{
|
|
"entropy": 1.1328125,
|
|
"epoch": 1.6208828754954216,
|
|
"grad_norm": 0.12311908616814327,
|
|
"learning_rate": 9.775257150908836e-07,
|
|
"loss": 1.1312,
|
|
"mean_token_accuracy": 0.7270432889461518,
|
|
"num_tokens": 1093131170.0,
|
|
"step": 11860
|
|
},
|
|
{
|
|
"entropy": 1.13046875,
|
|
"epoch": 1.6222495558288916,
|
|
"grad_norm": 0.11934608051876348,
|
|
"learning_rate": 9.740030999013668e-07,
|
|
"loss": 1.1454,
|
|
"mean_token_accuracy": 0.725026112794876,
|
|
"num_tokens": 1094028259.0,
|
|
"step": 11870
|
|
},
|
|
{
|
|
"entropy": 1.10234375,
|
|
"epoch": 1.6236162361623616,
|
|
"grad_norm": 0.14688658644472252,
|
|
"learning_rate": 9.7048048471185e-07,
|
|
"loss": 1.1009,
|
|
"mean_token_accuracy": 0.7302022397518158,
|
|
"num_tokens": 1094923700.0,
|
|
"step": 11880
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.6249829164958316,
|
|
"grad_norm": 0.11644563225791779,
|
|
"learning_rate": 9.669578695223334e-07,
|
|
"loss": 1.1667,
|
|
"mean_token_accuracy": 0.7219655752182007,
|
|
"num_tokens": 1095863029.0,
|
|
"step": 11890
|
|
},
|
|
{
|
|
"entropy": 1.13984375,
|
|
"epoch": 1.6263495968293016,
|
|
"grad_norm": 0.1484871579911452,
|
|
"learning_rate": 9.634352543328168e-07,
|
|
"loss": 1.1364,
|
|
"mean_token_accuracy": 0.725720477104187,
|
|
"num_tokens": 1096756795.0,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"entropy": 1.138671875,
|
|
"epoch": 1.6277162771627718,
|
|
"grad_norm": 0.1370857294594391,
|
|
"learning_rate": 9.599126391433e-07,
|
|
"loss": 1.1399,
|
|
"mean_token_accuracy": 0.7245112895965576,
|
|
"num_tokens": 1097677930.0,
|
|
"step": 11910
|
|
},
|
|
{
|
|
"entropy": 1.156640625,
|
|
"epoch": 1.6290829574962418,
|
|
"grad_norm": 0.12448315997117682,
|
|
"learning_rate": 9.563900239537834e-07,
|
|
"loss": 1.1577,
|
|
"mean_token_accuracy": 0.7216490745544434,
|
|
"num_tokens": 1098586303.0,
|
|
"step": 11920
|
|
},
|
|
{
|
|
"entropy": 1.098046875,
|
|
"epoch": 1.6304496378297118,
|
|
"grad_norm": 0.1206001578437035,
|
|
"learning_rate": 9.528674087642667e-07,
|
|
"loss": 1.1085,
|
|
"mean_token_accuracy": 0.7318263173103332,
|
|
"num_tokens": 1099532417.0,
|
|
"step": 11930
|
|
},
|
|
{
|
|
"entropy": 1.10546875,
|
|
"epoch": 1.6318163181631817,
|
|
"grad_norm": 0.12175636952537547,
|
|
"learning_rate": 9.4934479357475e-07,
|
|
"loss": 1.0982,
|
|
"mean_token_accuracy": 0.7344684660434723,
|
|
"num_tokens": 1100461088.0,
|
|
"step": 11940
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.6331829984966517,
|
|
"grad_norm": 0.11513127414585328,
|
|
"learning_rate": 9.458221783852332e-07,
|
|
"loss": 1.1594,
|
|
"mean_token_accuracy": 0.7210802555084228,
|
|
"num_tokens": 1101423253.0,
|
|
"step": 11950
|
|
},
|
|
{
|
|
"entropy": 1.11875,
|
|
"epoch": 1.6345496788301217,
|
|
"grad_norm": 0.12492140481929791,
|
|
"learning_rate": 9.422995631957166e-07,
|
|
"loss": 1.1206,
|
|
"mean_token_accuracy": 0.729708445072174,
|
|
"num_tokens": 1102306270.0,
|
|
"step": 11960
|
|
},
|
|
{
|
|
"entropy": 1.16484375,
|
|
"epoch": 1.6359163591635917,
|
|
"grad_norm": 0.12981977698520522,
|
|
"learning_rate": 9.387769480061999e-07,
|
|
"loss": 1.1676,
|
|
"mean_token_accuracy": 0.7215504288673401,
|
|
"num_tokens": 1103221346.0,
|
|
"step": 11970
|
|
},
|
|
{
|
|
"entropy": 1.128125,
|
|
"epoch": 1.6372830394970617,
|
|
"grad_norm": 0.1217742070731861,
|
|
"learning_rate": 9.352543328166831e-07,
|
|
"loss": 1.1195,
|
|
"mean_token_accuracy": 0.7296084761619568,
|
|
"num_tokens": 1104181732.0,
|
|
"step": 11980
|
|
},
|
|
{
|
|
"entropy": 1.1328125,
|
|
"epoch": 1.6386497198305316,
|
|
"grad_norm": 0.1252222572178398,
|
|
"learning_rate": 9.317317176271665e-07,
|
|
"loss": 1.1474,
|
|
"mean_token_accuracy": 0.7249259412288666,
|
|
"num_tokens": 1105059442.0,
|
|
"step": 11990
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 1.6400164001640016,
|
|
"grad_norm": 0.24079485512933355,
|
|
"learning_rate": 9.282091024376498e-07,
|
|
"loss": 1.1874,
|
|
"mean_token_accuracy": 0.7149234771728515,
|
|
"num_tokens": 1106044386.0,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"entropy": 1.125,
|
|
"epoch": 1.6413830804974716,
|
|
"grad_norm": 0.11335315366450434,
|
|
"learning_rate": 9.246864872481331e-07,
|
|
"loss": 1.1269,
|
|
"mean_token_accuracy": 0.7290163040161133,
|
|
"num_tokens": 919957.0,
|
|
"step": 12010
|
|
},
|
|
{
|
|
"entropy": 1.1234375,
|
|
"epoch": 1.6427497608309416,
|
|
"grad_norm": 0.1289646191589402,
|
|
"learning_rate": 9.211638720586164e-07,
|
|
"loss": 1.1295,
|
|
"mean_token_accuracy": 0.7236729919910431,
|
|
"num_tokens": 1791388.0,
|
|
"step": 12020
|
|
},
|
|
{
|
|
"entropy": 1.135546875,
|
|
"epoch": 1.6441164411644116,
|
|
"grad_norm": 0.11834900583980815,
|
|
"learning_rate": 9.176412568690997e-07,
|
|
"loss": 1.1407,
|
|
"mean_token_accuracy": 0.7254348993301392,
|
|
"num_tokens": 2716370.0,
|
|
"step": 12030
|
|
},
|
|
{
|
|
"entropy": 1.14140625,
|
|
"epoch": 1.6454831214978816,
|
|
"grad_norm": 0.13114727410617283,
|
|
"learning_rate": 9.14118641679583e-07,
|
|
"loss": 1.1411,
|
|
"mean_token_accuracy": 0.7267215490341187,
|
|
"num_tokens": 3631132.0,
|
|
"step": 12040
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.6468498018313515,
|
|
"grad_norm": 0.12740561404994052,
|
|
"learning_rate": 9.105960264900663e-07,
|
|
"loss": 1.138,
|
|
"mean_token_accuracy": 0.7264596700668335,
|
|
"num_tokens": 4509030.0,
|
|
"step": 12050
|
|
},
|
|
{
|
|
"entropy": 1.078515625,
|
|
"epoch": 1.6482164821648215,
|
|
"grad_norm": 0.11775942186165603,
|
|
"learning_rate": 9.070734113005496e-07,
|
|
"loss": 1.0705,
|
|
"mean_token_accuracy": 0.7390502154827118,
|
|
"num_tokens": 5398084.0,
|
|
"step": 12060
|
|
},
|
|
{
|
|
"entropy": 1.103125,
|
|
"epoch": 1.6495831624982915,
|
|
"grad_norm": 0.12929179242036887,
|
|
"learning_rate": 9.035507961110329e-07,
|
|
"loss": 1.104,
|
|
"mean_token_accuracy": 0.7328418791294098,
|
|
"num_tokens": 6266301.0,
|
|
"step": 12070
|
|
},
|
|
{
|
|
"entropy": 1.05859375,
|
|
"epoch": 1.6509498428317615,
|
|
"grad_norm": 0.12238043195751332,
|
|
"learning_rate": 9.000281809215163e-07,
|
|
"loss": 1.0491,
|
|
"mean_token_accuracy": 0.7420717060565949,
|
|
"num_tokens": 7154758.0,
|
|
"step": 12080
|
|
},
|
|
{
|
|
"entropy": 1.07890625,
|
|
"epoch": 1.6523165231652317,
|
|
"grad_norm": 0.11662372318548768,
|
|
"learning_rate": 8.965055657319995e-07,
|
|
"loss": 1.0871,
|
|
"mean_token_accuracy": 0.7360152006149292,
|
|
"num_tokens": 8080747.0,
|
|
"step": 12090
|
|
},
|
|
{
|
|
"entropy": 1.139453125,
|
|
"epoch": 1.6536832034987017,
|
|
"grad_norm": 0.12031875215135618,
|
|
"learning_rate": 8.929829505424828e-07,
|
|
"loss": 1.1457,
|
|
"mean_token_accuracy": 0.724914962053299,
|
|
"num_tokens": 8968701.0,
|
|
"step": 12100
|
|
},
|
|
{
|
|
"entropy": 1.1265625,
|
|
"epoch": 1.6550498838321717,
|
|
"grad_norm": 0.13978839749038258,
|
|
"learning_rate": 8.894603353529662e-07,
|
|
"loss": 1.1345,
|
|
"mean_token_accuracy": 0.7268293619155883,
|
|
"num_tokens": 9859682.0,
|
|
"step": 12110
|
|
},
|
|
{
|
|
"entropy": 1.112109375,
|
|
"epoch": 1.6564165641656416,
|
|
"grad_norm": 0.1318341508960567,
|
|
"learning_rate": 8.859377201634495e-07,
|
|
"loss": 1.1128,
|
|
"mean_token_accuracy": 0.731121689081192,
|
|
"num_tokens": 10758105.0,
|
|
"step": 12120
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 1.6577832444991116,
|
|
"grad_norm": 0.12079947459572049,
|
|
"learning_rate": 8.824151049739326e-07,
|
|
"loss": 1.1714,
|
|
"mean_token_accuracy": 0.7176726520061493,
|
|
"num_tokens": 11738390.0,
|
|
"step": 12130
|
|
},
|
|
{
|
|
"entropy": 1.121875,
|
|
"epoch": 1.6591499248325816,
|
|
"grad_norm": 0.13033980420459462,
|
|
"learning_rate": 8.788924897844161e-07,
|
|
"loss": 1.1172,
|
|
"mean_token_accuracy": 0.730788654088974,
|
|
"num_tokens": 12678741.0,
|
|
"step": 12140
|
|
},
|
|
{
|
|
"entropy": 1.13203125,
|
|
"epoch": 1.6605166051660518,
|
|
"grad_norm": 0.11205064623135448,
|
|
"learning_rate": 8.753698745948994e-07,
|
|
"loss": 1.1302,
|
|
"mean_token_accuracy": 0.727443951368332,
|
|
"num_tokens": 13662733.0,
|
|
"step": 12150
|
|
},
|
|
{
|
|
"entropy": 1.1375,
|
|
"epoch": 1.6618832854995218,
|
|
"grad_norm": 0.1243287885557993,
|
|
"learning_rate": 8.718472594053825e-07,
|
|
"loss": 1.1366,
|
|
"mean_token_accuracy": 0.725858473777771,
|
|
"num_tokens": 14575264.0,
|
|
"step": 12160
|
|
},
|
|
{
|
|
"entropy": 1.1109375,
|
|
"epoch": 1.6632499658329918,
|
|
"grad_norm": 0.13482495829774246,
|
|
"learning_rate": 8.683246442158659e-07,
|
|
"loss": 1.1283,
|
|
"mean_token_accuracy": 0.7273253977298737,
|
|
"num_tokens": 15538810.0,
|
|
"step": 12170
|
|
},
|
|
{
|
|
"entropy": 1.09609375,
|
|
"epoch": 1.6646166461664618,
|
|
"grad_norm": 0.18460530461327604,
|
|
"learning_rate": 8.648020290263492e-07,
|
|
"loss": 1.0992,
|
|
"mean_token_accuracy": 0.7343933999538421,
|
|
"num_tokens": 16472619.0,
|
|
"step": 12180
|
|
},
|
|
{
|
|
"entropy": 1.1109375,
|
|
"epoch": 1.6659833264999317,
|
|
"grad_norm": 0.12099563508581379,
|
|
"learning_rate": 8.612794138368325e-07,
|
|
"loss": 1.1115,
|
|
"mean_token_accuracy": 0.7305367827415467,
|
|
"num_tokens": 17377623.0,
|
|
"step": 12190
|
|
},
|
|
{
|
|
"entropy": 1.1234375,
|
|
"epoch": 1.6673500068334017,
|
|
"grad_norm": 0.12719071428201498,
|
|
"learning_rate": 8.577567986473158e-07,
|
|
"loss": 1.1351,
|
|
"mean_token_accuracy": 0.7263271152973175,
|
|
"num_tokens": 18334132.0,
|
|
"step": 12200
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.6687166871668717,
|
|
"grad_norm": 0.11975097163302485,
|
|
"learning_rate": 8.542341834577991e-07,
|
|
"loss": 1.1339,
|
|
"mean_token_accuracy": 0.7269580066204071,
|
|
"num_tokens": 19297366.0,
|
|
"step": 12210
|
|
},
|
|
{
|
|
"entropy": 1.1375,
|
|
"epoch": 1.6700833675003417,
|
|
"grad_norm": 0.13144176971636454,
|
|
"learning_rate": 8.507115682682824e-07,
|
|
"loss": 1.1443,
|
|
"mean_token_accuracy": 0.7247963905334472,
|
|
"num_tokens": 20237051.0,
|
|
"step": 12220
|
|
},
|
|
{
|
|
"entropy": 1.10078125,
|
|
"epoch": 1.6714500478338117,
|
|
"grad_norm": 0.13592707234717247,
|
|
"learning_rate": 8.471889530787658e-07,
|
|
"loss": 1.1119,
|
|
"mean_token_accuracy": 0.7311986982822418,
|
|
"num_tokens": 21166451.0,
|
|
"step": 12230
|
|
},
|
|
{
|
|
"entropy": 1.1515625,
|
|
"epoch": 1.6728167281672817,
|
|
"grad_norm": 0.13751883121360312,
|
|
"learning_rate": 8.43666337889249e-07,
|
|
"loss": 1.1628,
|
|
"mean_token_accuracy": 0.7227022409439087,
|
|
"num_tokens": 22118238.0,
|
|
"step": 12240
|
|
},
|
|
{
|
|
"entropy": 1.122265625,
|
|
"epoch": 1.6741834085007516,
|
|
"grad_norm": 0.1380850141625896,
|
|
"learning_rate": 8.401437226997323e-07,
|
|
"loss": 1.1255,
|
|
"mean_token_accuracy": 0.7266109824180603,
|
|
"num_tokens": 23052978.0,
|
|
"step": 12250
|
|
},
|
|
{
|
|
"entropy": 1.0984375,
|
|
"epoch": 1.6755500888342216,
|
|
"grad_norm": 0.12743246589465215,
|
|
"learning_rate": 8.366211075102157e-07,
|
|
"loss": 1.0995,
|
|
"mean_token_accuracy": 0.734176117181778,
|
|
"num_tokens": 23976168.0,
|
|
"step": 12260
|
|
},
|
|
{
|
|
"entropy": 1.0703125,
|
|
"epoch": 1.6769167691676916,
|
|
"grad_norm": 0.11296514218437993,
|
|
"learning_rate": 8.330984923206989e-07,
|
|
"loss": 1.0629,
|
|
"mean_token_accuracy": 0.7408572018146515,
|
|
"num_tokens": 24900925.0,
|
|
"step": 12270
|
|
},
|
|
{
|
|
"entropy": 1.109375,
|
|
"epoch": 1.6782834495011616,
|
|
"grad_norm": 0.12592211838662387,
|
|
"learning_rate": 8.295758771311822e-07,
|
|
"loss": 1.1108,
|
|
"mean_token_accuracy": 0.7323496162891387,
|
|
"num_tokens": 25799980.0,
|
|
"step": 12280
|
|
},
|
|
{
|
|
"entropy": 1.12890625,
|
|
"epoch": 1.6796501298346316,
|
|
"grad_norm": 0.12967146053644268,
|
|
"learning_rate": 8.260532619416656e-07,
|
|
"loss": 1.1329,
|
|
"mean_token_accuracy": 0.7278318643569947,
|
|
"num_tokens": 26704876.0,
|
|
"step": 12290
|
|
},
|
|
{
|
|
"entropy": 1.11875,
|
|
"epoch": 1.6810168101681016,
|
|
"grad_norm": 0.12323587848417461,
|
|
"learning_rate": 8.225306467521489e-07,
|
|
"loss": 1.1229,
|
|
"mean_token_accuracy": 0.7315844535827637,
|
|
"num_tokens": 27624201.0,
|
|
"step": 12300
|
|
},
|
|
{
|
|
"entropy": 1.11015625,
|
|
"epoch": 1.6823834905015715,
|
|
"grad_norm": 0.1132699589399511,
|
|
"learning_rate": 8.190080315626321e-07,
|
|
"loss": 1.1187,
|
|
"mean_token_accuracy": 0.7305192530155182,
|
|
"num_tokens": 28539868.0,
|
|
"step": 12310
|
|
},
|
|
{
|
|
"entropy": 1.0796875,
|
|
"epoch": 1.6837501708350415,
|
|
"grad_norm": 0.11417324042309766,
|
|
"learning_rate": 8.154854163731155e-07,
|
|
"loss": 1.0874,
|
|
"mean_token_accuracy": 0.7343643546104431,
|
|
"num_tokens": 29467296.0,
|
|
"step": 12320
|
|
},
|
|
{
|
|
"entropy": 1.083203125,
|
|
"epoch": 1.6851168511685117,
|
|
"grad_norm": 0.10640457835832402,
|
|
"learning_rate": 8.119628011835988e-07,
|
|
"loss": 1.0864,
|
|
"mean_token_accuracy": 0.7362227141857147,
|
|
"num_tokens": 30380662.0,
|
|
"step": 12330
|
|
},
|
|
{
|
|
"entropy": 1.115625,
|
|
"epoch": 1.6864835315019817,
|
|
"grad_norm": 0.13063050013948396,
|
|
"learning_rate": 8.084401859940821e-07,
|
|
"loss": 1.118,
|
|
"mean_token_accuracy": 0.7269571244716644,
|
|
"num_tokens": 31304971.0,
|
|
"step": 12340
|
|
},
|
|
{
|
|
"entropy": 1.112109375,
|
|
"epoch": 1.6878502118354517,
|
|
"grad_norm": 0.12961171080719747,
|
|
"learning_rate": 8.049175708045654e-07,
|
|
"loss": 1.1114,
|
|
"mean_token_accuracy": 0.7294110000133515,
|
|
"num_tokens": 32196022.0,
|
|
"step": 12350
|
|
},
|
|
{
|
|
"entropy": 1.12421875,
|
|
"epoch": 1.6892168921689217,
|
|
"grad_norm": 0.12767015119529906,
|
|
"learning_rate": 8.013949556150487e-07,
|
|
"loss": 1.1308,
|
|
"mean_token_accuracy": 0.7281490862369537,
|
|
"num_tokens": 33135862.0,
|
|
"step": 12360
|
|
},
|
|
{
|
|
"entropy": 1.101171875,
|
|
"epoch": 1.6905835725023917,
|
|
"grad_norm": 0.1601632898555264,
|
|
"learning_rate": 7.97872340425532e-07,
|
|
"loss": 1.1054,
|
|
"mean_token_accuracy": 0.7304427444934845,
|
|
"num_tokens": 34067010.0,
|
|
"step": 12370
|
|
},
|
|
{
|
|
"entropy": 1.125,
|
|
"epoch": 1.6919502528358616,
|
|
"grad_norm": 0.12425904871345583,
|
|
"learning_rate": 7.943497252360153e-07,
|
|
"loss": 1.1173,
|
|
"mean_token_accuracy": 0.7297523021697998,
|
|
"num_tokens": 35013313.0,
|
|
"step": 12380
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 1.6933169331693319,
|
|
"grad_norm": 0.1260870990123666,
|
|
"learning_rate": 7.908271100464986e-07,
|
|
"loss": 1.1625,
|
|
"mean_token_accuracy": 0.7211195945739746,
|
|
"num_tokens": 35906978.0,
|
|
"step": 12390
|
|
},
|
|
{
|
|
"entropy": 1.151953125,
|
|
"epoch": 1.6946836135028018,
|
|
"grad_norm": 0.12013990270388158,
|
|
"learning_rate": 7.873044948569819e-07,
|
|
"loss": 1.1593,
|
|
"mean_token_accuracy": 0.7230053544044495,
|
|
"num_tokens": 36909392.0,
|
|
"step": 12400
|
|
},
|
|
{
|
|
"entropy": 1.105078125,
|
|
"epoch": 1.6960502938362718,
|
|
"grad_norm": 0.1222337221024678,
|
|
"learning_rate": 7.837818796674653e-07,
|
|
"loss": 1.1118,
|
|
"mean_token_accuracy": 0.7312757849693299,
|
|
"num_tokens": 37900621.0,
|
|
"step": 12410
|
|
},
|
|
{
|
|
"entropy": 1.14375,
|
|
"epoch": 1.6974169741697418,
|
|
"grad_norm": 0.13784356346565274,
|
|
"learning_rate": 7.802592644779484e-07,
|
|
"loss": 1.1476,
|
|
"mean_token_accuracy": 0.7241666376590729,
|
|
"num_tokens": 38852623.0,
|
|
"step": 12420
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 1.6987836545032118,
|
|
"grad_norm": 0.1300529459519372,
|
|
"learning_rate": 7.767366492884317e-07,
|
|
"loss": 1.1798,
|
|
"mean_token_accuracy": 0.7187878787517548,
|
|
"num_tokens": 39767551.0,
|
|
"step": 12430
|
|
},
|
|
{
|
|
"entropy": 1.078125,
|
|
"epoch": 1.7001503348366818,
|
|
"grad_norm": 0.1211767636916043,
|
|
"learning_rate": 7.732140340989152e-07,
|
|
"loss": 1.0886,
|
|
"mean_token_accuracy": 0.7351120591163636,
|
|
"num_tokens": 40712705.0,
|
|
"step": 12440
|
|
},
|
|
{
|
|
"entropy": 1.115234375,
|
|
"epoch": 1.7015170151701517,
|
|
"grad_norm": 0.1285564161523357,
|
|
"learning_rate": 7.696914189093985e-07,
|
|
"loss": 1.1076,
|
|
"mean_token_accuracy": 0.7317219138145447,
|
|
"num_tokens": 41637522.0,
|
|
"step": 12450
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 1.7028836955036217,
|
|
"grad_norm": 0.13529092796771727,
|
|
"learning_rate": 7.661688037198816e-07,
|
|
"loss": 1.1341,
|
|
"mean_token_accuracy": 0.7257923662662507,
|
|
"num_tokens": 42550960.0,
|
|
"step": 12460
|
|
},
|
|
{
|
|
"entropy": 1.133203125,
|
|
"epoch": 1.7042503758370917,
|
|
"grad_norm": 0.11598899562104965,
|
|
"learning_rate": 7.62646188530365e-07,
|
|
"loss": 1.1359,
|
|
"mean_token_accuracy": 0.7293453216552734,
|
|
"num_tokens": 43441422.0,
|
|
"step": 12470
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.7056170561705617,
|
|
"grad_norm": 0.12833090661894067,
|
|
"learning_rate": 7.591235733408483e-07,
|
|
"loss": 1.1652,
|
|
"mean_token_accuracy": 0.7225602686405181,
|
|
"num_tokens": 44354493.0,
|
|
"step": 12480
|
|
},
|
|
{
|
|
"entropy": 1.141796875,
|
|
"epoch": 1.7069837365040317,
|
|
"grad_norm": 0.12526216413996435,
|
|
"learning_rate": 7.556009581513315e-07,
|
|
"loss": 1.1413,
|
|
"mean_token_accuracy": 0.7214517891407013,
|
|
"num_tokens": 45289906.0,
|
|
"step": 12490
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.7083504168375017,
|
|
"grad_norm": 0.1095919489014584,
|
|
"learning_rate": 7.520783429618149e-07,
|
|
"loss": 1.1601,
|
|
"mean_token_accuracy": 0.7224870681762695,
|
|
"num_tokens": 46221626.0,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"entropy": 1.12890625,
|
|
"epoch": 1.7097170971709716,
|
|
"grad_norm": 0.12766678855946553,
|
|
"learning_rate": 7.485557277722982e-07,
|
|
"loss": 1.1246,
|
|
"mean_token_accuracy": 0.7280969321727753,
|
|
"num_tokens": 47164066.0,
|
|
"step": 12510
|
|
},
|
|
{
|
|
"entropy": 1.13203125,
|
|
"epoch": 1.7110837775044416,
|
|
"grad_norm": 0.12653908683134593,
|
|
"learning_rate": 7.450331125827815e-07,
|
|
"loss": 1.1247,
|
|
"mean_token_accuracy": 0.7313631713390351,
|
|
"num_tokens": 48118378.0,
|
|
"step": 12520
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 1.7124504578379116,
|
|
"grad_norm": 0.11682121113337507,
|
|
"learning_rate": 7.415104973932648e-07,
|
|
"loss": 1.1512,
|
|
"mean_token_accuracy": 0.7227120041847229,
|
|
"num_tokens": 49039581.0,
|
|
"step": 12530
|
|
},
|
|
{
|
|
"entropy": 1.14921875,
|
|
"epoch": 1.7138171381713816,
|
|
"grad_norm": 0.12410538707729583,
|
|
"learning_rate": 7.379878822037481e-07,
|
|
"loss": 1.1557,
|
|
"mean_token_accuracy": 0.7219872772693634,
|
|
"num_tokens": 49973038.0,
|
|
"step": 12540
|
|
},
|
|
{
|
|
"entropy": 1.1515625,
|
|
"epoch": 1.7151838185048516,
|
|
"grad_norm": 0.1289436221893721,
|
|
"learning_rate": 7.344652670142314e-07,
|
|
"loss": 1.1577,
|
|
"mean_token_accuracy": 0.7234541177749634,
|
|
"num_tokens": 50885170.0,
|
|
"step": 12550
|
|
},
|
|
{
|
|
"entropy": 1.134375,
|
|
"epoch": 1.7165504988383216,
|
|
"grad_norm": 0.13409264341325233,
|
|
"learning_rate": 7.309426518247147e-07,
|
|
"loss": 1.1287,
|
|
"mean_token_accuracy": 0.7267938792705536,
|
|
"num_tokens": 51807456.0,
|
|
"step": 12560
|
|
},
|
|
{
|
|
"entropy": 1.11171875,
|
|
"epoch": 1.7179171791717918,
|
|
"grad_norm": 0.11874361739783605,
|
|
"learning_rate": 7.27420036635198e-07,
|
|
"loss": 1.1265,
|
|
"mean_token_accuracy": 0.7274623334407806,
|
|
"num_tokens": 52727249.0,
|
|
"step": 12570
|
|
},
|
|
{
|
|
"entropy": 1.103125,
|
|
"epoch": 1.7192838595052617,
|
|
"grad_norm": 0.12740859840996616,
|
|
"learning_rate": 7.238974214456813e-07,
|
|
"loss": 1.1074,
|
|
"mean_token_accuracy": 0.7306953966617584,
|
|
"num_tokens": 53608676.0,
|
|
"step": 12580
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 1.7206505398387317,
|
|
"grad_norm": 0.11459854007034388,
|
|
"learning_rate": 7.203748062561647e-07,
|
|
"loss": 1.1882,
|
|
"mean_token_accuracy": 0.7225515842437744,
|
|
"num_tokens": 54545545.0,
|
|
"step": 12590
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.7220172201722017,
|
|
"grad_norm": 0.1396266812142562,
|
|
"learning_rate": 7.168521910666479e-07,
|
|
"loss": 1.146,
|
|
"mean_token_accuracy": 0.7245380997657775,
|
|
"num_tokens": 55507524.0,
|
|
"step": 12600
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 1.7233839005056717,
|
|
"grad_norm": 0.11813161467166322,
|
|
"learning_rate": 7.133295758771312e-07,
|
|
"loss": 1.14,
|
|
"mean_token_accuracy": 0.7261526405811309,
|
|
"num_tokens": 56422176.0,
|
|
"step": 12610
|
|
},
|
|
{
|
|
"entropy": 1.103125,
|
|
"epoch": 1.7247505808391417,
|
|
"grad_norm": 0.14785688315021173,
|
|
"learning_rate": 7.098069606876146e-07,
|
|
"loss": 1.1126,
|
|
"mean_token_accuracy": 0.7298131823539734,
|
|
"num_tokens": 57327436.0,
|
|
"step": 12620
|
|
},
|
|
{
|
|
"entropy": 1.11953125,
|
|
"epoch": 1.7261172611726119,
|
|
"grad_norm": 0.12701410494220824,
|
|
"learning_rate": 7.062843454980979e-07,
|
|
"loss": 1.1212,
|
|
"mean_token_accuracy": 0.7283440589904785,
|
|
"num_tokens": 58262622.0,
|
|
"step": 12630
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 1.7274839415060819,
|
|
"grad_norm": 0.11390867862774953,
|
|
"learning_rate": 7.027617303085811e-07,
|
|
"loss": 1.1432,
|
|
"mean_token_accuracy": 0.7224138379096985,
|
|
"num_tokens": 59229150.0,
|
|
"step": 12640
|
|
},
|
|
{
|
|
"entropy": 1.09140625,
|
|
"epoch": 1.7288506218395518,
|
|
"grad_norm": 0.11692961234694851,
|
|
"learning_rate": 6.992391151190645e-07,
|
|
"loss": 1.0882,
|
|
"mean_token_accuracy": 0.7335573613643647,
|
|
"num_tokens": 60166758.0,
|
|
"step": 12650
|
|
},
|
|
{
|
|
"entropy": 1.139453125,
|
|
"epoch": 1.7302173021730218,
|
|
"grad_norm": 0.12690708871548798,
|
|
"learning_rate": 6.957164999295478e-07,
|
|
"loss": 1.1442,
|
|
"mean_token_accuracy": 0.7253235459327698,
|
|
"num_tokens": 61116905.0,
|
|
"step": 12660
|
|
},
|
|
{
|
|
"entropy": 1.1515625,
|
|
"epoch": 1.7315839825064918,
|
|
"grad_norm": 0.12783122926289586,
|
|
"learning_rate": 6.921938847400309e-07,
|
|
"loss": 1.1498,
|
|
"mean_token_accuracy": 0.7244520246982574,
|
|
"num_tokens": 62053868.0,
|
|
"step": 12670
|
|
},
|
|
{
|
|
"entropy": 1.0984375,
|
|
"epoch": 1.7329506628399618,
|
|
"grad_norm": 0.12120798578251496,
|
|
"learning_rate": 6.886712695505144e-07,
|
|
"loss": 1.0962,
|
|
"mean_token_accuracy": 0.7339078843593597,
|
|
"num_tokens": 62981293.0,
|
|
"step": 12680
|
|
},
|
|
{
|
|
"entropy": 1.096875,
|
|
"epoch": 1.7343173431734318,
|
|
"grad_norm": 0.13484745215037594,
|
|
"learning_rate": 6.851486543609977e-07,
|
|
"loss": 1.0973,
|
|
"mean_token_accuracy": 0.7345284521579742,
|
|
"num_tokens": 63887500.0,
|
|
"step": 12690
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.7356840235069018,
|
|
"grad_norm": 0.13028762936505955,
|
|
"learning_rate": 6.816260391714811e-07,
|
|
"loss": 1.1617,
|
|
"mean_token_accuracy": 0.7196039438247681,
|
|
"num_tokens": 64850362.0,
|
|
"step": 12700
|
|
},
|
|
{
|
|
"entropy": 1.066015625,
|
|
"epoch": 1.7370507038403717,
|
|
"grad_norm": 0.15683328197047666,
|
|
"learning_rate": 6.781034239819642e-07,
|
|
"loss": 1.0706,
|
|
"mean_token_accuracy": 0.7389672756195068,
|
|
"num_tokens": 65777926.0,
|
|
"step": 12710
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 1.7384173841738417,
|
|
"grad_norm": 0.12418045013993197,
|
|
"learning_rate": 6.745808087924475e-07,
|
|
"loss": 1.1562,
|
|
"mean_token_accuracy": 0.7218343913555145,
|
|
"num_tokens": 66703323.0,
|
|
"step": 12720
|
|
},
|
|
{
|
|
"entropy": 1.1390625,
|
|
"epoch": 1.7397840645073117,
|
|
"grad_norm": 0.11915619678242899,
|
|
"learning_rate": 6.710581936029308e-07,
|
|
"loss": 1.1487,
|
|
"mean_token_accuracy": 0.7254787981510162,
|
|
"num_tokens": 67612471.0,
|
|
"step": 12730
|
|
},
|
|
{
|
|
"entropy": 1.115625,
|
|
"epoch": 1.7411507448407817,
|
|
"grad_norm": 0.13497358098865392,
|
|
"learning_rate": 6.675355784134143e-07,
|
|
"loss": 1.1171,
|
|
"mean_token_accuracy": 0.7322192490100861,
|
|
"num_tokens": 68524718.0,
|
|
"step": 12740
|
|
},
|
|
{
|
|
"entropy": 1.129296875,
|
|
"epoch": 1.7425174251742517,
|
|
"grad_norm": 0.11214845927837422,
|
|
"learning_rate": 6.640129632238974e-07,
|
|
"loss": 1.1386,
|
|
"mean_token_accuracy": 0.7245809614658356,
|
|
"num_tokens": 69464905.0,
|
|
"step": 12750
|
|
},
|
|
{
|
|
"entropy": 1.114453125,
|
|
"epoch": 1.7438841055077217,
|
|
"grad_norm": 0.12986695317329613,
|
|
"learning_rate": 6.604903480343807e-07,
|
|
"loss": 1.1213,
|
|
"mean_token_accuracy": 0.7293673157691956,
|
|
"num_tokens": 70386359.0,
|
|
"step": 12760
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 1.7452507858411916,
|
|
"grad_norm": 0.12191151054299595,
|
|
"learning_rate": 6.569677328448641e-07,
|
|
"loss": 1.1366,
|
|
"mean_token_accuracy": 0.726873642206192,
|
|
"num_tokens": 71300383.0,
|
|
"step": 12770
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.7466174661746616,
|
|
"grad_norm": 0.13043455898858217,
|
|
"learning_rate": 6.534451176553473e-07,
|
|
"loss": 1.1657,
|
|
"mean_token_accuracy": 0.7213193774223328,
|
|
"num_tokens": 72189404.0,
|
|
"step": 12780
|
|
},
|
|
{
|
|
"entropy": 1.13203125,
|
|
"epoch": 1.7479841465081316,
|
|
"grad_norm": 0.10825640300458443,
|
|
"learning_rate": 6.499225024658306e-07,
|
|
"loss": 1.1531,
|
|
"mean_token_accuracy": 0.7234485983848572,
|
|
"num_tokens": 73134740.0,
|
|
"step": 12790
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 1.7493508268416016,
|
|
"grad_norm": 0.1314506273312449,
|
|
"learning_rate": 6.46399887276314e-07,
|
|
"loss": 1.1918,
|
|
"mean_token_accuracy": 0.716097766160965,
|
|
"num_tokens": 74084065.0,
|
|
"step": 12800
|
|
},
|
|
{
|
|
"entropy": 1.06796875,
|
|
"epoch": 1.7507175071750718,
|
|
"grad_norm": 0.12743999182939575,
|
|
"learning_rate": 6.428772720867973e-07,
|
|
"loss": 1.06,
|
|
"mean_token_accuracy": 0.7393902540206909,
|
|
"num_tokens": 75002128.0,
|
|
"step": 12810
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 1.7520841875085418,
|
|
"grad_norm": 0.13443796353071089,
|
|
"learning_rate": 6.393546568972805e-07,
|
|
"loss": 1.1616,
|
|
"mean_token_accuracy": 0.7221736192703248,
|
|
"num_tokens": 75915915.0,
|
|
"step": 12820
|
|
},
|
|
{
|
|
"entropy": 1.126171875,
|
|
"epoch": 1.7534508678420118,
|
|
"grad_norm": 0.12810040290292482,
|
|
"learning_rate": 6.358320417077639e-07,
|
|
"loss": 1.1313,
|
|
"mean_token_accuracy": 0.727418053150177,
|
|
"num_tokens": 76866359.0,
|
|
"step": 12830
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 1.7548175481754817,
|
|
"grad_norm": 0.1201551961862409,
|
|
"learning_rate": 6.323094265182472e-07,
|
|
"loss": 1.1924,
|
|
"mean_token_accuracy": 0.7175289928913117,
|
|
"num_tokens": 77774995.0,
|
|
"step": 12840
|
|
},
|
|
{
|
|
"entropy": 1.144921875,
|
|
"epoch": 1.7561842285089517,
|
|
"grad_norm": 0.14307556269220847,
|
|
"learning_rate": 6.287868113287306e-07,
|
|
"loss": 1.1472,
|
|
"mean_token_accuracy": 0.7268153369426728,
|
|
"num_tokens": 78694015.0,
|
|
"step": 12850
|
|
},
|
|
{
|
|
"entropy": 1.101953125,
|
|
"epoch": 1.7575509088424217,
|
|
"grad_norm": 0.12468471067631062,
|
|
"learning_rate": 6.252641961392138e-07,
|
|
"loss": 1.1013,
|
|
"mean_token_accuracy": 0.7327144801616668,
|
|
"num_tokens": 79606169.0,
|
|
"step": 12860
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 1.758917589175892,
|
|
"grad_norm": 0.10998675659783223,
|
|
"learning_rate": 6.217415809496971e-07,
|
|
"loss": 1.1758,
|
|
"mean_token_accuracy": 0.7182612836360931,
|
|
"num_tokens": 80471663.0,
|
|
"step": 12870
|
|
},
|
|
{
|
|
"entropy": 1.13984375,
|
|
"epoch": 1.760284269509362,
|
|
"grad_norm": 0.12822587585032905,
|
|
"learning_rate": 6.182189657601804e-07,
|
|
"loss": 1.1408,
|
|
"mean_token_accuracy": 0.7274284541606904,
|
|
"num_tokens": 81394566.0,
|
|
"step": 12880
|
|
},
|
|
{
|
|
"entropy": 1.15703125,
|
|
"epoch": 1.7616509498428319,
|
|
"grad_norm": 0.12703823020211427,
|
|
"learning_rate": 6.146963505706637e-07,
|
|
"loss": 1.1691,
|
|
"mean_token_accuracy": 0.7202807426452636,
|
|
"num_tokens": 82357354.0,
|
|
"step": 12890
|
|
},
|
|
{
|
|
"entropy": 1.09765625,
|
|
"epoch": 1.7630176301763019,
|
|
"grad_norm": 0.13386135327479495,
|
|
"learning_rate": 6.11173735381147e-07,
|
|
"loss": 1.0969,
|
|
"mean_token_accuracy": 0.7329695582389831,
|
|
"num_tokens": 83340201.0,
|
|
"step": 12900
|
|
},
|
|
{
|
|
"entropy": 1.121875,
|
|
"epoch": 1.7643843105097718,
|
|
"grad_norm": 0.13179304718605228,
|
|
"learning_rate": 6.076511201916304e-07,
|
|
"loss": 1.1249,
|
|
"mean_token_accuracy": 0.7282289266586304,
|
|
"num_tokens": 84241511.0,
|
|
"step": 12910
|
|
},
|
|
{
|
|
"entropy": 1.11171875,
|
|
"epoch": 1.7657509908432418,
|
|
"grad_norm": 0.11782748477254137,
|
|
"learning_rate": 6.041285050021136e-07,
|
|
"loss": 1.12,
|
|
"mean_token_accuracy": 0.7281670331954956,
|
|
"num_tokens": 85197680.0,
|
|
"step": 12920
|
|
},
|
|
{
|
|
"entropy": 1.09921875,
|
|
"epoch": 1.7671176711767118,
|
|
"grad_norm": 0.1182995385775483,
|
|
"learning_rate": 6.006058898125969e-07,
|
|
"loss": 1.0954,
|
|
"mean_token_accuracy": 0.7348266422748566,
|
|
"num_tokens": 86126379.0,
|
|
"step": 12930
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 1.7684843515101818,
|
|
"grad_norm": 0.12835959217473372,
|
|
"learning_rate": 5.970832746230803e-07,
|
|
"loss": 1.2218,
|
|
"mean_token_accuracy": 0.7119324684143067,
|
|
"num_tokens": 87048573.0,
|
|
"step": 12940
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.7698510318436518,
|
|
"grad_norm": 0.1439262346948067,
|
|
"learning_rate": 5.935606594335636e-07,
|
|
"loss": 1.1351,
|
|
"mean_token_accuracy": 0.7263670682907104,
|
|
"num_tokens": 87970166.0,
|
|
"step": 12950
|
|
},
|
|
{
|
|
"entropy": 1.15078125,
|
|
"epoch": 1.7712177121771218,
|
|
"grad_norm": 0.12392638467946739,
|
|
"learning_rate": 5.900380442440468e-07,
|
|
"loss": 1.1666,
|
|
"mean_token_accuracy": 0.7230570614337921,
|
|
"num_tokens": 88850407.0,
|
|
"step": 12960
|
|
},
|
|
{
|
|
"entropy": 1.08125,
|
|
"epoch": 1.7725843925105917,
|
|
"grad_norm": 0.11303041078800619,
|
|
"learning_rate": 5.8651542905453e-07,
|
|
"loss": 1.0825,
|
|
"mean_token_accuracy": 0.737238883972168,
|
|
"num_tokens": 89772787.0,
|
|
"step": 12970
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 1.7739510728440617,
|
|
"grad_norm": 0.12376048531516762,
|
|
"learning_rate": 5.829928138650135e-07,
|
|
"loss": 1.1573,
|
|
"mean_token_accuracy": 0.7208214640617371,
|
|
"num_tokens": 90705435.0,
|
|
"step": 12980
|
|
},
|
|
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 1.7753177531775317,
|
|
"grad_norm": 0.12502357172034645,
|
|
"learning_rate": 5.794701986754967e-07,
|
|
"loss": 1.1747,
|
|
"mean_token_accuracy": 0.7210961163043976,
|
|
"num_tokens": 91635272.0,
|
|
"step": 12990
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.7766844335110017,
|
|
"grad_norm": 0.12865172804095282,
|
|
"learning_rate": 5.759475834859801e-07,
|
|
"loss": 1.1674,
|
|
"mean_token_accuracy": 0.7203842163085937,
|
|
"num_tokens": 92556954.0,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"entropy": 1.16484375,
|
|
"epoch": 1.7780511138444717,
|
|
"grad_norm": 0.12818845431041515,
|
|
"learning_rate": 5.724249682964633e-07,
|
|
"loss": 1.1791,
|
|
"mean_token_accuracy": 0.7171396791934967,
|
|
"num_tokens": 93506877.0,
|
|
"step": 13010
|
|
},
|
|
{
|
|
"entropy": 1.091015625,
|
|
"epoch": 1.7794177941779417,
|
|
"grad_norm": 0.11496358584530103,
|
|
"learning_rate": 5.689023531069466e-07,
|
|
"loss": 1.0875,
|
|
"mean_token_accuracy": 0.7362405300140381,
|
|
"num_tokens": 94415912.0,
|
|
"step": 13020
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 1.7807844745114116,
|
|
"grad_norm": 0.12846414557471977,
|
|
"learning_rate": 5.6537973791743e-07,
|
|
"loss": 1.1363,
|
|
"mean_token_accuracy": 0.7261947810649871,
|
|
"num_tokens": 95348520.0,
|
|
"step": 13030
|
|
},
|
|
{
|
|
"entropy": 1.1328125,
|
|
"epoch": 1.7821511548448816,
|
|
"grad_norm": 0.12604185363405573,
|
|
"learning_rate": 5.618571227279133e-07,
|
|
"loss": 1.1361,
|
|
"mean_token_accuracy": 0.7264217555522918,
|
|
"num_tokens": 96243169.0,
|
|
"step": 13040
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 1.7835178351783518,
|
|
"grad_norm": 0.12408170810069433,
|
|
"learning_rate": 5.583345075383965e-07,
|
|
"loss": 1.1565,
|
|
"mean_token_accuracy": 0.7209132254123688,
|
|
"num_tokens": 97165460.0,
|
|
"step": 13050
|
|
},
|
|
{
|
|
"entropy": 1.05546875,
|
|
"epoch": 1.7848845155118218,
|
|
"grad_norm": 0.14780676124585415,
|
|
"learning_rate": 5.548118923488799e-07,
|
|
"loss": 1.0516,
|
|
"mean_token_accuracy": 0.7421734511852265,
|
|
"num_tokens": 98065606.0,
|
|
"step": 13060
|
|
},
|
|
{
|
|
"entropy": 1.1296875,
|
|
"epoch": 1.7862511958452918,
|
|
"grad_norm": 0.11928503483956425,
|
|
"learning_rate": 5.512892771593632e-07,
|
|
"loss": 1.1243,
|
|
"mean_token_accuracy": 0.7292809009552002,
|
|
"num_tokens": 98944079.0,
|
|
"step": 13070
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 1.7876178761787618,
|
|
"grad_norm": 0.14972502111725317,
|
|
"learning_rate": 5.477666619698464e-07,
|
|
"loss": 1.1393,
|
|
"mean_token_accuracy": 0.7250814378261566,
|
|
"num_tokens": 99834242.0,
|
|
"step": 13080
|
|
},
|
|
{
|
|
"entropy": 1.14375,
|
|
"epoch": 1.7889845565122318,
|
|
"grad_norm": 0.11469699366773599,
|
|
"learning_rate": 5.442440467803298e-07,
|
|
"loss": 1.1493,
|
|
"mean_token_accuracy": 0.7242798626422882,
|
|
"num_tokens": 100769863.0,
|
|
"step": 13090
|
|
},
|
|
{
|
|
"entropy": 1.111328125,
|
|
"epoch": 1.7903512368457017,
|
|
"grad_norm": 0.12016673750097531,
|
|
"learning_rate": 5.407214315908131e-07,
|
|
"loss": 1.1114,
|
|
"mean_token_accuracy": 0.7287180066108704,
|
|
"num_tokens": 101666867.0,
|
|
"step": 13100
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 1.791717917179172,
|
|
"grad_norm": 0.12272203486768558,
|
|
"learning_rate": 5.371988164012964e-07,
|
|
"loss": 1.1548,
|
|
"mean_token_accuracy": 0.7217296481132507,
|
|
"num_tokens": 102624418.0,
|
|
"step": 13110
|
|
},
|
|
{
|
|
"entropy": 1.123046875,
|
|
"epoch": 1.793084597512642,
|
|
"grad_norm": 0.1299009870774112,
|
|
"learning_rate": 5.336762012117797e-07,
|
|
"loss": 1.1148,
|
|
"mean_token_accuracy": 0.730418348312378,
|
|
"num_tokens": 103539237.0,
|
|
"step": 13120
|
|
},
|
|
{
|
|
"entropy": 1.124609375,
|
|
"epoch": 1.794451277846112,
|
|
"grad_norm": 0.12251841000022283,
|
|
"learning_rate": 5.30153586022263e-07,
|
|
"loss": 1.1437,
|
|
"mean_token_accuracy": 0.7262480676174163,
|
|
"num_tokens": 104445626.0,
|
|
"step": 13130
|
|
},
|
|
{
|
|
"entropy": 1.1390625,
|
|
"epoch": 1.795817958179582,
|
|
"grad_norm": 0.11701308400083336,
|
|
"learning_rate": 5.266309708327462e-07,
|
|
"loss": 1.1366,
|
|
"mean_token_accuracy": 0.7275892555713653,
|
|
"num_tokens": 105373855.0,
|
|
"step": 13140
|
|
},
|
|
{
|
|
"entropy": 1.11328125,
|
|
"epoch": 1.7971846385130519,
|
|
"grad_norm": 0.13835024610939997,
|
|
"learning_rate": 5.231083556432296e-07,
|
|
"loss": 1.1158,
|
|
"mean_token_accuracy": 0.7295054018497467,
|
|
"num_tokens": 106302873.0,
|
|
"step": 13150
|
|
},
|
|
{
|
|
"entropy": 1.0828125,
|
|
"epoch": 1.7985513188465219,
|
|
"grad_norm": 0.12841180062955626,
|
|
"learning_rate": 5.195857404537129e-07,
|
|
"loss": 1.0888,
|
|
"mean_token_accuracy": 0.7324483573436738,
|
|
"num_tokens": 107297035.0,
|
|
"step": 13160
|
|
},
|
|
{
|
|
"entropy": 1.104296875,
|
|
"epoch": 1.7999179991799918,
|
|
"grad_norm": 0.13890254150454698,
|
|
"learning_rate": 5.160631252641961e-07,
|
|
"loss": 1.0977,
|
|
"mean_token_accuracy": 0.7356093525886536,
|
|
"num_tokens": 108205581.0,
|
|
"step": 13170
|
|
},
|
|
{
|
|
"entropy": 1.090234375,
|
|
"epoch": 1.8012846795134618,
|
|
"grad_norm": 0.13894645385071766,
|
|
"learning_rate": 5.125405100746795e-07,
|
|
"loss": 1.0953,
|
|
"mean_token_accuracy": 0.7336654484272003,
|
|
"num_tokens": 109079757.0,
|
|
"step": 13180
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 1.8026513598469318,
|
|
"grad_norm": 0.12286015417910137,
|
|
"learning_rate": 5.090178948851628e-07,
|
|
"loss": 1.1554,
|
|
"mean_token_accuracy": 0.7222882568836212,
|
|
"num_tokens": 110030170.0,
|
|
"step": 13190
|
|
},
|
|
{
|
|
"entropy": 1.126171875,
|
|
"epoch": 1.8040180401804018,
|
|
"grad_norm": 0.12215254408663109,
|
|
"learning_rate": 5.054952796956461e-07,
|
|
"loss": 1.1292,
|
|
"mean_token_accuracy": 0.7282620429992676,
|
|
"num_tokens": 110939835.0,
|
|
"step": 13200
|
|
},
|
|
{
|
|
"entropy": 1.14140625,
|
|
"epoch": 1.8053847205138718,
|
|
"grad_norm": 0.12858363497753597,
|
|
"learning_rate": 5.019726645061294e-07,
|
|
"loss": 1.1465,
|
|
"mean_token_accuracy": 0.7233679592609406,
|
|
"num_tokens": 111883244.0,
|
|
"step": 13210
|
|
},
|
|
{
|
|
"entropy": 1.1515625,
|
|
"epoch": 1.8067514008473418,
|
|
"grad_norm": 0.13109723362802533,
|
|
"learning_rate": 4.984500493166127e-07,
|
|
"loss": 1.1517,
|
|
"mean_token_accuracy": 0.7227410972118378,
|
|
"num_tokens": 112802431.0,
|
|
"step": 13220
|
|
},
|
|
{
|
|
"entropy": 1.0859375,
|
|
"epoch": 1.8081180811808117,
|
|
"grad_norm": 0.12082482584102547,
|
|
"learning_rate": 4.94927434127096e-07,
|
|
"loss": 1.0896,
|
|
"mean_token_accuracy": 0.735493266582489,
|
|
"num_tokens": 113687834.0,
|
|
"step": 13230
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 1.8094847615142817,
|
|
"grad_norm": 0.1169543325134482,
|
|
"learning_rate": 4.914048189375794e-07,
|
|
"loss": 1.1443,
|
|
"mean_token_accuracy": 0.726165771484375,
|
|
"num_tokens": 114605083.0,
|
|
"step": 13240
|
|
},
|
|
{
|
|
"entropy": 1.084765625,
|
|
"epoch": 1.8108514418477517,
|
|
"grad_norm": 0.12540285327057926,
|
|
"learning_rate": 4.878822037480626e-07,
|
|
"loss": 1.0901,
|
|
"mean_token_accuracy": 0.7367809474468231,
|
|
"num_tokens": 115546216.0,
|
|
"step": 13250
|
|
},
|
|
{
|
|
"entropy": 1.109375,
|
|
"epoch": 1.8122181221812217,
|
|
"grad_norm": 0.1234262871805594,
|
|
"learning_rate": 4.843595885585459e-07,
|
|
"loss": 1.1219,
|
|
"mean_token_accuracy": 0.7275927543640137,
|
|
"num_tokens": 116484736.0,
|
|
"step": 13260
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 1.8135848025146917,
|
|
"grad_norm": 0.11812256187727266,
|
|
"learning_rate": 4.808369733690292e-07,
|
|
"loss": 1.1684,
|
|
"mean_token_accuracy": 0.7201204776763916,
|
|
"num_tokens": 117415166.0,
|
|
"step": 13270
|
|
},
|
|
{
|
|
"entropy": 1.1,
|
|
"epoch": 1.8149514828481617,
|
|
"grad_norm": 0.1129992908494195,
|
|
"learning_rate": 4.773143581795125e-07,
|
|
"loss": 1.1069,
|
|
"mean_token_accuracy": 0.7330294907093048,
|
|
"num_tokens": 118349848.0,
|
|
"step": 13280
|
|
},
|
|
{
|
|
"entropy": 1.19375,
|
|
"epoch": 1.8163181631816319,
|
|
"grad_norm": 0.13967963319955765,
|
|
"learning_rate": 4.7379174298999583e-07,
|
|
"loss": 1.2028,
|
|
"mean_token_accuracy": 0.7157984852790833,
|
|
"num_tokens": 119246330.0,
|
|
"step": 13290
|
|
},
|
|
{
|
|
"entropy": 1.10546875,
|
|
"epoch": 1.8176848435151018,
|
|
"grad_norm": 0.1349168449583537,
|
|
"learning_rate": 4.702691278004791e-07,
|
|
"loss": 1.1149,
|
|
"mean_token_accuracy": 0.7311884820461273,
|
|
"num_tokens": 120181319.0,
|
|
"step": 13300
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.8190515238485718,
|
|
"grad_norm": 0.11203043310258842,
|
|
"learning_rate": 4.6674651261096245e-07,
|
|
"loss": 1.1624,
|
|
"mean_token_accuracy": 0.7222895681858063,
|
|
"num_tokens": 121135502.0,
|
|
"step": 13310
|
|
},
|
|
{
|
|
"entropy": 1.14921875,
|
|
"epoch": 1.8204182041820418,
|
|
"grad_norm": 0.12412889327975274,
|
|
"learning_rate": 4.632238974214457e-07,
|
|
"loss": 1.1668,
|
|
"mean_token_accuracy": 0.7199769258499146,
|
|
"num_tokens": 122041051.0,
|
|
"step": 13320
|
|
},
|
|
{
|
|
"entropy": 1.134375,
|
|
"epoch": 1.8217848845155118,
|
|
"grad_norm": 0.1221852143845726,
|
|
"learning_rate": 4.5970128223192907e-07,
|
|
"loss": 1.1385,
|
|
"mean_token_accuracy": 0.7241899073123932,
|
|
"num_tokens": 122940371.0,
|
|
"step": 13330
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 1.8231515648489818,
|
|
"grad_norm": 0.12076615959467965,
|
|
"learning_rate": 4.561786670424123e-07,
|
|
"loss": 1.124,
|
|
"mean_token_accuracy": 0.7300233900547027,
|
|
"num_tokens": 123832828.0,
|
|
"step": 13340
|
|
},
|
|
{
|
|
"entropy": 1.12421875,
|
|
"epoch": 1.824518245182452,
|
|
"grad_norm": 0.15083653039863035,
|
|
"learning_rate": 4.5265605185289563e-07,
|
|
"loss": 1.1303,
|
|
"mean_token_accuracy": 0.7271045684814453,
|
|
"num_tokens": 124782029.0,
|
|
"step": 13350
|
|
},
|
|
{
|
|
"entropy": 1.1203125,
|
|
"epoch": 1.825884925515922,
|
|
"grad_norm": 0.12013714265136466,
|
|
"learning_rate": 4.491334366633789e-07,
|
|
"loss": 1.1094,
|
|
"mean_token_accuracy": 0.731562715768814,
|
|
"num_tokens": 125695844.0,
|
|
"step": 13360
|
|
},
|
|
{
|
|
"entropy": 1.125,
|
|
"epoch": 1.827251605849392,
|
|
"grad_norm": 0.12190850601358441,
|
|
"learning_rate": 4.456108214738622e-07,
|
|
"loss": 1.1287,
|
|
"mean_token_accuracy": 0.7290099442005158,
|
|
"num_tokens": 126602844.0,
|
|
"step": 13370
|
|
},
|
|
{
|
|
"entropy": 1.1046875,
|
|
"epoch": 1.828618286182862,
|
|
"grad_norm": 0.12209601759956566,
|
|
"learning_rate": 4.4208820628434553e-07,
|
|
"loss": 1.1079,
|
|
"mean_token_accuracy": 0.7328003525733948,
|
|
"num_tokens": 127496479.0,
|
|
"step": 13380
|
|
},
|
|
{
|
|
"entropy": 1.12734375,
|
|
"epoch": 1.829984966516332,
|
|
"grad_norm": 0.13204298919788868,
|
|
"learning_rate": 4.385655910948288e-07,
|
|
"loss": 1.1247,
|
|
"mean_token_accuracy": 0.7288756370544434,
|
|
"num_tokens": 128390571.0,
|
|
"step": 13390
|
|
},
|
|
{
|
|
"entropy": 1.125,
|
|
"epoch": 1.831351646849802,
|
|
"grad_norm": 0.12466131133976925,
|
|
"learning_rate": 4.3504297590531215e-07,
|
|
"loss": 1.1335,
|
|
"mean_token_accuracy": 0.727796870470047,
|
|
"num_tokens": 129324966.0,
|
|
"step": 13400
|
|
},
|
|
{
|
|
"entropy": 1.14140625,
|
|
"epoch": 1.8327183271832719,
|
|
"grad_norm": 0.12373172245539579,
|
|
"learning_rate": 4.3152036071579543e-07,
|
|
"loss": 1.1503,
|
|
"mean_token_accuracy": 0.7248182773590088,
|
|
"num_tokens": 130223833.0,
|
|
"step": 13410
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 1.8340850075167419,
|
|
"grad_norm": 0.11538088708138432,
|
|
"learning_rate": 4.2799774552627877e-07,
|
|
"loss": 1.1477,
|
|
"mean_token_accuracy": 0.7250828742980957,
|
|
"num_tokens": 131152117.0,
|
|
"step": 13420
|
|
},
|
|
{
|
|
"entropy": 1.128125,
|
|
"epoch": 1.8354516878502118,
|
|
"grad_norm": 0.11829934098014108,
|
|
"learning_rate": 4.2447513033676205e-07,
|
|
"loss": 1.1271,
|
|
"mean_token_accuracy": 0.7280493974685669,
|
|
"num_tokens": 132065998.0,
|
|
"step": 13430
|
|
},
|
|
{
|
|
"entropy": 1.183203125,
|
|
"epoch": 1.8368183681836818,
|
|
"grad_norm": 0.12828929874887135,
|
|
"learning_rate": 4.209525151472454e-07,
|
|
"loss": 1.1813,
|
|
"mean_token_accuracy": 0.7165474832057953,
|
|
"num_tokens": 132958023.0,
|
|
"step": 13440
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 1.8381850485171518,
|
|
"grad_norm": 0.12334591169457704,
|
|
"learning_rate": 4.1742989995772867e-07,
|
|
"loss": 1.1436,
|
|
"mean_token_accuracy": 0.7230403840541839,
|
|
"num_tokens": 133876787.0,
|
|
"step": 13450
|
|
},
|
|
{
|
|
"entropy": 1.107421875,
|
|
"epoch": 1.8395517288506218,
|
|
"grad_norm": 0.11916275464866825,
|
|
"learning_rate": 4.13907284768212e-07,
|
|
"loss": 1.1028,
|
|
"mean_token_accuracy": 0.7306994616985321,
|
|
"num_tokens": 134801760.0,
|
|
"step": 13460
|
|
},
|
|
{
|
|
"entropy": 1.1453125,
|
|
"epoch": 1.8409184091840918,
|
|
"grad_norm": 0.12972816227223086,
|
|
"learning_rate": 4.1038466957869523e-07,
|
|
"loss": 1.1528,
|
|
"mean_token_accuracy": 0.7223409295082093,
|
|
"num_tokens": 135734222.0,
|
|
"step": 13470
|
|
},
|
|
{
|
|
"entropy": 1.1328125,
|
|
"epoch": 1.8422850895175618,
|
|
"grad_norm": 0.11235702995842414,
|
|
"learning_rate": 4.068620543891785e-07,
|
|
"loss": 1.1345,
|
|
"mean_token_accuracy": 0.7283306181430816,
|
|
"num_tokens": 136645982.0,
|
|
"step": 13480
|
|
},
|
|
{
|
|
"entropy": 1.10625,
|
|
"epoch": 1.8436517698510317,
|
|
"grad_norm": 0.1246714924411429,
|
|
"learning_rate": 4.0333943919966185e-07,
|
|
"loss": 1.1131,
|
|
"mean_token_accuracy": 0.7297213912010193,
|
|
"num_tokens": 137542267.0,
|
|
"step": 13490
|
|
},
|
|
{
|
|
"entropy": 1.1234375,
|
|
"epoch": 1.8450184501845017,
|
|
"grad_norm": 0.12273466083037313,
|
|
"learning_rate": 3.9981682401014513e-07,
|
|
"loss": 1.1209,
|
|
"mean_token_accuracy": 0.7280346512794494,
|
|
"num_tokens": 138430757.0,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.8463851305179717,
|
|
"grad_norm": 0.11995552865474704,
|
|
"learning_rate": 3.9629420882062847e-07,
|
|
"loss": 1.1662,
|
|
"mean_token_accuracy": 0.721648383140564,
|
|
"num_tokens": 139388862.0,
|
|
"step": 13510
|
|
},
|
|
{
|
|
"entropy": 1.129296875,
|
|
"epoch": 1.8477518108514417,
|
|
"grad_norm": 0.12726673114974732,
|
|
"learning_rate": 3.9277159363111175e-07,
|
|
"loss": 1.1443,
|
|
"mean_token_accuracy": 0.725454843044281,
|
|
"num_tokens": 140236398.0,
|
|
"step": 13520
|
|
},
|
|
{
|
|
"entropy": 1.083203125,
|
|
"epoch": 1.849118491184912,
|
|
"grad_norm": 0.12441795659660179,
|
|
"learning_rate": 3.892489784415951e-07,
|
|
"loss": 1.0935,
|
|
"mean_token_accuracy": 0.7329600274562835,
|
|
"num_tokens": 141122220.0,
|
|
"step": 13530
|
|
},
|
|
{
|
|
"entropy": 1.128125,
|
|
"epoch": 1.8504851715183819,
|
|
"grad_norm": 0.12555541278204463,
|
|
"learning_rate": 3.8572636325207837e-07,
|
|
"loss": 1.1302,
|
|
"mean_token_accuracy": 0.7267957985401153,
|
|
"num_tokens": 142051017.0,
|
|
"step": 13540
|
|
},
|
|
{
|
|
"entropy": 1.12265625,
|
|
"epoch": 1.8518518518518519,
|
|
"grad_norm": 0.12777467380091567,
|
|
"learning_rate": 3.822037480625617e-07,
|
|
"loss": 1.127,
|
|
"mean_token_accuracy": 0.7254845798015594,
|
|
"num_tokens": 142919158.0,
|
|
"step": 13550
|
|
},
|
|
{
|
|
"entropy": 1.12109375,
|
|
"epoch": 1.8532185321853218,
|
|
"grad_norm": 0.125033521392866,
|
|
"learning_rate": 3.78681132873045e-07,
|
|
"loss": 1.1144,
|
|
"mean_token_accuracy": 0.7292994260787964,
|
|
"num_tokens": 143824740.0,
|
|
"step": 13560
|
|
},
|
|
{
|
|
"entropy": 1.06875,
|
|
"epoch": 1.8545852125187918,
|
|
"grad_norm": 0.13799767393875453,
|
|
"learning_rate": 3.7515851768352827e-07,
|
|
"loss": 1.067,
|
|
"mean_token_accuracy": 0.7423497080802918,
|
|
"num_tokens": 144768182.0,
|
|
"step": 13570
|
|
},
|
|
{
|
|
"entropy": 1.153515625,
|
|
"epoch": 1.8559518928522618,
|
|
"grad_norm": 0.12655391490408543,
|
|
"learning_rate": 3.716359024940116e-07,
|
|
"loss": 1.1628,
|
|
"mean_token_accuracy": 0.7202269196510315,
|
|
"num_tokens": 145700745.0,
|
|
"step": 13580
|
|
},
|
|
{
|
|
"entropy": 1.12890625,
|
|
"epoch": 1.857318573185732,
|
|
"grad_norm": 0.1277264958694746,
|
|
"learning_rate": 3.6811328730449484e-07,
|
|
"loss": 1.1318,
|
|
"mean_token_accuracy": 0.7265775859355926,
|
|
"num_tokens": 146654090.0,
|
|
"step": 13590
|
|
},
|
|
{
|
|
"entropy": 1.13203125,
|
|
"epoch": 1.858685253519202,
|
|
"grad_norm": 0.12236283374307624,
|
|
"learning_rate": 3.645906721149782e-07,
|
|
"loss": 1.1335,
|
|
"mean_token_accuracy": 0.7289546847343444,
|
|
"num_tokens": 147575414.0,
|
|
"step": 13600
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 1.860051933852672,
|
|
"grad_norm": 0.1188500241601098,
|
|
"learning_rate": 3.6106805692546145e-07,
|
|
"loss": 1.1425,
|
|
"mean_token_accuracy": 0.7252379179000854,
|
|
"num_tokens": 148555252.0,
|
|
"step": 13610
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 1.861418614186142,
|
|
"grad_norm": 0.21794364456709284,
|
|
"learning_rate": 3.575454417359448e-07,
|
|
"loss": 1.1334,
|
|
"mean_token_accuracy": 0.7272163033485413,
|
|
"num_tokens": 149469104.0,
|
|
"step": 13620
|
|
},
|
|
{
|
|
"entropy": 1.1125,
|
|
"epoch": 1.862785294519612,
|
|
"grad_norm": 0.1200952887925531,
|
|
"learning_rate": 3.5402282654642807e-07,
|
|
"loss": 1.1237,
|
|
"mean_token_accuracy": 0.7285793960094452,
|
|
"num_tokens": 150403775.0,
|
|
"step": 13630
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 1.864151974853082,
|
|
"grad_norm": 0.12634543787198785,
|
|
"learning_rate": 3.505002113569114e-07,
|
|
"loss": 1.1778,
|
|
"mean_token_accuracy": 0.7192704975605011,
|
|
"num_tokens": 151299096.0,
|
|
"step": 13640
|
|
},
|
|
{
|
|
"entropy": 1.1109375,
|
|
"epoch": 1.865518655186552,
|
|
"grad_norm": 0.11977660352723547,
|
|
"learning_rate": 3.469775961673947e-07,
|
|
"loss": 1.1014,
|
|
"mean_token_accuracy": 0.7366556406021119,
|
|
"num_tokens": 152219490.0,
|
|
"step": 13650
|
|
},
|
|
{
|
|
"entropy": 1.086328125,
|
|
"epoch": 1.866885335520022,
|
|
"grad_norm": 0.13010390168486954,
|
|
"learning_rate": 3.43454980977878e-07,
|
|
"loss": 1.0792,
|
|
"mean_token_accuracy": 0.7369484424591064,
|
|
"num_tokens": 153167398.0,
|
|
"step": 13660
|
|
},
|
|
{
|
|
"entropy": 1.1234375,
|
|
"epoch": 1.8682520158534919,
|
|
"grad_norm": 0.12017893787795872,
|
|
"learning_rate": 3.399323657883613e-07,
|
|
"loss": 1.1226,
|
|
"mean_token_accuracy": 0.7282936036586761,
|
|
"num_tokens": 154073257.0,
|
|
"step": 13670
|
|
},
|
|
{
|
|
"entropy": 1.128515625,
|
|
"epoch": 1.8696186961869619,
|
|
"grad_norm": 0.13145275322623917,
|
|
"learning_rate": 3.364097505988446e-07,
|
|
"loss": 1.127,
|
|
"mean_token_accuracy": 0.7282096326351166,
|
|
"num_tokens": 155032425.0,
|
|
"step": 13680
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.8709853765204318,
|
|
"grad_norm": 0.13632892403950334,
|
|
"learning_rate": 3.328871354093279e-07,
|
|
"loss": 1.1604,
|
|
"mean_token_accuracy": 0.7228013098239898,
|
|
"num_tokens": 155938174.0,
|
|
"step": 13690
|
|
},
|
|
{
|
|
"entropy": 1.1171875,
|
|
"epoch": 1.8723520568539018,
|
|
"grad_norm": 0.12478448149531322,
|
|
"learning_rate": 3.293645202198112e-07,
|
|
"loss": 1.1121,
|
|
"mean_token_accuracy": 0.7319933354854584,
|
|
"num_tokens": 156871113.0,
|
|
"step": 13700
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 1.8737187371873718,
|
|
"grad_norm": 0.15909773299918367,
|
|
"learning_rate": 3.2584190503029454e-07,
|
|
"loss": 1.1545,
|
|
"mean_token_accuracy": 0.7245196580886841,
|
|
"num_tokens": 157781512.0,
|
|
"step": 13710
|
|
},
|
|
{
|
|
"entropy": 1.112890625,
|
|
"epoch": 1.8750854175208418,
|
|
"grad_norm": 0.13616266857448783,
|
|
"learning_rate": 3.223192898407778e-07,
|
|
"loss": 1.1084,
|
|
"mean_token_accuracy": 0.7309696078300476,
|
|
"num_tokens": 158691360.0,
|
|
"step": 13720
|
|
},
|
|
{
|
|
"entropy": 1.08984375,
|
|
"epoch": 1.8764520978543118,
|
|
"grad_norm": 0.12220196341665017,
|
|
"learning_rate": 3.1879667465126116e-07,
|
|
"loss": 1.0812,
|
|
"mean_token_accuracy": 0.7371710002422333,
|
|
"num_tokens": 159582543.0,
|
|
"step": 13730
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.8778187781877818,
|
|
"grad_norm": 0.3638587535432352,
|
|
"learning_rate": 3.152740594617444e-07,
|
|
"loss": 1.1898,
|
|
"mean_token_accuracy": 0.7158417284488678,
|
|
"num_tokens": 160558445.0,
|
|
"step": 13740
|
|
},
|
|
{
|
|
"entropy": 1.09921875,
|
|
"epoch": 1.8791854585212517,
|
|
"grad_norm": 0.13786133930891495,
|
|
"learning_rate": 3.117514442722277e-07,
|
|
"loss": 1.0881,
|
|
"mean_token_accuracy": 0.7356575906276703,
|
|
"num_tokens": 161423229.0,
|
|
"step": 13750
|
|
},
|
|
{
|
|
"entropy": 1.1375,
|
|
"epoch": 1.8805521388547217,
|
|
"grad_norm": 0.1148864705979759,
|
|
"learning_rate": 3.08228829082711e-07,
|
|
"loss": 1.1393,
|
|
"mean_token_accuracy": 0.7266925990581512,
|
|
"num_tokens": 162367954.0,
|
|
"step": 13760
|
|
},
|
|
{
|
|
"entropy": 1.1296875,
|
|
"epoch": 1.881918819188192,
|
|
"grad_norm": 0.13310601670462796,
|
|
"learning_rate": 3.0470621389319434e-07,
|
|
"loss": 1.1346,
|
|
"mean_token_accuracy": 0.7252279639244079,
|
|
"num_tokens": 163255158.0,
|
|
"step": 13770
|
|
},
|
|
{
|
|
"entropy": 1.1140625,
|
|
"epoch": 1.883285499521662,
|
|
"grad_norm": 0.10952496196083185,
|
|
"learning_rate": 3.0118359870367763e-07,
|
|
"loss": 1.1028,
|
|
"mean_token_accuracy": 0.73399116396904,
|
|
"num_tokens": 164147726.0,
|
|
"step": 13780
|
|
},
|
|
{
|
|
"entropy": 1.094921875,
|
|
"epoch": 1.884652179855132,
|
|
"grad_norm": 0.12301620377402347,
|
|
"learning_rate": 2.9766098351416096e-07,
|
|
"loss": 1.0899,
|
|
"mean_token_accuracy": 0.7370373368263244,
|
|
"num_tokens": 165051383.0,
|
|
"step": 13790
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 1.8860188601886019,
|
|
"grad_norm": 0.1251603311488367,
|
|
"learning_rate": 2.9413836832464424e-07,
|
|
"loss": 1.1403,
|
|
"mean_token_accuracy": 0.7239563286304473,
|
|
"num_tokens": 165994504.0,
|
|
"step": 13800
|
|
},
|
|
{
|
|
"entropy": 1.151171875,
|
|
"epoch": 1.8873855405220719,
|
|
"grad_norm": 0.11809855574894575,
|
|
"learning_rate": 2.906157531351276e-07,
|
|
"loss": 1.165,
|
|
"mean_token_accuracy": 0.719918692111969,
|
|
"num_tokens": 166947789.0,
|
|
"step": 13810
|
|
},
|
|
{
|
|
"entropy": 1.097265625,
|
|
"epoch": 1.8887522208555418,
|
|
"grad_norm": 0.12329576777364534,
|
|
"learning_rate": 2.8709313794561086e-07,
|
|
"loss": 1.0931,
|
|
"mean_token_accuracy": 0.7338417887687683,
|
|
"num_tokens": 167854357.0,
|
|
"step": 13820
|
|
},
|
|
{
|
|
"entropy": 1.103125,
|
|
"epoch": 1.890118901189012,
|
|
"grad_norm": 0.1309400975516351,
|
|
"learning_rate": 2.8357052275609415e-07,
|
|
"loss": 1.1044,
|
|
"mean_token_accuracy": 0.7303215265274048,
|
|
"num_tokens": 168751355.0,
|
|
"step": 13830
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 1.891485581522482,
|
|
"grad_norm": 0.1393915734910495,
|
|
"learning_rate": 2.8004790756657743e-07,
|
|
"loss": 1.1762,
|
|
"mean_token_accuracy": 0.7178963363170624,
|
|
"num_tokens": 169638746.0,
|
|
"step": 13840
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.892852261855952,
|
|
"grad_norm": 0.12847626934286854,
|
|
"learning_rate": 2.7652529237706076e-07,
|
|
"loss": 1.163,
|
|
"mean_token_accuracy": 0.7203345119953155,
|
|
"num_tokens": 170598271.0,
|
|
"step": 13850
|
|
},
|
|
{
|
|
"entropy": 1.137890625,
|
|
"epoch": 1.894218942189422,
|
|
"grad_norm": 0.12351684736661417,
|
|
"learning_rate": 2.7300267718754405e-07,
|
|
"loss": 1.1573,
|
|
"mean_token_accuracy": 0.7227859616279602,
|
|
"num_tokens": 171484862.0,
|
|
"step": 13860
|
|
},
|
|
{
|
|
"entropy": 1.112109375,
|
|
"epoch": 1.895585622522892,
|
|
"grad_norm": 0.12487375843501898,
|
|
"learning_rate": 2.694800619980274e-07,
|
|
"loss": 1.1183,
|
|
"mean_token_accuracy": 0.7314191520214081,
|
|
"num_tokens": 172409180.0,
|
|
"step": 13870
|
|
},
|
|
{
|
|
"entropy": 1.096484375,
|
|
"epoch": 1.896952302856362,
|
|
"grad_norm": 0.11525455131877257,
|
|
"learning_rate": 2.6595744680851066e-07,
|
|
"loss": 1.1106,
|
|
"mean_token_accuracy": 0.730965518951416,
|
|
"num_tokens": 173310985.0,
|
|
"step": 13880
|
|
},
|
|
{
|
|
"entropy": 1.1046875,
|
|
"epoch": 1.898318983189832,
|
|
"grad_norm": 0.1270974356033179,
|
|
"learning_rate": 2.6243483161899395e-07,
|
|
"loss": 1.1018,
|
|
"mean_token_accuracy": 0.7309568226337433,
|
|
"num_tokens": 174254723.0,
|
|
"step": 13890
|
|
},
|
|
{
|
|
"entropy": 1.12109375,
|
|
"epoch": 1.899685663523302,
|
|
"grad_norm": 0.1065636948305853,
|
|
"learning_rate": 2.589122164294773e-07,
|
|
"loss": 1.1203,
|
|
"mean_token_accuracy": 0.7294828951358795,
|
|
"num_tokens": 175165940.0,
|
|
"step": 13900
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 1.901052343856772,
|
|
"grad_norm": 0.13759483400946468,
|
|
"learning_rate": 2.5538960123996056e-07,
|
|
"loss": 1.1493,
|
|
"mean_token_accuracy": 0.7250528275966645,
|
|
"num_tokens": 176057007.0,
|
|
"step": 13910
|
|
},
|
|
{
|
|
"entropy": 1.142578125,
|
|
"epoch": 1.902419024190242,
|
|
"grad_norm": 0.11095074543032066,
|
|
"learning_rate": 2.518669860504439e-07,
|
|
"loss": 1.1548,
|
|
"mean_token_accuracy": 0.7254100978374481,
|
|
"num_tokens": 177011947.0,
|
|
"step": 13920
|
|
},
|
|
{
|
|
"entropy": 1.076171875,
|
|
"epoch": 1.9037857045237119,
|
|
"grad_norm": 0.12571418595120168,
|
|
"learning_rate": 2.483443708609272e-07,
|
|
"loss": 1.0715,
|
|
"mean_token_accuracy": 0.7360704064369201,
|
|
"num_tokens": 177924400.0,
|
|
"step": 13930
|
|
},
|
|
{
|
|
"entropy": 1.125390625,
|
|
"epoch": 1.9051523848571819,
|
|
"grad_norm": 0.12358308503011593,
|
|
"learning_rate": 2.4482175567141046e-07,
|
|
"loss": 1.1212,
|
|
"mean_token_accuracy": 0.7287846565246582,
|
|
"num_tokens": 178845001.0,
|
|
"step": 13940
|
|
},
|
|
{
|
|
"entropy": 1.1078125,
|
|
"epoch": 1.9065190651906518,
|
|
"grad_norm": 0.12980114022892258,
|
|
"learning_rate": 2.4129914048189375e-07,
|
|
"loss": 1.1117,
|
|
"mean_token_accuracy": 0.7301311910152435,
|
|
"num_tokens": 179748324.0,
|
|
"step": 13950
|
|
},
|
|
{
|
|
"entropy": 1.184765625,
|
|
"epoch": 1.9078857455241218,
|
|
"grad_norm": 0.11736572583472513,
|
|
"learning_rate": 2.3777652529237708e-07,
|
|
"loss": 1.1939,
|
|
"mean_token_accuracy": 0.7163356661796569,
|
|
"num_tokens": 180662022.0,
|
|
"step": 13960
|
|
},
|
|
{
|
|
"entropy": 1.134375,
|
|
"epoch": 1.9092524258575918,
|
|
"grad_norm": 0.1279814562735858,
|
|
"learning_rate": 2.342539101028604e-07,
|
|
"loss": 1.1332,
|
|
"mean_token_accuracy": 0.7269836902618408,
|
|
"num_tokens": 181563942.0,
|
|
"step": 13970
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.9106191061910618,
|
|
"grad_norm": 0.12153600833961209,
|
|
"learning_rate": 2.307312949133437e-07,
|
|
"loss": 1.138,
|
|
"mean_token_accuracy": 0.7255854189395905,
|
|
"num_tokens": 182500356.0,
|
|
"step": 13980
|
|
},
|
|
{
|
|
"entropy": 1.12109375,
|
|
"epoch": 1.9119857865245318,
|
|
"grad_norm": 0.1427015657599427,
|
|
"learning_rate": 2.2720867972382698e-07,
|
|
"loss": 1.1262,
|
|
"mean_token_accuracy": 0.726888918876648,
|
|
"num_tokens": 183445124.0,
|
|
"step": 13990
|
|
},
|
|
{
|
|
"entropy": 1.12890625,
|
|
"epoch": 1.9133524668580018,
|
|
"grad_norm": 0.12021365883707703,
|
|
"learning_rate": 2.236860645343103e-07,
|
|
"loss": 1.1201,
|
|
"mean_token_accuracy": 0.730306762456894,
|
|
"num_tokens": 184396746.0,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 1.914719147191472,
|
|
"grad_norm": 0.12273210857795364,
|
|
"learning_rate": 2.201634493447936e-07,
|
|
"loss": 1.145,
|
|
"mean_token_accuracy": 0.72520831823349,
|
|
"num_tokens": 185359281.0,
|
|
"step": 14010
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 1.916085827524942,
|
|
"grad_norm": 0.11787848396972453,
|
|
"learning_rate": 2.166408341552769e-07,
|
|
"loss": 1.1518,
|
|
"mean_token_accuracy": 0.722899752855301,
|
|
"num_tokens": 186280196.0,
|
|
"step": 14020
|
|
},
|
|
{
|
|
"entropy": 1.142578125,
|
|
"epoch": 1.917452507858412,
|
|
"grad_norm": 0.12277102556601298,
|
|
"learning_rate": 2.131182189657602e-07,
|
|
"loss": 1.1608,
|
|
"mean_token_accuracy": 0.7197115540504455,
|
|
"num_tokens": 187232100.0,
|
|
"step": 14030
|
|
},
|
|
{
|
|
"entropy": 1.11875,
|
|
"epoch": 1.918819188191882,
|
|
"grad_norm": 0.11499484626473239,
|
|
"learning_rate": 2.095956037762435e-07,
|
|
"loss": 1.1202,
|
|
"mean_token_accuracy": 0.7310818374156952,
|
|
"num_tokens": 188154418.0,
|
|
"step": 14040
|
|
},
|
|
{
|
|
"entropy": 1.101953125,
|
|
"epoch": 1.920185868525352,
|
|
"grad_norm": 0.11951697702426788,
|
|
"learning_rate": 2.0607298858672678e-07,
|
|
"loss": 1.1025,
|
|
"mean_token_accuracy": 0.7348897576332092,
|
|
"num_tokens": 189090001.0,
|
|
"step": 14050
|
|
},
|
|
{
|
|
"entropy": 1.1515625,
|
|
"epoch": 1.9215525488588219,
|
|
"grad_norm": 0.13678977408896936,
|
|
"learning_rate": 2.025503733972101e-07,
|
|
"loss": 1.1511,
|
|
"mean_token_accuracy": 0.7234688222408294,
|
|
"num_tokens": 190014866.0,
|
|
"step": 14060
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 1.922919229192292,
|
|
"grad_norm": 0.12498092181276395,
|
|
"learning_rate": 1.990277582076934e-07,
|
|
"loss": 1.1563,
|
|
"mean_token_accuracy": 0.7225572824478149,
|
|
"num_tokens": 190917483.0,
|
|
"step": 14070
|
|
},
|
|
{
|
|
"entropy": 1.08671875,
|
|
"epoch": 1.924285909525762,
|
|
"grad_norm": 0.11825134251034447,
|
|
"learning_rate": 1.955051430181767e-07,
|
|
"loss": 1.0803,
|
|
"mean_token_accuracy": 0.7371215701103211,
|
|
"num_tokens": 191865929.0,
|
|
"step": 14080
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 1.925652589859232,
|
|
"grad_norm": 0.13752033466278735,
|
|
"learning_rate": 1.9198252782866002e-07,
|
|
"loss": 1.1432,
|
|
"mean_token_accuracy": 0.7271552860736847,
|
|
"num_tokens": 192813435.0,
|
|
"step": 14090
|
|
},
|
|
{
|
|
"entropy": 1.141015625,
|
|
"epoch": 1.927019270192702,
|
|
"grad_norm": 0.1378305433702974,
|
|
"learning_rate": 1.8845991263914333e-07,
|
|
"loss": 1.1506,
|
|
"mean_token_accuracy": 0.7256171405315399,
|
|
"num_tokens": 193726480.0,
|
|
"step": 14100
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 1.928385950526172,
|
|
"grad_norm": 0.1204785334985954,
|
|
"learning_rate": 1.8493729744962664e-07,
|
|
"loss": 1.1479,
|
|
"mean_token_accuracy": 0.7250540673732757,
|
|
"num_tokens": 194707283.0,
|
|
"step": 14110
|
|
},
|
|
{
|
|
"entropy": 1.0875,
|
|
"epoch": 1.929752630859642,
|
|
"grad_norm": 0.11887988064488007,
|
|
"learning_rate": 1.8141468226010995e-07,
|
|
"loss": 1.0874,
|
|
"mean_token_accuracy": 0.7333123564720154,
|
|
"num_tokens": 195656150.0,
|
|
"step": 14120
|
|
},
|
|
{
|
|
"entropy": 1.10390625,
|
|
"epoch": 1.931119311193112,
|
|
"grad_norm": 0.1176948345622635,
|
|
"learning_rate": 1.778920670705932e-07,
|
|
"loss": 1.1029,
|
|
"mean_token_accuracy": 0.7312156796455384,
|
|
"num_tokens": 196550109.0,
|
|
"step": 14130
|
|
},
|
|
{
|
|
"entropy": 1.122265625,
|
|
"epoch": 1.932485991526582,
|
|
"grad_norm": 0.13381045096258723,
|
|
"learning_rate": 1.743694518810765e-07,
|
|
"loss": 1.1167,
|
|
"mean_token_accuracy": 0.7304522037506104,
|
|
"num_tokens": 197476705.0,
|
|
"step": 14140
|
|
},
|
|
{
|
|
"entropy": 1.12265625,
|
|
"epoch": 1.933852671860052,
|
|
"grad_norm": 0.11930232058197902,
|
|
"learning_rate": 1.7084683669155982e-07,
|
|
"loss": 1.1268,
|
|
"mean_token_accuracy": 0.7297001421451569,
|
|
"num_tokens": 198405164.0,
|
|
"step": 14150
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.935219352193522,
|
|
"grad_norm": 0.1412858472156583,
|
|
"learning_rate": 1.6732422150204313e-07,
|
|
"loss": 1.1349,
|
|
"mean_token_accuracy": 0.7265946090221405,
|
|
"num_tokens": 199281622.0,
|
|
"step": 14160
|
|
},
|
|
{
|
|
"entropy": 1.12578125,
|
|
"epoch": 1.936586032526992,
|
|
"grad_norm": 0.11992167044918191,
|
|
"learning_rate": 1.6380160631252644e-07,
|
|
"loss": 1.1169,
|
|
"mean_token_accuracy": 0.7326765298843384,
|
|
"num_tokens": 200213447.0,
|
|
"step": 14170
|
|
},
|
|
{
|
|
"entropy": 1.16328125,
|
|
"epoch": 1.937952712860462,
|
|
"grad_norm": 0.1341168229169103,
|
|
"learning_rate": 1.6027899112300975e-07,
|
|
"loss": 1.1553,
|
|
"mean_token_accuracy": 0.7205090045928955,
|
|
"num_tokens": 201132847.0,
|
|
"step": 14180
|
|
},
|
|
{
|
|
"entropy": 1.15078125,
|
|
"epoch": 1.9393193931939319,
|
|
"grad_norm": 0.12423738724074226,
|
|
"learning_rate": 1.5675637593349303e-07,
|
|
"loss": 1.1636,
|
|
"mean_token_accuracy": 0.7231822192668915,
|
|
"num_tokens": 202041330.0,
|
|
"step": 14190
|
|
},
|
|
{
|
|
"entropy": 1.13984375,
|
|
"epoch": 1.9406860735274019,
|
|
"grad_norm": 0.1393970152077717,
|
|
"learning_rate": 1.5323376074397634e-07,
|
|
"loss": 1.1481,
|
|
"mean_token_accuracy": 0.7213244736194611,
|
|
"num_tokens": 202927230.0,
|
|
"step": 14200
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.9420527538608718,
|
|
"grad_norm": 0.13483184014429944,
|
|
"learning_rate": 1.4971114555445965e-07,
|
|
"loss": 1.187,
|
|
"mean_token_accuracy": 0.7189782977104187,
|
|
"num_tokens": 203852110.0,
|
|
"step": 14210
|
|
},
|
|
{
|
|
"entropy": 1.105859375,
|
|
"epoch": 1.9434194341943418,
|
|
"grad_norm": 0.11531481635270928,
|
|
"learning_rate": 1.4618853036494293e-07,
|
|
"loss": 1.1048,
|
|
"mean_token_accuracy": 0.7326868593692779,
|
|
"num_tokens": 204754873.0,
|
|
"step": 14220
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 1.9447861145278118,
|
|
"grad_norm": 0.12016126274080406,
|
|
"learning_rate": 1.4266591517542624e-07,
|
|
"loss": 1.1693,
|
|
"mean_token_accuracy": 0.722251296043396,
|
|
"num_tokens": 205689965.0,
|
|
"step": 14230
|
|
},
|
|
{
|
|
"entropy": 1.12578125,
|
|
"epoch": 1.9461527948612818,
|
|
"grad_norm": 0.11670743648993359,
|
|
"learning_rate": 1.3914329998590955e-07,
|
|
"loss": 1.1242,
|
|
"mean_token_accuracy": 0.728280657529831,
|
|
"num_tokens": 206650172.0,
|
|
"step": 14240
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 1.947519475194752,
|
|
"grad_norm": 0.12038237594316596,
|
|
"learning_rate": 1.3562068479639286e-07,
|
|
"loss": 1.1655,
|
|
"mean_token_accuracy": 0.7205449998378753,
|
|
"num_tokens": 207623109.0,
|
|
"step": 14250
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 1.948886155528222,
|
|
"grad_norm": 0.1106669293174605,
|
|
"learning_rate": 1.3209806960687614e-07,
|
|
"loss": 1.1468,
|
|
"mean_token_accuracy": 0.7249425709247589,
|
|
"num_tokens": 208577518.0,
|
|
"step": 14260
|
|
},
|
|
{
|
|
"entropy": 1.13203125,
|
|
"epoch": 1.950252835861692,
|
|
"grad_norm": 0.12049078193979895,
|
|
"learning_rate": 1.2857545441735945e-07,
|
|
"loss": 1.142,
|
|
"mean_token_accuracy": 0.7245071351528167,
|
|
"num_tokens": 209533088.0,
|
|
"step": 14270
|
|
},
|
|
{
|
|
"entropy": 1.12578125,
|
|
"epoch": 1.951619516195162,
|
|
"grad_norm": 0.11937526690371666,
|
|
"learning_rate": 1.2505283922784276e-07,
|
|
"loss": 1.1284,
|
|
"mean_token_accuracy": 0.7280556619167328,
|
|
"num_tokens": 210472496.0,
|
|
"step": 14280
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.952986196528632,
|
|
"grad_norm": 0.12614558476227908,
|
|
"learning_rate": 1.2153022403832607e-07,
|
|
"loss": 1.1292,
|
|
"mean_token_accuracy": 0.7276358187198639,
|
|
"num_tokens": 211347026.0,
|
|
"step": 14290
|
|
},
|
|
{
|
|
"entropy": 1.10859375,
|
|
"epoch": 1.954352876862102,
|
|
"grad_norm": 0.13412144970629636,
|
|
"learning_rate": 1.1800760884880937e-07,
|
|
"loss": 1.1239,
|
|
"mean_token_accuracy": 0.7271620571613312,
|
|
"num_tokens": 212245046.0,
|
|
"step": 14300
|
|
},
|
|
{
|
|
"entropy": 1.22734375,
|
|
"epoch": 1.9557195571955721,
|
|
"grad_norm": 0.1360538688744242,
|
|
"learning_rate": 1.1448499365929266e-07,
|
|
"loss": 1.2425,
|
|
"mean_token_accuracy": 0.7091282427310943,
|
|
"num_tokens": 213166602.0,
|
|
"step": 14310
|
|
},
|
|
{
|
|
"entropy": 1.13125,
|
|
"epoch": 1.957086237529042,
|
|
"grad_norm": 0.12034832528442574,
|
|
"learning_rate": 1.1096237846977597e-07,
|
|
"loss": 1.1363,
|
|
"mean_token_accuracy": 0.7255271553993226,
|
|
"num_tokens": 214061122.0,
|
|
"step": 14320
|
|
},
|
|
{
|
|
"entropy": 1.12890625,
|
|
"epoch": 1.958452917862512,
|
|
"grad_norm": 0.1134362864695453,
|
|
"learning_rate": 1.0743976328025928e-07,
|
|
"loss": 1.1307,
|
|
"mean_token_accuracy": 0.728524136543274,
|
|
"num_tokens": 215007170.0,
|
|
"step": 14330
|
|
},
|
|
{
|
|
"entropy": 1.12734375,
|
|
"epoch": 1.959819598195982,
|
|
"grad_norm": 0.11494881720497867,
|
|
"learning_rate": 1.0391714809074258e-07,
|
|
"loss": 1.1371,
|
|
"mean_token_accuracy": 0.727102530002594,
|
|
"num_tokens": 215916034.0,
|
|
"step": 14340
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 1.961186278529452,
|
|
"grad_norm": 0.12849142064352448,
|
|
"learning_rate": 1.0039453290122588e-07,
|
|
"loss": 1.1517,
|
|
"mean_token_accuracy": 0.7243241310119629,
|
|
"num_tokens": 216926531.0,
|
|
"step": 14350
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.962552958862922,
|
|
"grad_norm": 0.12746484765691518,
|
|
"learning_rate": 9.687191771170918e-08,
|
|
"loss": 1.1645,
|
|
"mean_token_accuracy": 0.7198676943778992,
|
|
"num_tokens": 217853362.0,
|
|
"step": 14360
|
|
},
|
|
{
|
|
"entropy": 1.10546875,
|
|
"epoch": 1.963919639196392,
|
|
"grad_norm": 0.12575340230574,
|
|
"learning_rate": 9.334930252219248e-08,
|
|
"loss": 1.1081,
|
|
"mean_token_accuracy": 0.7304174602031708,
|
|
"num_tokens": 218743465.0,
|
|
"step": 14370
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 1.965286319529862,
|
|
"grad_norm": 0.12018404940218383,
|
|
"learning_rate": 8.982668733267578e-08,
|
|
"loss": 1.1516,
|
|
"mean_token_accuracy": 0.7217306673526764,
|
|
"num_tokens": 219649104.0,
|
|
"step": 14380
|
|
},
|
|
{
|
|
"entropy": 1.155859375,
|
|
"epoch": 1.966652999863332,
|
|
"grad_norm": 0.13690737884432133,
|
|
"learning_rate": 8.630407214315909e-08,
|
|
"loss": 1.1581,
|
|
"mean_token_accuracy": 0.7228217959403992,
|
|
"num_tokens": 220584028.0,
|
|
"step": 14390
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 1.968019680196802,
|
|
"grad_norm": 0.13207191128669654,
|
|
"learning_rate": 8.27814569536424e-08,
|
|
"loss": 1.1689,
|
|
"mean_token_accuracy": 0.7194275081157684,
|
|
"num_tokens": 221478507.0,
|
|
"step": 14400
|
|
},
|
|
{
|
|
"entropy": 1.14375,
|
|
"epoch": 1.969386360530272,
|
|
"grad_norm": 0.12874807562286522,
|
|
"learning_rate": 7.925884176412568e-08,
|
|
"loss": 1.1467,
|
|
"mean_token_accuracy": 0.7247754573822022,
|
|
"num_tokens": 222372101.0,
|
|
"step": 14410
|
|
},
|
|
{
|
|
"entropy": 1.120703125,
|
|
"epoch": 1.970753040863742,
|
|
"grad_norm": 0.12712927906189314,
|
|
"learning_rate": 7.573622657460899e-08,
|
|
"loss": 1.1222,
|
|
"mean_token_accuracy": 0.7318640410900116,
|
|
"num_tokens": 223299776.0,
|
|
"step": 14420
|
|
},
|
|
{
|
|
"entropy": 1.13125,
|
|
"epoch": 1.972119721197212,
|
|
"grad_norm": 0.1102841226394374,
|
|
"learning_rate": 7.22136113850923e-08,
|
|
"loss": 1.1365,
|
|
"mean_token_accuracy": 0.7268287301063537,
|
|
"num_tokens": 224268136.0,
|
|
"step": 14430
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 1.9734864015306819,
|
|
"grad_norm": 0.12604156520475573,
|
|
"learning_rate": 6.869099619557561e-08,
|
|
"loss": 1.1354,
|
|
"mean_token_accuracy": 0.7279965877532959,
|
|
"num_tokens": 225212103.0,
|
|
"step": 14440
|
|
},
|
|
{
|
|
"entropy": 1.105859375,
|
|
"epoch": 1.9748530818641519,
|
|
"grad_norm": 0.1238523051092994,
|
|
"learning_rate": 6.51683810060589e-08,
|
|
"loss": 1.1084,
|
|
"mean_token_accuracy": 0.7305983006954193,
|
|
"num_tokens": 226179853.0,
|
|
"step": 14450
|
|
},
|
|
{
|
|
"entropy": 1.128125,
|
|
"epoch": 1.9762197621976219,
|
|
"grad_norm": 0.1247711584654436,
|
|
"learning_rate": 6.164576581654221e-08,
|
|
"loss": 1.126,
|
|
"mean_token_accuracy": 0.7278917014598847,
|
|
"num_tokens": 227077445.0,
|
|
"step": 14460
|
|
},
|
|
{
|
|
"entropy": 1.12890625,
|
|
"epoch": 1.9775864425310918,
|
|
"grad_norm": 0.12167330312842972,
|
|
"learning_rate": 5.8123150627025515e-08,
|
|
"loss": 1.141,
|
|
"mean_token_accuracy": 0.7268628656864167,
|
|
"num_tokens": 227957010.0,
|
|
"step": 14470
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 1.9789531228645618,
|
|
"grad_norm": 0.13399571499638144,
|
|
"learning_rate": 5.460053543750881e-08,
|
|
"loss": 1.1685,
|
|
"mean_token_accuracy": 0.7203802406787873,
|
|
"num_tokens": 228898359.0,
|
|
"step": 14480
|
|
},
|
|
{
|
|
"entropy": 1.08203125,
|
|
"epoch": 1.980319803198032,
|
|
"grad_norm": 0.12172914076306496,
|
|
"learning_rate": 5.107792024799211e-08,
|
|
"loss": 1.0863,
|
|
"mean_token_accuracy": 0.7349555313587188,
|
|
"num_tokens": 229796201.0,
|
|
"step": 14490
|
|
},
|
|
{
|
|
"entropy": 1.109375,
|
|
"epoch": 1.981686483531502,
|
|
"grad_norm": 0.11904917330329003,
|
|
"learning_rate": 4.7555305058475415e-08,
|
|
"loss": 1.1118,
|
|
"mean_token_accuracy": 0.7321444094181061,
|
|
"num_tokens": 230723005.0,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"entropy": 1.130078125,
|
|
"epoch": 1.983053163864972,
|
|
"grad_norm": 0.11993839351973465,
|
|
"learning_rate": 4.403268986895872e-08,
|
|
"loss": 1.1331,
|
|
"mean_token_accuracy": 0.7264376580715179,
|
|
"num_tokens": 231614723.0,
|
|
"step": 14510
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 1.984419844198442,
|
|
"grad_norm": 0.12106799336199377,
|
|
"learning_rate": 4.0510074679442026e-08,
|
|
"loss": 1.1616,
|
|
"mean_token_accuracy": 0.7211284160614013,
|
|
"num_tokens": 232521569.0,
|
|
"step": 14520
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 1.985786524531912,
|
|
"grad_norm": 0.12442431418562398,
|
|
"learning_rate": 3.698745948992532e-08,
|
|
"loss": 1.1577,
|
|
"mean_token_accuracy": 0.7232869267463684,
|
|
"num_tokens": 233451981.0,
|
|
"step": 14530
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 1.987153204865382,
|
|
"grad_norm": 0.13415874410742423,
|
|
"learning_rate": 3.3464844300408624e-08,
|
|
"loss": 1.1437,
|
|
"mean_token_accuracy": 0.7264166533946991,
|
|
"num_tokens": 234349422.0,
|
|
"step": 14540
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 1.9885198851988521,
|
|
"grad_norm": 0.1374802011565807,
|
|
"learning_rate": 2.994222911089193e-08,
|
|
"loss": 1.1789,
|
|
"mean_token_accuracy": 0.7168438792228699,
|
|
"num_tokens": 235273744.0,
|
|
"step": 14550
|
|
},
|
|
{
|
|
"entropy": 1.10625,
|
|
"epoch": 1.9898865655323221,
|
|
"grad_norm": 0.12063722156936371,
|
|
"learning_rate": 2.6419613921375232e-08,
|
|
"loss": 1.1245,
|
|
"mean_token_accuracy": 0.7306301355361938,
|
|
"num_tokens": 236223471.0,
|
|
"step": 14560
|
|
},
|
|
{
|
|
"entropy": 1.12890625,
|
|
"epoch": 1.991253245865792,
|
|
"grad_norm": 0.11768925687055937,
|
|
"learning_rate": 2.289699873185853e-08,
|
|
"loss": 1.1264,
|
|
"mean_token_accuracy": 0.7284922897815704,
|
|
"num_tokens": 237094812.0,
|
|
"step": 14570
|
|
},
|
|
{
|
|
"entropy": 1.11484375,
|
|
"epoch": 1.992619926199262,
|
|
"grad_norm": 0.12311737134044515,
|
|
"learning_rate": 1.9374383542341837e-08,
|
|
"loss": 1.1208,
|
|
"mean_token_accuracy": 0.7295483529567719,
|
|
"num_tokens": 238004049.0,
|
|
"step": 14580
|
|
},
|
|
{
|
|
"entropy": 1.11640625,
|
|
"epoch": 1.993986606532732,
|
|
"grad_norm": 0.12704591943991167,
|
|
"learning_rate": 1.585176835282514e-08,
|
|
"loss": 1.1194,
|
|
"mean_token_accuracy": 0.728802067041397,
|
|
"num_tokens": 238890267.0,
|
|
"step": 14590
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 1.995353286866202,
|
|
"grad_norm": 0.12130260293688136,
|
|
"learning_rate": 1.2329153163308442e-08,
|
|
"loss": 1.1844,
|
|
"mean_token_accuracy": 0.7165128648281097,
|
|
"num_tokens": 239811788.0,
|
|
"step": 14600
|
|
},
|
|
{
|
|
"entropy": 1.099609375,
|
|
"epoch": 1.996719967199672,
|
|
"grad_norm": 0.13036664914854199,
|
|
"learning_rate": 8.806537973791744e-09,
|
|
"loss": 1.0957,
|
|
"mean_token_accuracy": 0.7332609713077545,
|
|
"num_tokens": 240716356.0,
|
|
"step": 14610
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 1.998086647533142,
|
|
"grad_norm": 0.14045284369246935,
|
|
"learning_rate": 5.2839227842750465e-09,
|
|
"loss": 1.1556,
|
|
"mean_token_accuracy": 0.7212701082229614,
|
|
"num_tokens": 241601601.0,
|
|
"step": 14620
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 1.999453327866612,
|
|
"grad_norm": 0.12317908488088698,
|
|
"learning_rate": 1.7613075947583486e-09,
|
|
"loss": 1.2053,
|
|
"mean_token_accuracy": 0.7130117774009704,
|
|
"num_tokens": 242552770.0,
|
|
"step": 14630
|
|
},
|
|
{
|
|
"entropy": 1.12109375,
|
|
"epoch": 2.0,
|
|
"mean_token_accuracy": 0.7266275435686111,
|
|
"num_tokens": 242935223.0,
|
|
"step": 14634,
|
|
"total_flos": 2.19812273324032e+16,
|
|
"train_loss": 0.20388387141371744,
|
|
"train_runtime": 9381.1717,
|
|
"train_samples_per_second": 199.657,
|
|
"train_steps_per_second": 1.56
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 14634,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 200,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.19812273324032e+16,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|